import/pdf: Read text from all pages

This commit is contained in:
polyfloyd 2025-06-29 23:53:21 +02:00
parent fe53249bff
commit 5344c5b978
2 changed files with 17 additions and 29 deletions

View file

@@ -1,20 +1,16 @@
from pypdf import PdfReader
from beangulp import mimetypes, Importer
from beangulp.cache import cache
import re import re
from datetime import date from datetime import date
from beancount.core.data import Transaction, Posting, Document
from beancount.core import flags from beancount.core import amount, data, flags
from beancount.core import data from beancount.core.data import Document, Posting, Transaction
from beancount.core import amount
from beancount.core.number import D from beancount.core.number import D
from beangulp import Importer, mimetypes
from pypdf import PdfReader
@cache def pdf_lines(filename):
def pdf_to_text(filename):
reader = PdfReader(filename) reader = PdfReader(filename)
page1 = reader.pages[0] return [line for page in reader.pages for line in page.extract_text().split("\n")]
return page1.extract_text()
class MollieInvoiceImporter(Importer): class MollieInvoiceImporter(Importer):
@@ -23,15 +19,13 @@ class MollieInvoiceImporter(Importer):
if mimetype != "application/pdf": if mimetype != "application/pdf":
return False return False
lines = pdf_to_text(filepath).split("\n") return any(line.startswith("Mollie B.V.") for line in pdf_lines(filepath))
return any(line.startswith("Mollie B.V.") for line in lines)
def account(self, filepath): def account(self, filepath):
return "Uitgaven:Bankkosten" return "Uitgaven:Bankkosten"
def tx_ref(self, filepath): def tx_ref(self, filepath):
lines = pdf_to_text(filepath).split("\n") for line in pdf_lines(filepath):
for line in lines:
if m := re.search(r"^Invoice reference (MOL-.+)$", line): if m := re.search(r"^Invoice reference (MOL-.+)$", line):
return m[1] return m[1]
raise Exception("Mollie invoice reference not found") raise Exception("Mollie invoice reference not found")
@@ -40,15 +34,13 @@ class MollieInvoiceImporter(Importer):
return f"{self.tx_ref(filepath)}.pdf" return f"{self.tx_ref(filepath)}.pdf"
def date(self, filepath): def date(self, filepath):
lines = pdf_to_text(filepath).split("\n") for line in pdf_lines(filepath):
for line in lines:
if m := re.search(r"^Invoice date (\d{2})-(\d{2})-(\d{4})$", line): if m := re.search(r"^Invoice date (\d{2})-(\d{2})-(\d{4})$", line):
return date(int(m[3]), int(m[2]), int(m[1])) return date(int(m[3]), int(m[2]), int(m[1]))
raise Exception("Mollie invoice date not found") raise Exception("Mollie invoice date not found")
def extract(self, filepath, existing): def extract(self, filepath, existing):
lines = pdf_to_text(filepath).split("\n") for line in pdf_lines(filepath):
for line in lines:
if m := re.search(r"^Total including VAT ([\d.]+) EUR$", line): if m := re.search(r"^Total including VAT ([\d.]+) EUR$", line):
total = D(m[1]) total = D(m[1])
break break
@@ -92,15 +84,15 @@ class StatiegeldImporter(Importer):
if mimetype != "application/pdf": if mimetype != "application/pdf":
return False return False
lines = pdf_to_text(filepath).split("\n") return any(
return any(line.startswith("Statiegeld Nederland") for line in lines) line.startswith("Statiegeld Nederland") for line in pdf_lines(filepath)
)
def account(self, filepath): def account(self, filepath):
return "Inkomsten:Statiegeld" return "Inkomsten:Statiegeld"
def tx_ref(self, filepath): def tx_ref(self, filepath):
lines = pdf_to_text(filepath).split("\n") for line in pdf_lines(filepath):
for line in lines:
if m := re.search(r"^Factuurnr : (RP\d+)$", line): if m := re.search(r"^Factuurnr : (RP\d+)$", line):
return f"SNL-{m[1]}" return f"SNL-{m[1]}"
raise Exception("Mollie invoice reference not found") raise Exception("Mollie invoice reference not found")
@@ -109,8 +101,7 @@ class StatiegeldImporter(Importer):
return f"{self.tx_ref(filepath)}.pdf" return f"{self.tx_ref(filepath)}.pdf"
def date(self, filepath): def date(self, filepath):
lines = pdf_to_text(filepath).split("\n") for line in pdf_lines(filepath):
for line in lines:
if m := re.search(r"^Datum : (\d{2})/(\d{2})/(\d{4})$", line): if m := re.search(r"^Datum : (\d{2})/(\d{2})/(\d{4})$", line):
return date(int(m[3]), int(m[2]), int(m[1])) return date(int(m[3]), int(m[2]), int(m[1]))
raise Exception("Date not found") raise Exception("Date not found")

View file

@@ -22,12 +22,9 @@ set_cachier_params(
) )
@cachier()
def pdf_lines(filename): def pdf_lines(filename):
reader = PdfReader(filename) reader = PdfReader(filename)
return list( return [line for page in reader.pages for line in page.extract_text().split("\n")]
line for page in reader.pages for line in page.extract_text().split("\n")
)
@cachier() @cachier()