From 5344c5b978d24ad8fa04c749af5ae4457b586274 Mon Sep 17 00:00:00 2001 From: polyfloyd Date: Sun, 29 Jun 2025 23:53:21 +0200 Subject: [PATCH] import/pdf: Read text from all pages --- import/pdf.py | 41 +++++++++++++++------------------------ scripts/import-generic.py | 5 +---- 2 files changed, 17 insertions(+), 29 deletions(-) diff --git a/import/pdf.py b/import/pdf.py index 051d8fc..cd35968 100644 --- a/import/pdf.py +++ b/import/pdf.py @@ -1,20 +1,16 @@ -from pypdf import PdfReader -from beangulp import mimetypes, Importer -from beangulp.cache import cache import re from datetime import date -from beancount.core.data import Transaction, Posting, Document -from beancount.core import flags -from beancount.core import data -from beancount.core import amount + +from beancount.core import amount, data, flags +from beancount.core.data import Document, Posting, Transaction from beancount.core.number import D +from beangulp import Importer, mimetypes +from pypdf import PdfReader -@cache -def pdf_to_text(filename): +def pdf_lines(filename): reader = PdfReader(filename) - page1 = reader.pages[0] - return page1.extract_text() + return [line for page in reader.pages for line in page.extract_text().split("\n")] class MollieInvoiceImporter(Importer): @@ -23,15 +19,13 @@ class MollieInvoiceImporter(Importer): if mimetype != "application/pdf": return False - lines = pdf_to_text(filepath).split("\n") - return any(line.startswith("Mollie B.V.") for line in lines) + return any(line.startswith("Mollie B.V.") for line in pdf_lines(filepath)) def account(self, filepath): return "Uitgaven:Bankkosten" def tx_ref(self, filepath): - lines = pdf_to_text(filepath).split("\n") - for line in lines: + for line in pdf_lines(filepath): if m := re.search(r"^Invoice reference (MOL-.+)$", line): return m[1] raise Exception("Mollie invoice reference not found") @@ -40,15 +34,13 @@ class MollieInvoiceImporter(Importer): return f"{self.tx_ref(filepath)}.pdf" def date(self, filepath): - lines = pdf_to_text(filepath).split("\n") - for line in lines: + for line in pdf_lines(filepath): if m := re.search(r"^Invoice date (\d{2})-(\d{2})-(\d{4})$", line): return date(int(m[3]), int(m[2]), int(m[1])) raise Exception("Mollie invoice date not found") def extract(self, filepath, existing): - lines = pdf_to_text(filepath).split("\n") - for line in lines: + for line in pdf_lines(filepath): if m := re.search(r"^Total including VAT ([\d.]+) EUR$", line): total = D(m[1]) break @@ -92,15 +84,15 @@ class StatiegeldImporter(Importer): if mimetype != "application/pdf": return False - lines = pdf_to_text(filepath).split("\n") - return any(line.startswith("Statiegeld Nederland") for line in lines) + return any( + line.startswith("Statiegeld Nederland") for line in pdf_lines(filepath) + ) def account(self, filepath): return "Inkomsten:Statiegeld" def tx_ref(self, filepath): - lines = pdf_to_text(filepath).split("\n") - for line in lines: + for line in pdf_lines(filepath): if m := re.search(r"^Factuurnr : (RP\d+)$", line): return f"SNL-{m[1]}" raise Exception("Mollie invoice reference not found") @@ -109,8 +101,7 @@ class StatiegeldImporter(Importer): return f"{self.tx_ref(filepath)}.pdf" def date(self, filepath): - lines = pdf_to_text(filepath).split("\n") - for line in lines: + for line in pdf_lines(filepath): if m := re.search(r"^Datum : (\d{2})/(\d{2})/(\d{4})$", line): return date(int(m[3]), int(m[2]), int(m[1])) raise Exception("Date not found") diff --git a/scripts/import-generic.py b/scripts/import-generic.py index 1507d61..47e3a6b 100755 --- a/scripts/import-generic.py +++ b/scripts/import-generic.py @@ -22,12 +22,9 @@ set_cachier_params( ) -@cachier() def pdf_lines(filename): reader = PdfReader(filename) - return list( - line for page in reader.pages for line in page.extract_text().split("\n") - ) + return [line for page in reader.pages for line in page.extract_text().split("\n")] @cachier()