import/pdf: Read text from all pages

This commit is contained in:
polyfloyd 2025-06-29 23:53:21 +02:00
parent fe53249bff
commit 5344c5b978
2 changed files with 17 additions and 29 deletions

View file

@ -1,20 +1,16 @@
from pypdf import PdfReader
from beangulp import mimetypes, Importer
from beangulp.cache import cache
import re
from datetime import date
from beancount.core.data import Transaction, Posting, Document
from beancount.core import flags
from beancount.core import data
from beancount.core import amount
from beancount.core import amount, data, flags
from beancount.core.data import Document, Posting, Transaction
from beancount.core.number import D
from beangulp import Importer, mimetypes
from pypdf import PdfReader
# Diff view of the PDF text helper: pre-change and post-change lines appear
# together without +/- markers. Judging by the commit title ("Read text from
# all pages"), pdf_to_text is presumably the removed variant and pdf_lines the
# added one -- confirm against the real diff.
@cache
def pdf_to_text(filename):
def pdf_lines(filename):
reader = PdfReader(filename)
# First variant: takes only reader.pages[0] and returns its extracted text as
# a single string, so text on later pages is never seen.
page1 = reader.pages[0]
return page1.extract_text()
# Second variant: walks every page in reader.pages, splits each page's
# extracted text on "\n", and returns all lines as one flat list.
return [line for page in reader.pages for line in page.extract_text().split("\n")]
class MollieInvoiceImporter(Importer):
@ -23,15 +19,13 @@ class MollieInvoiceImporter(Importer):
if mimetype != "application/pdf":
return False
lines = pdf_to_text(filepath).split("\n")
return any(line.startswith("Mollie B.V.") for line in lines)
return any(line.startswith("Mollie B.V.") for line in pdf_lines(filepath))
# Returns a fixed account name; the filepath argument is not consulted.
def account(self, filepath):
return "Uitgaven:Bankkosten"
# Looks up the invoice reference in the PDF text.
# Returns the "MOL-..." capture from the first line that matches; raises
# Exception when no line matches.
def tx_ref(self, filepath):
# Diff view without +/- markers: the pdf_to_text pair and the pdf_lines loop
# below are alternate versions of the same iteration (presumably old, then new).
lines = pdf_to_text(filepath).split("\n")
for line in lines:
for line in pdf_lines(filepath):
# The pattern is anchored with ^ and $, so the whole line must consist of
# "Invoice reference " followed by the MOL- reference.
if m := re.search(r"^Invoice reference (MOL-.+)$", line):
return m[1]
raise Exception("Mollie invoice reference not found")
@ -40,15 +34,13 @@ class MollieInvoiceImporter(Importer):
return f"{self.tx_ref(filepath)}.pdf"
# Looks up the invoice date in the PDF text.
# Returns a datetime.date built from the first matching line; raises Exception
# when no line matches.
def date(self, filepath):
# Diff view without +/- markers: the pdf_to_text pair and the pdf_lines loop
# below are alternate versions of the same iteration (presumably old, then new).
lines = pdf_to_text(filepath).split("\n")
for line in lines:
for line in pdf_lines(filepath):
if m := re.search(r"^Invoice date (\d{2})-(\d{2})-(\d{4})$", line):
# Groups are passed as (year=m[3], month=m[2], day=m[1]), i.e. the text is
# read as day-month-year.
return date(int(m[3]), int(m[2]), int(m[1]))
raise Exception("Mollie invoice date not found")
def extract(self, filepath, existing):
lines = pdf_to_text(filepath).split("\n")
for line in lines:
for line in pdf_lines(filepath):
if m := re.search(r"^Total including VAT ([\d.]+) EUR$", line):
total = D(m[1])
break
@ -92,15 +84,15 @@ class StatiegeldImporter(Importer):
if mimetype != "application/pdf":
return False
lines = pdf_to_text(filepath).split("\n")
return any(line.startswith("Statiegeld Nederland") for line in lines)
return any(
line.startswith("Statiegeld Nederland") for line in pdf_lines(filepath)
)
# Returns a fixed account name; the filepath argument is not consulted.
def account(self, filepath):
return "Inkomsten:Statiegeld"
# Looks up the invoice number in the PDF text.
# Returns "SNL-" followed by the "RP<digits>" capture from the first line that
# matches; raises Exception when no line matches.
def tx_ref(self, filepath):
# Diff view without +/- markers: the pdf_to_text pair and the pdf_lines loop
# below are alternate versions of the same iteration (presumably old, then new).
lines = pdf_to_text(filepath).split("\n")
for line in lines:
for line in pdf_lines(filepath):
# The pattern is anchored with ^ and $, so the whole line must be
# "Factuurnr : RP<digits>".
if m := re.search(r"^Factuurnr : (RP\d+)$", line):
return f"SNL-{m[1]}"
# NOTE(review): the message names Mollie, but the hunk header places this
# method in StatiegeldImporter and the pattern/prefix are Statiegeld-specific
# -- looks like a copy-paste leftover; consider correcting the wording.
raise Exception("Mollie invoice reference not found")
@ -109,8 +101,7 @@ class StatiegeldImporter(Importer):
return f"{self.tx_ref(filepath)}.pdf"
# Looks up the invoice date in the PDF text.
# Returns a datetime.date built from the first matching line; raises Exception
# when no line matches.
def date(self, filepath):
# Diff view without +/- markers: the pdf_to_text pair and the pdf_lines loop
# below are alternate versions of the same iteration (presumably old, then new).
lines = pdf_to_text(filepath).split("\n")
for line in lines:
for line in pdf_lines(filepath):
if m := re.search(r"^Datum : (\d{2})/(\d{2})/(\d{4})$", line):
# Groups are passed as (year=m[3], month=m[2], day=m[1]), i.e. the text is
# read as day/month/year.
return date(int(m[3]), int(m[2]), int(m[1]))
raise Exception("Date not found")

View file

@ -22,12 +22,9 @@ set_cachier_params(
)
# Returns every line of text in the PDF as one flat list: each page's extracted
# text is split on "\n" and the pieces are concatenated in page order.
# Diff view without +/- markers: the list(<generator>) form and the list
# comprehension below are two spellings of the same result (presumably the
# former is removed and the latter added -- confirm against the real diff).
@cachier()
def pdf_lines(filename):
reader = PdfReader(filename)
return list(
line for page in reader.pages for line in page.extract_text().split("\n")
)
return [line for page in reader.pages for line in page.extract_text().split("\n")]
@cachier()