"""Beangulp importers for Mollie and Statiegeld Nederland PDF invoices."""

import re
from datetime import date

from beancount.core import amount
from beancount.core import data
from beancount.core import flags
from beancount.core.data import Transaction, Posting, Document
from beancount.core.number import D
from beangulp import mimetypes, Importer
from beangulp.cache import cache
from pypdf import PdfReader

# Line patterns are anchored on both ends: each one must match a whole
# line of the text extracted from the invoice's first page.
_MOLLIE_REFERENCE_RE = re.compile(r"^Invoice reference (MOL-.+)$")
_MOLLIE_DATE_RE = re.compile(r"^Invoice date (\d{2})-(\d{2})-(\d{4})$")
_MOLLIE_TOTAL_RE = re.compile(r"^Total including VAT ([\d.]+) EUR$")
_STATIEGELD_REFERENCE_RE = re.compile(r"^Factuurnr : (RP\d+)$")
_STATIEGELD_DATE_RE = re.compile(r"^Datum : (\d{2})/(\d{2})/(\d{4})$")


@cache
def pdf_to_text(filename):
    """Return the text extracted from the first page of a PDF file.

    Only the first page is read; every field the importers below look for
    is searched in that text. A PDF without any pages yields an empty
    string, so that identification simply reports "no match" instead of
    failing with an IndexError.
    """
    reader = PdfReader(filename)
    if not reader.pages:
        return ""
    return reader.pages[0].extract_text()


def _pdf_has_line_prefix(filepath, prefix):
    """Return True if filepath is a PDF with a first-page line starting with prefix."""
    mimetype, _encoding = mimetypes.guess_type(filepath)
    if mimetype != "application/pdf":
        return False
    return any(
        line.startswith(prefix) for line in pdf_to_text(filepath).split("\n")
    )


def _first_match(filepath, pattern, error):
    """Return the first line match of pattern in the PDF text, or raise Exception(error)."""
    for line in pdf_to_text(filepath).split("\n"):
        if m := pattern.search(line):
            return m
    raise Exception(error)


def _dmy_to_date(match):
    """Build a date from a match whose groups 1-3 are day, month and year."""
    # `date` here is datetime.date (module global), not an importer method.
    return date(int(match[3]), int(match[2]), int(match[1]))


class MollieInvoiceImporter(Importer):
    """Importer for Mollie B.V. invoices.

    Produces one transaction booking the invoice total (including VAT) to
    the bank-costs account against "Passiva:RevBank", plus a document
    entry for the PDF. Both carry the invoice reference as a link.
    """

    def identify(self, filepath):
        """Return True for PDFs with a first-page line starting with "Mollie B.V."."""
        return _pdf_has_line_prefix(filepath, "Mollie B.V.")

    def account(self, filepath):
        """Return the account the invoice is booked and filed under."""
        return "Uitgaven:Bankkosten"

    def tx_ref(self, filepath):
        """Return the invoice reference ("MOL-...").

        Raises:
            Exception: if no "Invoice reference" line is found.
        """
        match = _first_match(
            filepath, _MOLLIE_REFERENCE_RE, "Mollie invoice reference not found"
        )
        return match[1]

    def filename(self, filepath):
        """Return the file name for the invoice: "<reference>.pdf"."""
        return f"{self.tx_ref(filepath)}.pdf"

    def date(self, filepath):
        """Return the invoice date, parsed from a "DD-MM-YYYY" line.

        Raises:
            Exception: if no "Invoice date" line is found.
        """
        match = _first_match(
            filepath, _MOLLIE_DATE_RE, "Mollie invoice date not found"
        )
        return _dmy_to_date(match)

    def extract(self, filepath, existing):
        """Return ``[transaction, document]`` for the invoice.

        Args:
            filepath: path of the invoice PDF.
            existing: existing entries; not used by this importer.

        Raises:
            Exception: if the total, reference or date cannot be found.
        """
        total_match = _first_match(
            filepath, _MOLLIE_TOTAL_RE, "Mollie invoice total not found"
        )
        total = D(total_match[1])

        contra = "Passiva:RevBank"
        name = self.filename(filepath)
        # Named invoice_date so the datetime.date import is not shadowed.
        invoice_date = self.date(filepath)
        link = self.tx_ref(filepath)
        expense_account = self.account(filepath)
        units = amount.Amount(total, "EUR")

        doc = Document(
            meta=data.new_metadata(filepath, 0),
            date=invoice_date,
            account=expense_account,
            filename=f"docs/Uitgaven/Bankkosten/{invoice_date}.{name}",
            tags=set(),
            links={link},
        )
        tx = Transaction(
            meta=data.new_metadata(filepath, 0),
            date=invoice_date,
            flag=flags.FLAG_OKAY,
            payee="Mollie B.V.",
            narration="iDeal transactiekosten",
            tags=set(),
            links={link},
            postings=[
                Posting(expense_account, units, None, None, None, None),
                Posting(contra, -units, None, None, None, None),
            ],
        )
        return [tx, doc]


class StatiegeldImporter(Importer):
    """Importer for Statiegeld Nederland invoices.

    Only files the PDF as a document entry; no transaction is generated.
    """

    def identify(self, filepath):
        """Return True for PDFs with a first-page line starting with "Statiegeld Nederland"."""
        return _pdf_has_line_prefix(filepath, "Statiegeld Nederland")

    def account(self, filepath):
        """Return the account the invoice is filed under."""
        return "Inkomsten:Statiegeld"

    def tx_ref(self, filepath):
        """Return the invoice number prefixed with "SNL-" (e.g. "SNL-RP123").

        Raises:
            Exception: if no "Factuurnr" line is found.
        """
        match = _first_match(
            filepath,
            _STATIEGELD_REFERENCE_RE,
            "Statiegeld invoice reference not found",
        )
        return f"SNL-{match[1]}"

    def filename(self, filepath):
        """Return the file name for the invoice: "<reference>.pdf"."""
        return f"{self.tx_ref(filepath)}.pdf"

    def date(self, filepath):
        """Return the invoice date, parsed from a "DD/MM/YYYY" line.

        Raises:
            Exception: if no "Datum" line is found.
        """
        match = _first_match(filepath, _STATIEGELD_DATE_RE, "Date not found")
        return _dmy_to_date(match)

    def extract(self, filepath, existing):
        """Return a single-element list holding the document entry.

        Args:
            filepath: path of the invoice PDF.
            existing: existing entries; not used by this importer.

        Raises:
            Exception: if the reference or date cannot be found.
        """
        name = self.filename(filepath)
        # Named invoice_date so the datetime.date import is not shadowed.
        invoice_date = self.date(filepath)
        link = self.tx_ref(filepath)

        doc = Document(
            meta=data.new_metadata(filepath, 0),
            date=invoice_date,
            account=self.account(filepath),
            filename=f"docs/Inkomsten/Statiegeld/{invoice_date}.{name}",
            tags=set(),
            links={link},
        )
        return [doc]