#!/usr/bin/env python3
"""Beangulp importer that reads PDF invoices through an Ollama chat model."""

import re
import sys
from datetime import datetime, timedelta
from locale import LC_TIME, setlocale
from locale import Error as LocaleError
from os.path import basename

from beancount.core import amount, data, flags
from beancount.core.data import Document, Posting, Transaction
from beancount.core.number import ZERO, D
from beangulp import Importer, Ingest, mimetypes
from beangulp.testing import main
from cachier import set_global_params as set_cachier_params, cachier
from ollama import chat
from pypdf import PdfReader

# Model answers are cached on disk so repeated runs do not re-query the model.
set_cachier_params(
    cache_dir="~/.cache/beangulp",
    stale_after=timedelta(days=3),
    separate_files=True,
)

# strptime patterns tried, in order, against the normalised "a-b-c" date string.
_DATE_FORMATS = (
    "%Y-%m-%d",
    "%d-%m-%Y",
    "%d-%b-%Y",
    "%Y-%b-%d",
    "%d-%B-%Y",
    "%Y-%B-%d",
    "%b-%d-%Y",
    "%B-%d-%Y",
)

# LC_TIME locales used to interpret month names: Dutch first, then "C" so that
# English month names are recognised as well.
_DATE_LOCALES = ("nl_NL.UTF-8", "C")


def pdf_lines(filename):
    """Return every text line of the PDF at *filename*, across all pages."""
    reader = PdfReader(filename)
    return [line for page in reader.pages for line in page.extract_text().split("\n")]


@cachier()
def pdf_query(filename, model, query):
    """Ask *model* the question *query* about the invoice text of *filename*.

    The PDF lines are joined with ``;`` and sent as a first user message,
    followed by *query* as a second one. Returns the content of the model's
    reply. The call is wrapped by ``cachier``.
    """
    text = ";".join(pdf_lines(filename))
    resp = chat(
        model=model,
        messages=[
            {"role": "user", "content": f"Given the text from an invoice:\n{text}"},
            {"role": "user", "content": query},
        ],
    )
    return resp.message.content


def _normalize_amount(text):
    """Rewrite an amount string from the model into a form ``D`` can parse.

    Surrounding whitespace and a leading euro sign are dropped and a decimal
    comma becomes a decimal point. When both ``.`` and ``,`` occur, the
    right-most one is taken as the decimal mark and the other is removed as a
    thousands separator.
    """
    cleaned = text.strip().lstrip("€ ")
    last_dot = cleaned.rfind(".")
    last_comma = cleaned.rfind(",")
    if last_dot != -1 and last_comma != -1:
        thousands = "." if last_comma > last_dot else ","
        cleaned = cleaned.replace(thousands, "")
    return cleaned.replace(",", ".")


class OllamaImporter(Importer):
    """Importer that books PDF invoices using answers from an Ollama model."""

    def __init__(self, account, model):
        """Store the target *account* and the name of the Ollama *model*."""
        self.file_account = account
        self.model = model

    def identify(self, filepath):
        """Return True when the guessed MIME type of *filepath* is PDF."""
        mimetype, _ = mimetypes.guess_type(filepath)
        return mimetype == "application/pdf"

    def account(self, filepath):
        """Return the account given at construction, whatever the file."""
        return self.file_account

    def filename(self, filepath):
        """Return the base name of *filepath* prefixed with the short supplier."""
        return f"{self.supplier_short(filepath)}_{basename(filepath)}"

    def date(self, filepath):
        """Return the invoice date reported by the model as a ``date``.

        The reply is normalised to at most three ``-``-separated fields and
        matched against ``_DATE_FORMATS`` under each locale in
        ``_DATE_LOCALES``. If nothing matches, a message is printed to stderr
        and today's date is returned.
        """
        # gemma3:12b - 10/10, but format varies
        resp = pdf_query(filepath, self.model, "Output only the date of this invoice")
        resp = resp.strip()
        resp = re.sub(r"[,./ \n]+", "-", resp)
        resp = "-".join(resp.split("-")[:3])

        # Remember the active LC_TIME so it can be put back exactly as it was.
        previous = setlocale(LC_TIME)
        try:
            for locale_name in _DATE_LOCALES:
                try:
                    setlocale(LC_TIME, locale_name)
                except LocaleError:
                    # Locale not installed on this system; try the next one.
                    continue
                for fmt in _DATE_FORMATS:
                    try:
                        return datetime.strptime(resp, fmt).date()
                    except ValueError:
                        continue
        finally:
            setlocale(LC_TIME, previous)

        print(f"could not parse date from: {resp}", file=sys.stderr)
        return datetime.now().date()

    def tx_ref(self, filepath):
        """Return a link of the form ``<short supplier>_<invoice id>``.

        Characters other than ASCII letters, digits, ``_`` and ``-`` are
        removed from the model's answer.
        """
        # gemma3:12b - 5/10
        resp = pdf_query(filepath, self.model, "Output only the ID of this invoice")
        resp = re.sub(r"[^a-zA-Z0-9_-]", "", resp.strip())
        return f"{self.supplier_short(filepath)}_{resp}"

    def supplier(self, filepath):
        """Return the first line of the model's answer for the supplier name."""
        # gemma3:12b - 10/10
        resp = pdf_query(
            filepath,
            self.model,
            "Output only the name of the supplier of this invoice",
        )
        return resp.split("\n", 1)[0].strip()

    def supplier_short(self, filepath):
        """Return the supplier name without spaces, cut to 8 chars, uppercased."""
        s = self.supplier(filepath)
        return s.replace(" ", "")[:8].strip().upper()

    def extract(self, filepath, existing):
        """Build a transaction and a document entry for the invoice.

        Returns ``[Transaction, Document]``. The transaction has a single
        posting of minus the invoice total in EUR on the importer's account,
        flagged with ``FLAG_WARNING``. If the total cannot be parsed, a
        message is printed to stderr and zero is booked instead.
        """
        name = self.filename(filepath)
        date = self.date(filepath)
        link = self.tx_ref(filepath)
        account = self.account(filepath)

        narration = pdf_query(
            filepath,
            self.model,
            "Output only a single line describing the products of this invoice",
        )
        narration = narration.split("\n", 1)[0].strip()
        supplier = self.supplier(filepath)

        total_amount_str = pdf_query(
            filepath,
            self.model,
            "Output only the total amount to be paid of this invoice",
        )
        total_amount_str = _normalize_amount(total_amount_str)
        try:
            total_amount = D(total_amount_str)
        except (ValueError, ArithmeticError):
            print(
                f"could not parse amount from: {total_amount_str}",
                file=sys.stderr,
            )
            total_amount = ZERO
        units = amount.Amount(total_amount, "EUR")

        meta = data.new_metadata(filepath, 0)
        tx = Transaction(
            meta=meta,
            date=date,
            flag=flags.FLAG_OKAY,
            payee=None,
            narration=f"{supplier}: {narration}",
            tags=set(),
            links={link},
            postings=[
                Posting(account, -units, None, None, flags.FLAG_WARNING, None),
            ],
        )
        doc = Document(
            meta=meta,
            date=date,
            account=account,
            filename=f"../docs/Passiva/Crediteuren/{date}.{name}",
            tags=set(),
            links={link},
        )
        return [tx, doc]


if __name__ == "__main__":
    importers = [
        OllamaImporter("Passiva:Crediteuren", "gemma3:4b"),
    ]
    hooks = []
    # NOTE: this rebinds the name ``main`` imported from beangulp.testing.
    main = Ingest(importers, hooks)
    main()