Add declaraties

2025-06-29 21:57:46 +02:00 · 2025-06-29 21:57:46 +02:00 · a75c51ec42
commit a75c51ec42
parent 6d00df1225
38 changed files with 485 additions and 23 deletions
--- a/scripts/import-generic.py
+++ b/scripts/import-generic.py
@ -0,0 +1,167 @@
+#!/usr/bin/env python3
+
+import re
+import sys
+from datetime import datetime, timedelta
+from locale import LC_TIME, setlocale
+from os.path import basename
+
+from beancount.core import amount, data, flags
+from beancount.core.data import Document, Posting, Transaction
+from beancount.core.number import ZERO, D
+from beangulp import Importer, Ingest, mimetypes
+from beangulp.testing import main
+from cachier import set_global_params as set_cachier_params, cachier
+from ollama import chat
+from pypdf import PdfReader
+
+# Global cachier defaults for the cached helpers in this script: results are
+# stored under ~/.cache/beangulp and are considered stale after three days.
+# NOTE(review): separate_files=True presumably keeps one cache file per
+# argument set -- confirm against the cachier documentation.
+set_cachier_params(
+    cache_dir="~/.cache/beangulp",
+    stale_after=timedelta(days=3),
+    separate_files=True,
+)
+
+
+@cachier()
+def pdf_lines(filename):
+    reader = PdfReader(filename)
+    return list(
+        line for page in reader.pages for line in page.extract_text().split("\n")
+    )
+
+
+@cachier()
+def pdf_query(filename, model, query):
+    text = ";".join(pdf_lines(filename))
+    resp = chat(
+        model=model,
+        messages=[
+            {"role": "user", "content": f"Given the text from an invoice:\n{text}"},
+            {"role": "user", "content": query},
+        ],
+    )
+    return resp.message.content
+
+
+class OllamaImporter(Importer):
+    def __init__(self, account, model):
+        self.file_account = account
+        self.model = model
+
+    def identify(self, filepath):
+        mimetype, encoding = mimetypes.guess_type(filepath)
+        return mimetype == "application/pdf"
+
+    def account(self, filepath):
+        return self.file_account
+
+    def filename(self, filepath):
+        return f"{self.supplier_short(filepath)}_{basename(filepath)}"
+
+    def date(self, filepath):
+        # gemma3:12b - 10/10, but format varies
+        resp = pdf_query(filepath, self.model, "Output only the date of this invoice")
+        resp = resp.strip()
+        resp = re.sub(r"[,./ \n]+", "-", resp)
+        resp = "-".join(resp.split("-")[:3])
+
+        try:
+            setlocale(LC_TIME, "nl_NL.UTF-8")
+            for fmt in [
+                "%Y-%m-%d",
+                "%d-%m-%Y",
+                "%d-%b-%Y",
+                "%Y-%b-%d",
+                "%d-%B-%Y",
+                "%Y-%B-%d",
+                "%b-%d-%Y",
+                "%B-%d-%Y",
+            ]:
+                try:
+                    return datetime.strptime(resp, fmt).date()
+                except Exception:
+                    continue
+        finally:
+            setlocale(LC_TIME, "")  # Restore default.
+        print(f"could not parse date from: {resp}", file=sys.stderr)
+        return datetime.now().date()
+
+    def tx_ref(self, filepath):
+        # gemma3:12b - 5/10
+        resp = pdf_query(filepath, self.model, "Output only the ID of this invoice")
+        resp = re.sub(r"[^a-zA-Z0-9_-]", "", resp.strip())
+        return f"{self.supplier_short(filepath)}_{resp}"
+
+    def supplier(self, filepath):
+        # gemma3:12b - 10/10
+        resp = pdf_query(
+            filepath,
+            self.model,
+            "Output only the name of the supplier of this invoice",
+        )
+        return resp.split("\n", 1)[0].strip()
+
+    def supplier_short(self, filepath):
+        s = self.supplier(filepath)
+        return s.replace(" ", "")[:8].strip().upper()
+
+    def extract(self, filepath, existing):
+        name = self.filename(filepath)
+        date = self.date(filepath)
+        link = self.tx_ref(filepath)
+
+        narration = pdf_query(
+            filepath,
+            self.model,
+            "Output only a single line describing the products of this invoice",
+        )
+        narration = narration.split("\n", 1)[0].strip()
+
+        supplier = self.supplier(filepath)
+
+        total_amount_str = pdf_query(
+            filepath,
+            self.model,
+            "Output only the total amount to be paid of this invoice",
+        )
+        total_amount_str = total_amount_str.replace(",", ".").lstrip("€ ")
+        try:
+            total_amount = D(total_amount_str)
+        except Exception:
+            total_amount = ZERO
+        units = amount.Amount(total_amount, "EUR")
+
+        meta = data.new_metadata(filepath, 0)
+
+        tx = Transaction(
+            meta=meta,
+            date=date,
+            flag=flags.FLAG_OKAY,
+            payee=None,
+            narration=f"{supplier}: {narration}",
+            tags=set(),
+            links={link},
+            postings=[
+                Posting(
+                    self.account(filepath), -units, None, None, flags.FLAG_WARNING, None
+                ),
+            ],
+        )
+        doc = Document(
+            meta=meta,
+            date=date,
+            account=self.account(filepath),
+            filename=f"../docs/Passiva/Crediteuren/{date}.{name}",
+            tags=set(),
+            links={link},
+        )
+        return [tx, doc]
+
+
+if __name__ == "__main__":
+    importers = [
+        OllamaImporter("Passiva:Crediteuren", "gemma3:4b"),
+    ]
+    hooks = []
+    main = Ingest(importers, hooks)
+    main()
--- a/scripts/ocr-image.sh
+++ b/scripts/ocr-image.sh
@ -0,0 +1,9 @@
+#!/bin/bash -eux
+#
+# Convert an image to a PDF with ImageMagick and run ocrmypdf on it, writing
+# the result next to the input with a .pdf extension.
+# Usage: ocr-image.sh IMAGE
+
+in=$1
+out=$(dirname -- "$in")/$(basename -- "$in" ".${in##*.}").pdf
+tmp=$(mktemp --suffix=.pdf)
+# Remove the intermediate PDF on every exit path; with -e a failing magick or
+# ocrmypdf call would otherwise leave it behind.
+trap 'rm -f -- "$tmp"' EXIT
+
+magick "$in" "$tmp"
+ocrmypdf "$tmp" "$out"