boekhouding-beancount/scripts/import-generic.py

#!/usr/bin/env python3

import re
import sys
from datetime import datetime, timedelta
from locale import LC_TIME, setlocale
from os.path import basename

from beancount.core import amount, data, flags
from beancount.core.data import Document, Posting, Transaction
from beancount.core.number import ZERO, D
from beangulp import Importer, Ingest, mimetypes
from beangulp.testing import main
from cachier import set_global_params as set_cachier_params, cachier
from ollama import chat
from pypdf import PdfReader

# Global cachier configuration: cached results are stored under
# ~/.cache/beangulp, one file per distinct call, and are treated as stale
# after three days.
_cachier_settings = {
    "cache_dir": "~/.cache/beangulp",
    "stale_after": timedelta(days=3),
    "separate_files": True,
}
set_cachier_params(**_cachier_settings)


@cachier()
def pdf_lines(filename):
    """Return the text lines of every page of the PDF at *filename*.

    Each page's extracted text is split on newline characters and the
    pieces are concatenated in page order.  The result is cached by cachier.
    """
    collected = []
    for page in PdfReader(filename).pages:
        collected.extend(page.extract_text().split("\n"))
    return collected


@cachier()
def pdf_query(filename, model, query):
    """Ask the chat model *model* the question *query* about a PDF invoice.

    The PDF's text lines are joined with ";" and sent as a first user
    message, followed by *query* as a second user message.  Returns the
    content of the model's reply; the result is cached by cachier.
    """
    invoice_text = ";".join(pdf_lines(filename))
    conversation = [
        {
            "role": "user",
            "content": f"Given the text from an invoice:\n{invoice_text}",
        },
        {"role": "user", "content": query},
    ]
    reply = chat(model=model, messages=conversation)
    return reply.message.content


class OllamaImporter(Importer):
    """Importer that books PDF invoices from fields extracted by a chat model.

    Every field (date, invoice ID, supplier, description, total) is obtained
    by asking the model a short question about the PDF text via
    ``pdf_query``.  The answers are free-form text, so each accessor
    normalises the reply and falls back to a harmless default (with a
    message on stderr) when the reply cannot be parsed.
    """

    # Locales tried, in order, when parsing month names.  Dutch comes first
    # (as before); "C" additionally covers English month names.
    _DATE_LOCALES = ("nl_NL.UTF-8", "C")

    # strptime formats tried against the normalised, dash-separated date.
    _DATE_FORMATS = (
        "%Y-%m-%d",
        "%d-%m-%Y",
        "%d-%b-%Y",
        "%Y-%b-%d",
        "%d-%B-%Y",
        "%Y-%B-%d",
        "%b-%d-%Y",
        "%B-%d-%Y",
    )

    def __init__(self, account, model):
        """Store the target *account* and the name of the chat *model*."""
        self.file_account = account
        self.model = model

    @staticmethod
    def _first_line(text):
        """Return the first non-blank line of *text*, stripped ("" if none)."""
        for line in text.splitlines():
            line = line.strip()
            if line:
                return line
        return ""

    @staticmethod
    def _parse_amount(text):
        """Convert a model reply such as "€ 1.234,56" into a decimal number.

        Whitespace and the currency markers "€"/"EUR" are removed, as is a
        trailing "." or ",".  When both "." and "," occur, the right-most one
        is taken as the decimal separator and the other as grouping; a
        separator that occurs more than once is treated as grouping; a single
        separator is treated as the decimal point.

        Raises ValueError (or an ArithmeticError subclass, depending on how
        ``D`` reports failures) when no number can be parsed.
        """
        digits = re.sub(r"(?i)eur|€|\s", "", text).rstrip(".,")
        if not digits:
            raise ValueError("empty amount")
        if "." in digits and "," in digits:
            if digits.rfind(",") > digits.rfind("."):
                # European style: 1.234,56
                digits = digits.replace(".", "").replace(",", ".")
            else:
                # Anglo style: 1,234.56
                digits = digits.replace(",", "")
        elif digits.count(",") > 1:
            digits = digits.replace(",", "")
        elif digits.count(".") > 1:
            digits = digits.replace(".", "")
        else:
            digits = digits.replace(",", ".")
        return D(digits)

    def identify(self, filepath):
        """Return True when *filepath* is guessed to be a PDF file."""
        mimetype, _ = mimetypes.guess_type(filepath)
        return mimetype == "application/pdf"

    def account(self, filepath):
        """Return the account configured for this importer."""
        return self.file_account

    def filename(self, filepath):
        """Return the archive name: short supplier code plus original basename."""
        return f"{self.supplier_short(filepath)}_{basename(filepath)}"

    def date(self, filepath):
        """Return the invoice date reported by the model.

        The reply is normalised to at most three dash-separated components
        and matched against ``_DATE_FORMATS`` under each locale in
        ``_DATE_LOCALES``.  Locales that are not installed are skipped, and
        the LC_TIME setting that was active on entry is restored afterwards.
        Falls back to today's date (with a message on stderr) when nothing
        matches.
        """
        from locale import Error as LocaleError

        # gemma3:12b - 10/10, but format varies
        resp = pdf_query(filepath, self.model, "Output only the date of this invoice")
        resp = resp.strip()
        resp = re.sub(r"[,./ \n]+", "-", resp)
        resp = "-".join(resp.split("-")[:3])

        previous = setlocale(LC_TIME)  # Query only; restored in ``finally``.
        try:
            for locale_name in self._DATE_LOCALES:
                try:
                    setlocale(LC_TIME, locale_name)
                except LocaleError:
                    # Locale not available on this system; try the next one.
                    continue
                for fmt in self._DATE_FORMATS:
                    try:
                        return datetime.strptime(resp, fmt).date()
                    except ValueError:
                        continue
        finally:
            setlocale(LC_TIME, previous)
        print(f"could not parse date from: {resp}", file=sys.stderr)
        return datetime.now().date()

    def tx_ref(self, filepath):
        """Return a link of the form "<SUPPLIER>_<invoice id>".

        The whole string is reduced to ASCII letters, digits, "_" and "-" so
        that the supplier prefix cannot introduce characters that are not
        valid in a link.
        """
        # gemma3:12b - 5/10
        resp = pdf_query(filepath, self.model, "Output only the ID of this invoice")
        ref = f"{self.supplier_short(filepath)}_{resp.strip()}"
        return re.sub(r"[^a-zA-Z0-9_-]", "", ref)

    def supplier(self, filepath):
        """Return the supplier name: the first non-blank line of the reply."""
        # gemma3:12b - 10/10
        resp = pdf_query(
            filepath,
            self.model,
            "Output only the name of the supplier of this invoice",
        )
        return self._first_line(resp)

    def supplier_short(self, filepath):
        """Return an upper-cased code of up to 8 alphanumeric supplier characters.

        Only letters and digits are kept, so the code is safe to embed in a
        file name (no path separators, whitespace or punctuation).
        """
        s = self.supplier(filepath)
        return "".join(ch for ch in s if ch.isalnum())[:8].upper()

    def extract(self, filepath, existing):
        """Return a Transaction and a Document entry for the invoice.

        The transaction carries a single posting on the importer's account
        for the negated invoice total in EUR, flagged with FLAG_WARNING.  A
        total that cannot be parsed is reported on stderr and booked as zero.
        *existing* is not used.
        """
        name = self.filename(filepath)
        invoice_date = self.date(filepath)
        link = self.tx_ref(filepath)
        supplier = self.supplier(filepath)
        account = self.account(filepath)

        narration = self._first_line(
            pdf_query(
                filepath,
                self.model,
                "Output only a single line describing the products of this invoice",
            )
        )

        total_amount_str = pdf_query(
            filepath,
            self.model,
            "Output only the total amount to be paid of this invoice",
        )
        try:
            total_amount = self._parse_amount(total_amount_str)
        except (ValueError, ArithmeticError):
            print(
                f"could not parse amount from: {total_amount_str.strip()}",
                file=sys.stderr,
            )
            total_amount = ZERO
        units = amount.Amount(total_amount, "EUR")

        # Each entry gets its own metadata dict so that later mutation of one
        # entry's metadata cannot leak into the other.
        tx = Transaction(
            meta=data.new_metadata(filepath, 0),
            date=invoice_date,
            flag=flags.FLAG_OKAY,
            payee=None,
            narration=f"{supplier}: {narration}",
            tags=set(),
            links={link},
            postings=[
                Posting(account, -units, None, None, flags.FLAG_WARNING, None),
            ],
        )
        # NOTE(review): the document directory is hard-coded and only matches
        # the account "Passiva:Crediteuren" — confirm before using this
        # importer with another account.
        doc = Document(
            meta=data.new_metadata(filepath, 0),
            date=invoice_date,
            account=account,
            filename=f"../docs/Passiva/Crediteuren/{invoice_date}.{name}",
            tags=set(),
            links={link},
        )
        return [tx, doc]


if __name__ == "__main__":
    # Script entry point: run beangulp's Ingest with the invoice importer.
    importers = [
        OllamaImporter("Passiva:Crediteuren", "gemma3:4b"),
    ]
    hooks = []
    # Bound to ``ingest`` rather than ``main`` so the ``main`` imported from
    # beangulp.testing at the top of the file is not shadowed.
    ingest = Ingest(importers, hooks)
    ingest()