boekhouding-beancount/scripts/import-generic.py
2025-06-29 23:51:10 +02:00

167 lines
4.8 KiB
Python
Executable file

#!/usr/bin/env python3
import re
import sys
from datetime import datetime, timedelta
from locale import LC_TIME, setlocale
from os.path import basename
from beancount.core import amount, data, flags
from beancount.core.data import Document, Posting, Transaction
from beancount.core.number import ZERO, D
from beangulp import Importer, Ingest, mimetypes
from beangulp.testing import main
from cachier import set_global_params as set_cachier_params, cachier
from ollama import chat
from pypdf import PdfReader
# Global cachier settings applied to every @cachier()-decorated function in
# this file: cache location, a three-day staleness window, and the
# separate_files option enabled.
set_cachier_params(
    separate_files=True,
    stale_after=timedelta(days=3),
    cache_dir="~/.cache/beangulp",
)
@cachier()
def pdf_lines(filename):
    """Return the extracted text of the PDF *filename* as a flat list of lines.

    Pages are visited in order and each page's extracted text is split on
    newline characters.  The result is cached by cachier.
    """
    collected = []
    for page in PdfReader(filename).pages:
        collected.extend(page.extract_text().split("\n"))
    return collected
@cachier()
def pdf_query(filename, model, query):
    """Ask the chat model *model* the question *query* about a PDF invoice.

    The PDF's text lines are joined with ";" and sent as a first user
    message; *query* follows as a second user message.  Returns the content
    of the model's reply message.  The result is cached by cachier.
    """
    text = ";".join(pdf_lines(filename))
    conversation = [
        {"role": "user", "content": f"Given the text from an invoice:\n{text}"},
        {"role": "user", "content": query},
    ]
    return chat(model=model, messages=conversation).message.content
class OllamaImporter(Importer):
    """Beangulp importer that lets an Ollama chat model read PDF invoices.

    Any file whose guessed MIME type is ``application/pdf`` is accepted.  The
    invoice date, ID, supplier, description and total are each obtained with
    a separate ``pdf_query`` prompt and combined by ``extract`` into one
    transaction and one document entry.
    """

    # Formats tried, in order, against the normalised "a-b-c" date reply.
    _DATE_FORMATS = (
        "%Y-%m-%d",
        "%d-%m-%Y",
        "%d-%b-%Y",
        "%Y-%b-%d",
        "%d-%B-%Y",
        "%Y-%B-%d",
        "%b-%d-%Y",
        "%B-%d-%Y",
    )

    # First number in a reply: optional sign, a digit, then digits/separators.
    _AMOUNT_RE = re.compile(r"-?\d[\d.,]*")

    def __init__(self, account, model):
        """Store the target *account* name and the Ollama *model* name."""
        self.file_account = account
        self.model = model

    def identify(self, filepath):
        """Return True when *filepath* is guessed to be a PDF."""
        mimetype, _ = mimetypes.guess_type(filepath)
        return mimetype == "application/pdf"

    def account(self, filepath):
        """Return the account given at construction, regardless of file."""
        return self.file_account

    def filename(self, filepath):
        """Return ``<SUPPLIER>_<original basename>`` for *filepath*."""
        return f"{self.supplier_short(filepath)}_{basename(filepath)}"

    def date(self, filepath):
        """Return the invoice date reported by the model as a ``date``.

        The reply is normalised so that runs of ``, . / space newline`` become
        a single ``-`` and only the first three fields are kept.  It is then
        matched against ``_DATE_FORMATS`` with LC_TIME set to nl_NL.UTF-8.
        If that locale is not available, parsing proceeds under the current
        locale.  The previous LC_TIME setting is always restored.  When no
        format matches, a message is written to stderr and today's date is
        returned.
        """
        # Imported here because the file-level import only brings in
        # LC_TIME and setlocale.
        from locale import Error as LocaleError

        # gemma3:12b - 10/10, but format varies
        resp = pdf_query(filepath, self.model, "Output only the date of this invoice")
        resp = re.sub(r"[,./ \n]+", "-", resp.strip())
        resp = "-".join(resp.split("-")[:3])

        # Remember the active setting so it can be put back exactly;
        # setlocale(LC_TIME, "") would switch to the environment's locale
        # rather than restore what was in effect before.
        previous = setlocale(LC_TIME)
        try:
            try:
                setlocale(LC_TIME, "nl_NL.UTF-8")
            except LocaleError:
                print(
                    "locale nl_NL.UTF-8 unavailable; parsing date with current locale",
                    file=sys.stderr,
                )
            for fmt in self._DATE_FORMATS:
                try:
                    return datetime.strptime(resp, fmt).date()
                except ValueError:
                    continue
        finally:
            setlocale(LC_TIME, previous)

        print(f"could not parse date from: {resp}", file=sys.stderr)
        return datetime.now().date()

    def tx_ref(self, filepath):
        """Return ``<SUPPLIER>_<invoice id>`` to use as a link.

        The model's reply is reduced to the characters ``a-zA-Z0-9_-``.
        """
        # gemma3:12b - 5/10
        resp = pdf_query(filepath, self.model, "Output only the ID of this invoice")
        resp = re.sub(r"[^a-zA-Z0-9_-]", "", resp.strip())
        return f"{self.supplier_short(filepath)}_{resp}"

    def supplier(self, filepath):
        """Return the first line of the model's supplier-name reply, stripped."""
        # gemma3:12b - 10/10
        resp = pdf_query(
            filepath,
            self.model,
            "Output only the name of the supplier of this invoice",
        )
        return resp.split("\n", 1)[0].strip()

    def supplier_short(self, filepath):
        """Return the supplier name without spaces, cut to 8 chars, upper-cased."""
        s = self.supplier(filepath)
        return s.replace(" ", "")[:8].strip().upper()

    @classmethod
    def _parse_amount(cls, text):
        """Return the first number found in *text* as a Decimal, or None.

        Surrounding currency symbols or words are ignored.  When both ``,``
        and ``.`` occur, the last one is the decimal mark (``1.234,56`` and
        ``1,234.56`` both give 1234.56).  A separator occurring once is read
        as the decimal mark (``12,50`` gives 12.50); a separator repeated
        without the other kind (``1.234.567``) is treated as grouping only.
        """
        match = cls._AMOUNT_RE.search(text)
        if match is None:
            return None
        # Drop a trailing sentence period or dangling separator ("12,50.").
        token = match.group().rstrip(".,")
        last = max(token.rfind(","), token.rfind("."))
        if last != -1:
            mark = token[last]
            if token.count(mark) > 1:
                token = token.replace(",", "").replace(".", "")
            else:
                grouping = "," if mark == "." else "."
                token = token.replace(grouping, "").replace(mark, ".")
        try:
            return D(token)
        except (ValueError, ArithmeticError):
            return None

    def extract(self, filepath, existing):
        """Build the entries for the invoice at *filepath*.

        Returns a two-element list: a ``Transaction`` with a single posting of
        the negated total (in EUR) on this importer's account, and a
        ``Document`` pointing at the filed copy of the invoice.  Both carry
        the same link from ``tx_ref``.  *existing* is not used.

        If the total cannot be parsed, a message is written to stderr and the
        amount falls back to zero.
        """
        name = self.filename(filepath)
        date = self.date(filepath)
        link = self.tx_ref(filepath)
        narration = pdf_query(
            filepath,
            self.model,
            "Output only a single line describing the products of this invoice",
        )
        narration = narration.split("\n", 1)[0].strip()
        supplier = self.supplier(filepath)
        total_amount_str = pdf_query(
            filepath,
            self.model,
            "Output only the total amount to be paid of this invoice",
        )
        total_amount = self._parse_amount(total_amount_str)
        if total_amount is None:
            print(
                f"could not parse amount from: {total_amount_str.strip()}",
                file=sys.stderr,
            )
            total_amount = ZERO
        units = amount.Amount(total_amount, "EUR")
        meta = data.new_metadata(filepath, 0)
        tx = Transaction(
            meta=meta,
            date=date,
            flag=flags.FLAG_OKAY,
            payee=None,
            narration=f"{supplier}: {narration}",
            tags=set(),
            links={link},
            postings=[
                # The posting itself is flagged with FLAG_WARNING.
                Posting(
                    self.account(filepath), -units, None, None, flags.FLAG_WARNING, None
                ),
            ],
        )
        doc = Document(
            meta=meta,
            date=date,
            account=self.account(filepath),
            filename=f"../docs/Passiva/Crediteuren/{date}.{name}",
            tags=set(),
            links={link},
        )
        return [tx, doc]
if __name__ == "__main__":
    # Script entry point: hand a single Ollama-backed importer, with no
    # hooks, to beangulp's Ingest and invoke it.
    hooks = []
    importers = [OllamaImporter(account="Passiva:Crediteuren", model="gemma3:4b")]
    main = Ingest(importers, hooks)
    main()