167 lines
4.8 KiB
Python
Executable file
167 lines
4.8 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
|
|
import re
|
|
import sys
|
|
from datetime import datetime, timedelta
|
|
from locale import LC_TIME, setlocale
|
|
from os.path import basename
|
|
|
|
from beancount.core import amount, data, flags
|
|
from beancount.core.data import Document, Posting, Transaction
|
|
from beancount.core.number import ZERO, D
|
|
from beangulp import Importer, Ingest, mimetypes
|
|
from beangulp.testing import main
|
|
from cachier import set_global_params as set_cachier_params, cachier
|
|
from ollama import chat
|
|
from pypdf import PdfReader
|
|
|
|
# Global cachier settings shared by the @cachier()-decorated helpers in this
# file: entries live under ~/.cache/beangulp and are considered stale after
# three days.
set_cachier_params(
    separate_files=True,
    stale_after=timedelta(days=3),
    cache_dir="~/.cache/beangulp",
)
|
|
|
|
|
|
@cachier()
def pdf_lines(filename):
    """Return the text lines of every page of the PDF at *filename*.

    Pages are visited in document order; the text extracted from each page is
    split on "\\n" and the pieces are appended to a single flat list. The
    result is cached by cachier.
    """
    collected = []
    for page in PdfReader(filename).pages:
        collected.extend(page.extract_text().split("\n"))
    return collected
|
|
|
|
|
|
@cachier()
def pdf_query(filename, model, query):
    """Ask the Ollama *model* a question about the PDF at *filename*.

    The PDF text (lines joined with ";") is sent as a first user message,
    followed by *query* as a second user message. Returns the content of the
    model's reply. The result is cached by cachier.
    """
    invoice_text = ";".join(pdf_lines(filename))
    conversation = [
        {"role": "user", "content": f"Given the text from an invoice:\n{invoice_text}"},
        {"role": "user", "content": query},
    ]
    return chat(model=model, messages=conversation).message.content
|
|
|
|
|
|
class OllamaImporter(Importer):
    """Importer that extracts invoice data from PDF files by querying an LLM.

    Every field (date, invoice ID, supplier, description, total) is obtained
    by asking the configured Ollama model a question about the PDF text via
    ``pdf_query``. The answers are free-form text, so each one is normalised
    defensively before use.
    """

    # strptime formats tried, in order, on the normalised "a-b-c" date string.
    _DATE_FORMATS = (
        "%Y-%m-%d",
        "%d-%m-%Y",
        "%d-%b-%Y",
        "%Y-%b-%d",
        "%d-%B-%Y",
        "%Y-%B-%d",
        "%b-%d-%Y",
        "%B-%d-%Y",
    )

    # Locales whose month names are tried when parsing dates. Dutch first (the
    # original behaviour), then "C" so English month names are understood too.
    _DATE_LOCALES = ("nl_NL.UTF-8", "C")

    def __init__(self, account, model):
        """Store the target *account* name and the Ollama *model* name."""
        self.file_account = account
        self.model = model

    def identify(self, filepath):
        """Return True when *filepath* is guessed to be a PDF."""
        mimetype, _encoding = mimetypes.guess_type(filepath)
        return mimetype == "application/pdf"

    def account(self, filepath):
        """Return the account configured at construction time."""
        return self.file_account

    def filename(self, filepath):
        """Return ``<SUPPLIER>_<original basename>`` for *filepath*."""
        return f"{self.supplier_short(filepath)}_{basename(filepath)}"

    def date(self, filepath):
        """Return the invoice date as a ``datetime.date``.

        The model's answer is normalised to at most three "-"-separated
        fields and parsed with each of ``_DATE_FORMATS`` under each locale in
        ``_DATE_LOCALES``. If nothing matches, a message is written to stderr
        and today's date is returned as a best-effort fallback.
        """
        # Local import: only this method needs the exception type.
        from locale import Error as LocaleError

        # gemma3:12b - 10/10, but format varies
        resp = pdf_query(filepath, self.model, "Output only the date of this invoice")
        resp = resp.strip()
        # Collapse every common separator into "-" and keep day/month/year only.
        resp = re.sub(r"[,./ \n]+", "-", resp)
        resp = "-".join(resp.split("-")[:3])

        # Remember the current LC_TIME so it can be restored exactly, rather
        # than switching the process to the environment's locale afterwards.
        previous_locale = setlocale(LC_TIME)
        try:
            for locale_name in self._DATE_LOCALES:
                try:
                    setlocale(LC_TIME, locale_name)
                except LocaleError:
                    # Locale not installed on this system; try the next one.
                    continue
                for fmt in self._DATE_FORMATS:
                    try:
                        return datetime.strptime(resp, fmt).date()
                    except ValueError:
                        continue
        finally:
            setlocale(LC_TIME, previous_locale)

        print(f"could not parse date from: {resp}", file=sys.stderr)
        return datetime.now().date()

    def tx_ref(self, filepath):
        """Return a link string ``<SUPPLIER>_<invoice id>`` for the invoice.

        The invoice ID is reduced to ASCII letters, digits, "_" and "-".
        """
        # gemma3:12b - 5/10
        resp = pdf_query(filepath, self.model, "Output only the ID of this invoice")
        resp = re.sub(r"[^a-zA-Z0-9_-]", "", resp.strip())
        return f"{self.supplier_short(filepath)}_{resp}"

    def supplier(self, filepath):
        """Return the supplier name: the first line of the model's answer."""
        # gemma3:12b - 10/10
        resp = pdf_query(
            filepath,
            self.model,
            "Output only the name of the supplier of this invoice",
        )
        return resp.split("\n", 1)[0].strip()

    def supplier_short(self, filepath):
        """Return an upper-case supplier tag of at most 8 characters.

        Only ASCII letters, digits, "_" and "-" are kept, because the tag is
        embedded in both a file name and a link; characters such as "/" or
        "&" would otherwise leak into them.
        """
        s = self.supplier(filepath)
        return re.sub(r"[^a-zA-Z0-9_-]", "", s)[:8].upper()

    @staticmethod
    def _parse_amount(text):
        """Parse a monetary amount from *text*; return a Decimal or None.

        Whitespace and the currency markers "€", "EUR" and "euro" are removed.
        The remainder must consist solely of an optional sign, digits, "." and
        ",". When both separators occur, the right-most one is taken as the
        decimal separator; a separator that occurs more than once is treated
        as a thousands separator; a single separator is treated as the decimal
        separator. Anything else yields None.
        """
        cleaned = re.sub(r"\s|€|euro?", "", text, flags=re.IGNORECASE)
        cleaned = cleaned.rstrip(".,")
        if not re.fullmatch(r"[-+]?[\d.,]*\d", cleaned):
            return None

        last_dot = cleaned.rfind(".")
        last_comma = cleaned.rfind(",")
        if last_dot >= 0 and last_comma >= 0:
            if last_comma > last_dot:
                # "1.234,56": dots group thousands, comma is the decimal mark.
                cleaned = cleaned.replace(".", "").replace(",", ".")
            else:
                # "1,234.56": commas group thousands.
                cleaned = cleaned.replace(",", "")
        elif cleaned.count(",") > 1:
            cleaned = cleaned.replace(",", "")
        elif cleaned.count(".") > 1:
            cleaned = cleaned.replace(".", "")
        else:
            cleaned = cleaned.replace(",", ".")

        try:
            return D(cleaned)
        except (ValueError, ArithmeticError):
            return None

    def extract(self, filepath, existing):
        """Build the entries for the invoice at *filepath*.

        Returns a two-element list: a Transaction with a single posting of
        minus the invoice total (in EUR) on the configured account, and a
        Document entry pointing at the filed PDF. Both carry the same link.
        If the total cannot be parsed, a message is written to stderr and the
        amount falls back to zero. *existing* is not used.
        """
        name = self.filename(filepath)
        date = self.date(filepath)
        link = self.tx_ref(filepath)

        narration = pdf_query(
            filepath,
            self.model,
            "Output only a single line describing the products of this invoice",
        )
        narration = narration.split("\n", 1)[0].strip()

        supplier = self.supplier(filepath)

        total_amount_str = pdf_query(
            filepath,
            self.model,
            "Output only the total amount to be paid of this invoice",
        )
        total_amount = self._parse_amount(total_amount_str)
        if total_amount is None:
            print(
                f"could not parse amount from: {total_amount_str!r}",
                file=sys.stderr,
            )
            total_amount = ZERO
        units = amount.Amount(total_amount, "EUR")

        account = self.account(filepath)

        tx = Transaction(
            # Each entry gets its own metadata dict so that later per-entry
            # annotations on one do not show up on the other.
            meta=data.new_metadata(filepath, 0),
            date=date,
            flag=flags.FLAG_OKAY,
            payee=None,
            narration=f"{supplier}: {narration}",
            tags=set(),
            links={link},
            postings=[
                Posting(account, -units, None, None, flags.FLAG_WARNING, None),
            ],
        )
        doc = Document(
            meta=data.new_metadata(filepath, 0),
            date=date,
            account=account,
            filename=f"../docs/Passiva/Crediteuren/{date}.{name}",
            tags=set(),
            links={link},
        )
        return [tx, doc]
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the beangulp command-line interface with a
    # single invoice importer and no hooks.
    hooks = []
    importers = [
        OllamaImporter(account="Passiva:Crediteuren", model="gemma3:4b"),
    ]
    # NOTE: this rebinds the module-level name `main` (also imported at the
    # top of the file) to the Ingest instance before calling it.
    main = Ingest(importers, hooks)
    main()
|