Add declaraties
This commit is contained in:
parent
6d00df1225
commit
a75c51ec42
38 changed files with 485 additions and 23 deletions
167
scripts/import-generic.py
Executable file
167
scripts/import-generic.py
Executable file
|
@ -0,0 +1,167 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime, timedelta
|
||||
from locale import LC_TIME, setlocale
|
||||
from os.path import basename
|
||||
|
||||
from beancount.core import amount, data, flags
|
||||
from beancount.core.data import Document, Posting, Transaction
|
||||
from beancount.core.number import ZERO, D
|
||||
from beangulp import Importer, Ingest, mimetypes
|
||||
from beangulp.testing import main
|
||||
from cachier import set_global_params as set_cachier_params, cachier
|
||||
from ollama import chat
|
||||
from pypdf import PdfReader
|
||||
|
||||
# Global settings for every @cachier()-decorated function in this script:
# cache files go under ~/.cache/beangulp and entries are treated as stale
# after three days.
set_cachier_params(
    separate_files=True,
    stale_after=timedelta(days=3),
    cache_dir="~/.cache/beangulp",
)
|
||||
|
||||
|
||||
@cachier()
def pdf_lines(filename):
    """Return the text of the PDF at *filename* as a flat list of lines.

    Pages are visited in order and the extracted text of each page is split
    on newline characters. The function is wrapped by ``cachier``.
    """
    collected = []
    for page in PdfReader(filename).pages:
        collected.extend(page.extract_text().split("\n"))
    return collected
|
||||
|
||||
|
||||
@cachier()
def pdf_query(filename, model, query):
    """Ask *model* a question about the invoice stored in the PDF *filename*.

    The PDF lines (from ``pdf_lines``) are joined with ``;`` and sent as a
    first user message, followed by *query* as a second user message.

    Returns:
        The content of the message in the chat response.
    """
    invoice_text = ";".join(pdf_lines(filename))
    conversation = [
        {"role": "user", "content": f"Given the text from an invoice:\n{invoice_text}"},
        {"role": "user", "content": query},
    ]
    reply = chat(model=model, messages=conversation)
    return reply.message.content
|
||||
|
||||
|
||||
class OllamaImporter(Importer):
    """Importer that reads invoice details from PDF files through an Ollama model.

    Every field (date, invoice ID, supplier, product description, total) is
    obtained by passing the PDF and a one-line question to ``pdf_query`` and
    normalising the free-form answer.
    """

    # strptime formats tried in order on the normalised ``a-b-c`` date answer.
    _DATE_FORMATS = (
        "%Y-%m-%d",
        "%d-%m-%Y",
        "%d-%b-%Y",
        "%Y-%b-%d",
        "%d-%B-%Y",
        "%Y-%B-%d",
        "%b-%d-%Y",
        "%B-%d-%Y",
    )

    def __init__(self, account, model):
        """Remember the target account and the model name.

        Args:
            account: Account used for the posting and for the document entry.
            model: Model name forwarded to ``pdf_query`` for every question.
        """
        self.file_account = account
        self.model = model

    def identify(self, filepath):
        """Return True when the MIME type guessed for *filepath* is PDF."""
        mimetype, _ = mimetypes.guess_type(filepath)
        return mimetype == "application/pdf"

    def account(self, filepath):
        """Return the account given at construction time, for any file."""
        return self.file_account

    def filename(self, filepath):
        """Return ``<SUPPLIER>_<basename of filepath>`` as the document name."""
        return f"{self.supplier_short(filepath)}_{basename(filepath)}"

    def date(self, filepath):
        """Return the invoice date reported by the model as a ``date``.

        The answer is reduced to at most three ``-``-separated fields and
        matched against ``_DATE_FORMATS``. Month names are parsed under the
        ``nl_NL.UTF-8`` locale when it is available; if it is not installed
        the currently active locale is used instead of failing. The LC_TIME
        setting that was active on entry is restored afterwards.

        If no format matches, a message is written to stderr and today's
        date is returned.
        """
        from locale import Error as LocaleError

        # gemma3:12b - 10/10, but format varies
        resp = pdf_query(filepath, self.model, "Output only the date of this invoice")
        resp = resp.strip()
        resp = re.sub(r"[,./ \n]+", "-", resp)
        resp = "-".join(resp.split("-")[:3])

        # Query (not set) the current LC_TIME so it can be put back exactly.
        previous = setlocale(LC_TIME)
        try:
            try:
                setlocale(LC_TIME, "nl_NL.UTF-8")
            except LocaleError:
                # Locale not installed: keep going, numeric formats still parse.
                pass
            for fmt in self._DATE_FORMATS:
                try:
                    return datetime.strptime(resp, fmt).date()
                except ValueError:
                    continue
        finally:
            setlocale(LC_TIME, previous)
        print(f"could not parse date from: {resp}", file=sys.stderr)
        return datetime.now().date()

    def tx_ref(self, filepath):
        """Return ``<SUPPLIER>_<invoice id>`` for use as a link.

        Characters other than ASCII letters, digits, ``_`` and ``-`` are
        removed from the invoice ID answered by the model.
        """
        # gemma3:12b - 5/10
        resp = pdf_query(filepath, self.model, "Output only the ID of this invoice")
        resp = re.sub(r"[^a-zA-Z0-9_-]", "", resp.strip())
        return f"{self.supplier_short(filepath)}_{resp}"

    def supplier(self, filepath):
        """Return the first line of the model's answer for the supplier name."""
        # gemma3:12b - 10/10
        resp = pdf_query(
            filepath,
            self.model,
            "Output only the name of the supplier of this invoice",
        )
        return resp.split("\n", 1)[0].strip()

    def supplier_short(self, filepath):
        """Return the supplier name without spaces, cut to 8 chars, upper-cased."""
        s = self.supplier(filepath)
        return s.replace(" ", "")[:8].strip().upper()

    @staticmethod
    def _parse_total(text):
        """Convert the model's total-amount answer into a Decimal.

        Surrounding whitespace and a leading ``€`` are dropped and a comma is
        accepted as decimal mark. When both ``.`` and ``,`` occur, the
        right-most one is taken as the decimal mark and the other is treated
        as a thousands separator (``1.234,56`` and ``1,234.56`` both give
        ``1234.56``).

        Raises:
            ValueError: if ``D`` cannot parse the cleaned text.
        """
        cleaned = text.strip().lstrip("€ ")
        if "," in cleaned and "." in cleaned:
            if cleaned.rfind(",") > cleaned.rfind("."):
                cleaned = cleaned.replace(".", "")
            else:
                cleaned = cleaned.replace(",", "")
        return D(cleaned.replace(",", "."))

    def extract(self, filepath, existing):
        """Build a transaction and a document entry for the invoice PDF.

        Args:
            filepath: Path of the PDF invoice.
            existing: Not used by this importer.

        Returns:
            ``[Transaction, Document]``. The transaction carries one posting
            on ``self.account(filepath)`` for the negated total in EUR,
            flagged with ``FLAG_WARNING``. A total that cannot be parsed is
            reported on stderr and booked as zero.
        """
        name = self.filename(filepath)
        date = self.date(filepath)
        link = self.tx_ref(filepath)

        narration = pdf_query(
            filepath,
            self.model,
            "Output only a single line describing the products of this invoice",
        )
        narration = narration.split("\n", 1)[0].strip()

        supplier = self.supplier(filepath)

        total_amount_str = pdf_query(
            filepath,
            self.model,
            "Output only the total amount to be paid of this invoice",
        )
        try:
            total_amount = self._parse_total(total_amount_str)
        except (ValueError, ArithmeticError):
            # Keep the best-effort zero amount, but make the failure visible.
            print(
                f"could not parse total amount from: {total_amount_str}",
                file=sys.stderr,
            )
            total_amount = ZERO
        units = amount.Amount(total_amount, "EUR")

        account = self.account(filepath)

        tx = Transaction(
            meta=data.new_metadata(filepath, 0),
            date=date,
            flag=flags.FLAG_OKAY,
            payee=None,
            narration=f"{supplier}: {narration}",
            tags=set(),
            links={link},
            postings=[
                Posting(account, -units, None, None, flags.FLAG_WARNING, None),
            ],
        )
        doc = Document(
            # Separate metadata dict so that later changes to one entry's
            # metadata cannot leak into the other entry.
            meta=data.new_metadata(filepath, 0),
            date=date,
            account=account,
            filename=f"../docs/Passiva/Crediteuren/{date}.{name}",
            tags=set(),
            links={link},
        )
        return [tx, doc]
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Build the beangulp Ingest application with the single PDF invoice
    # importer and no hooks, then invoke it.
    ingest = Ingest(
        [OllamaImporter("Passiva:Crediteuren", "gemma3:4b")],
        [],
    )
    ingest()
|
9
scripts/ocr-image.sh
Executable file
9
scripts/ocr-image.sh
Executable file
|
@ -0,0 +1,9 @@
|
|||
#!/bin/bash -eux
# Convert an image to PDF with ImageMagick and run ocrmypdf on the result.
# The output is written next to the input, with the extension replaced by .pdf.
#
# Usage: ocr-image.sh IMAGE

in=$1
out=$(dirname -- "$in")/$(basename -- "$in" ".${in##*.}").pdf
tmp=$(mktemp --suffix=.pdf)
# Remove the intermediate PDF on every exit path, including failures of
# magick or ocrmypdf that abort the script because of -e.
trap 'rm -f -- "$tmp"' EXIT

magick "$in" "$tmp"
ocrmypdf "$tmp" "$out"
|
Loading…
Add table
Add a link
Reference in a new issue