boekhouding-beancount/import/pdf.py

122 lines
3.9 KiB
Python

import re
from datetime import date
from beancount.core import amount, data, flags
from beancount.core.data import Document, Posting, Transaction
from beancount.core.number import D
from beangulp import Importer, mimetypes
from pypdf import PdfReader
def pdf_lines(filename):
    """Return the text lines of every page of the PDF at *filename*.

    The extracted text of each page is split on newline characters and
    the pieces of all pages are collected, in page order, into one flat
    list.
    """
    collected = []
    for page in PdfReader(filename).pages:
        page_text = page.extract_text()
        collected.extend(page_text.split("\n"))
    return collected
class MollieInvoiceImporter(Importer):
    """Import Mollie B.V. invoice PDFs.

    ``extract`` produces a transaction that books the invoice's
    "Total including VAT" amount on ``Uitgaven:Bankkosten`` against
    ``Passiva:RevBank``, plus a document entry for the PDF itself. Both
    entries carry the invoice reference as a link.
    """

    def identify(self, filepath):
        """Return True if *filepath* is a PDF containing a "Mollie B.V." line."""
        mimetype, _encoding = mimetypes.guess_type(filepath)
        if mimetype != "application/pdf":
            return False
        return any(line.startswith("Mollie B.V.") for line in pdf_lines(filepath))

    def account(self, filepath):
        """Return the account the invoice is booked on and filed under."""
        return "Uitgaven:Bankkosten"

    @staticmethod
    def _ref_from_lines(lines):
        """Return the ``MOL-...`` invoice reference found in *lines*.

        Raises:
            ValueError: if no "Invoice reference" line is present.
        """
        for line in lines:
            if m := re.search(r"^Invoice reference (MOL-.+)$", line):
                return m[1]
        raise ValueError("Mollie invoice reference not found")

    @staticmethod
    def _date_from_lines(lines):
        """Return the invoice date (written as DD-MM-YYYY) found in *lines*.

        Raises:
            ValueError: if no "Invoice date" line is present.
        """
        for line in lines:
            if m := re.search(r"^Invoice date (\d{2})-(\d{2})-(\d{4})$", line):
                return date(int(m[3]), int(m[2]), int(m[1]))
        raise ValueError("Mollie invoice date not found")

    @staticmethod
    def _total_from_lines(lines):
        """Return the "Total including VAT" amount in *lines* as a decimal.

        Raises:
            ValueError: if no total line is present.
        """
        for line in lines:
            if m := re.search(r"^Total including VAT ([\d.]+) EUR$", line):
                return D(m[1])
        raise ValueError("Mollie invoice total not found")

    def tx_ref(self, filepath):
        """Return the invoice reference of the PDF at *filepath*."""
        return self._ref_from_lines(pdf_lines(filepath))

    def filename(self, filepath):
        """Return the file name to store the PDF under: ``<reference>.pdf``."""
        return f"{self.tx_ref(filepath)}.pdf"

    def date(self, filepath):
        """Return the invoice date of the PDF at *filepath*."""
        return self._date_from_lines(pdf_lines(filepath))

    def extract(self, filepath, existing):
        """Return ``[transaction, document]`` for the invoice at *filepath*.

        Raises:
            ValueError: if the total, reference or date cannot be found.
        """
        # Parse the PDF once; all fields below are read from the same lines.
        lines = pdf_lines(filepath)
        total = self._total_from_lines(lines)
        link = self._ref_from_lines(lines)
        invoice_date = self._date_from_lines(lines)

        contra = "Passiva:RevBank"
        name = f"{link}.pdf"
        expense_account = self.account(filepath)
        units = amount.Amount(total, "EUR")

        doc = Document(
            meta=data.new_metadata(filepath, 0),
            date=invoice_date,
            account=expense_account,
            filename=f"docs/Uitgaven/Bankkosten/{invoice_date}.{name}",
            tags=set(),
            links={link},
        )
        tx = Transaction(
            meta=data.new_metadata(filepath, 0),
            date=invoice_date,
            flag=flags.FLAG_OKAY,
            payee="Mollie B.V.",
            narration="iDeal transactiekosten",
            tags=set(),
            links={link},
            postings=[
                Posting(expense_account, units, None, None, None, None),
                Posting(contra, -units, None, None, None, None),
            ],
        )
        return [tx, doc]
class StatiegeldImporter(Importer):
    """Import Statiegeld Nederland invoice PDFs.

    ``extract`` produces only a document entry that files the PDF under
    ``Inkomsten:Statiegeld``, linked by the invoice reference.
    """

    def identify(self, filepath):
        """Return True if *filepath* is a PDF with a "Statiegeld Nederland" line."""
        mimetype, _encoding = mimetypes.guess_type(filepath)
        if mimetype != "application/pdf":
            return False
        return any(
            line.startswith("Statiegeld Nederland") for line in pdf_lines(filepath)
        )

    def account(self, filepath):
        """Return the account the invoice is filed under."""
        return "Inkomsten:Statiegeld"

    @staticmethod
    def _ref_from_lines(lines):
        """Return ``SNL-<invoice number>`` built from the "Factuurnr" line.

        Raises:
            ValueError: if no "Factuurnr" line is present.
        """
        for line in lines:
            if m := re.search(r"^Factuurnr : (RP\d+)$", line):
                return f"SNL-{m[1]}"
        raise ValueError("Statiegeld invoice reference not found")

    @staticmethod
    def _date_from_lines(lines):
        """Return the invoice date (written as DD/MM/YYYY) found in *lines*.

        Raises:
            ValueError: if no "Datum" line is present.
        """
        for line in lines:
            if m := re.search(r"^Datum : (\d{2})/(\d{2})/(\d{4})$", line):
                return date(int(m[3]), int(m[2]), int(m[1]))
        raise ValueError("Date not found")

    def tx_ref(self, filepath):
        """Return the ``SNL-...`` reference of the PDF at *filepath*."""
        return self._ref_from_lines(pdf_lines(filepath))

    def filename(self, filepath):
        """Return the file name to store the PDF under: ``<reference>.pdf``."""
        return f"{self.tx_ref(filepath)}.pdf"

    def date(self, filepath):
        """Return the invoice date of the PDF at *filepath*."""
        return self._date_from_lines(pdf_lines(filepath))

    def extract(self, filepath, existing):
        """Return ``[document]`` for the invoice at *filepath*.

        Raises:
            ValueError: if the reference or date cannot be found.
        """
        # Parse the PDF once; both fields are read from the same lines.
        lines = pdf_lines(filepath)
        link = self._ref_from_lines(lines)
        invoice_date = self._date_from_lines(lines)
        name = f"{link}.pdf"

        doc = Document(
            meta=data.new_metadata(filepath, 0),
            date=invoice_date,
            account=self.account(filepath),
            filename=f"docs/Inkomsten/Statiegeld/{invoice_date}.{name}",
            tags=set(),
            links={link},
        )
        return [doc]