# boekhouding-beancount/import/pdf.py

from pypdf import PdfReader
from beangulp import mimetypes, Importer
from beangulp.cache import cache
import re
from datetime import date
from beancount.core.data import Transaction, Posting, Document
from beancount.core import flags
from beancount.core import data
from beancount.core import amount
from beancount.core.number import D


@cache
def pdf_to_text(filename):
    """Return the text extracted from the first page of a PDF file.

    Only page index 0 is read; any further pages are ignored. The function
    is wrapped in beangulp's ``cache`` decorator.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        Whatever ``extract_text()`` yields for the first page.
    """
    return PdfReader(filename).pages[0].extract_text()


class MollieInvoiceImporter(Importer):
    """Importer for PDF invoices issued by Mollie B.V.

    All methods work on the text returned by ``pdf_to_text`` for the given
    file, split into lines on ``"\\n"``.
    """

    def identify(self, filepath):
        """Tell whether ``filepath`` is a Mollie invoice PDF.

        Returns:
            True when the guessed MIME type is ``application/pdf`` and at
            least one text line starts with "Mollie B.V."; False otherwise.
        """
        guessed_type, _ = mimetypes.guess_type(filepath)
        if guessed_type == "application/pdf":
            for text_line in pdf_to_text(filepath).split("\n"):
                if text_line.startswith("Mollie B.V."):
                    return True
        return False

    def account(self, filepath):
        """Return the account the invoice is booked on (a fixed value)."""
        return "Uitgaven:Bankkosten"

    def tx_ref(self, filepath):
        """Return the invoice reference (``MOL-...``) from the PDF text.

        Raises:
            Exception: If no line has the form ``Invoice reference MOL-...``.
        """
        pattern = r"^Invoice reference (MOL-.+)$"
        attempts = (
            re.search(pattern, text_line)
            for text_line in pdf_to_text(filepath).split("\n")
        )
        # The first line that matches wins, as in a top-to-bottom scan.
        hit = next((found for found in attempts if found), None)
        if hit is None:
            raise Exception("Mollie invoice reference not found")
        return hit.group(1)

    def filename(self, filepath):
        """Return the document file name: the invoice reference plus ``.pdf``."""
        reference = self.tx_ref(filepath)
        return f"{reference}.pdf"

    def date(self, filepath):
        """Return the invoice date parsed from an ``Invoice date DD-MM-YYYY`` line.

        Raises:
            Exception: If no line carries an invoice date in that format.
        """
        pattern = r"^Invoice date (\d{2})-(\d{2})-(\d{4})$"
        for text_line in pdf_to_text(filepath).split("\n"):
            found = re.search(pattern, text_line)
            if found is None:
                continue
            day, month, year = (int(part) for part in found.groups())
            # ``date`` here is the module-level datetime.date class.
            return date(year, month, day)
        raise Exception("Mollie invoice date not found")

    def extract(self, filepath, existing):
        """Build the entries for one Mollie invoice.

        The invoice total is read from the ``Total including VAT <n> EUR``
        line and booked against "Passiva:RevBank".

        Args:
            filepath: Path of the invoice PDF.
            existing: Not used by this importer.

        Returns:
            A list ``[transaction, document]``; both carry the invoice
            reference as their only link.

        Raises:
            Exception: If the total, the date or the reference is missing.
        """
        total_pattern = r"^Total including VAT ([\d.]+) EUR$"
        attempts = (
            re.search(total_pattern, text_line)
            for text_line in pdf_to_text(filepath).split("\n")
        )
        total_hit = next((found for found in attempts if found), None)
        if total_hit is None:
            raise Exception("Mollie invoice total not found")
        total = D(total_hit.group(1))

        contra_account = "Passiva:RevBank"

        document_name = self.filename(filepath)
        invoice_date = self.date(filepath)
        reference = self.tx_ref(filepath)

        invoice_amount = amount.Amount(total, "EUR")

        document = Document(
            meta=data.new_metadata(filepath, 0),
            date=invoice_date,
            account=self.account(filepath),
            filename=f"docs/Uitgaven/Bankkosten/{invoice_date}.{document_name}",
            tags=set(),
            links={reference},
        )

        # Two legs: the cost on this importer's account, the negated amount
        # on the contra account.
        legs = [
            Posting(
                account=self.account(filepath),
                units=invoice_amount,
                cost=None,
                price=None,
                flag=None,
                meta=None,
            ),
            Posting(
                account=contra_account,
                units=-invoice_amount,
                cost=None,
                price=None,
                flag=None,
                meta=None,
            ),
        ]
        transaction = Transaction(
            meta=data.new_metadata(filepath, 0),
            date=invoice_date,
            flag=flags.FLAG_OKAY,
            payee="Mollie B.V.",
            narration="iDeal transactiekosten",
            tags=set(),
            links={reference},
            postings=legs,
        )
        return [transaction, document]


class StatiegeldImporter(Importer):
    """Importer for PDF invoices from Statiegeld Nederland.

    All methods work on the text returned by ``pdf_to_text`` for the given
    file, split into lines on ``"\\n"``. Only a document entry is produced;
    no transaction is generated.
    """

    def identify(self, filepath):
        """Tell whether ``filepath`` is a Statiegeld Nederland PDF.

        Returns:
            True when the guessed MIME type is ``application/pdf`` and at
            least one text line starts with "Statiegeld Nederland".
        """
        mimetype, encoding = mimetypes.guess_type(filepath)
        if mimetype != "application/pdf":
            return False

        lines = pdf_to_text(filepath).split("\n")
        return any(line.startswith("Statiegeld Nederland") for line in lines)

    def account(self, filepath):
        """Return the account the document is filed under (a fixed value)."""
        return "Inkomsten:Statiegeld"

    def tx_ref(self, filepath):
        """Return the invoice reference, prefixed with ``SNL-``.

        The number is taken from the ``Factuurnr : RP<digits>`` line.

        Raises:
            Exception: If no line carries an invoice number in that format.
        """
        lines = pdf_to_text(filepath).split("\n")
        for line in lines:
            if m := re.search(r"^Factuurnr : (RP\d+)$", line):
                return f"SNL-{m[1]}"
        # This message used to name Mollie (copy-paste from the other
        # importer), which pointed at the wrong supplier when it fired.
        raise Exception("Statiegeld invoice reference not found")

    def filename(self, filepath):
        """Return the document file name: the invoice reference plus ``.pdf``."""
        return f"{self.tx_ref(filepath)}.pdf"

    def date(self, filepath):
        """Return the invoice date parsed from a ``Datum : DD/MM/YYYY`` line.

        Raises:
            Exception: If no line carries a date in that format.
        """
        lines = pdf_to_text(filepath).split("\n")
        for line in lines:
            if m := re.search(r"^Datum : (\d{2})/(\d{2})/(\d{4})$", line):
                # ``date`` here is the module-level datetime.date class.
                return date(int(m[3]), int(m[2]), int(m[1]))
        raise Exception("Date not found")

    def extract(self, filepath, existing):
        """Build the document entry for one Statiegeld invoice.

        Args:
            filepath: Path of the invoice PDF.
            existing: Not used by this importer.

        Returns:
            A one-element list holding the ``Document`` entry, linked by
            the invoice reference.

        Raises:
            Exception: If the reference or the date cannot be found.
        """
        name = self.filename(filepath)
        # Named ``invoice_date`` so the imported ``date`` class is not
        # shadowed inside this method.
        invoice_date = self.date(filepath)
        link = self.tx_ref(filepath)

        doc = Document(
            meta=data.new_metadata(filepath, 0),
            date=invoice_date,
            account=self.account(filepath),
            filename=f"docs/Inkomsten/Statiegeld/{invoice_date}.{name}",
            tags=set(),
            links={link},
        )
        return [doc]