# boekhouding-beancount/import/pdf.py

import re
from datetime import date

from beancount.core import amount, data, flags
from beancount.core.data import Document, Posting, Transaction
from beancount.core.number import D
from beangulp import Importer, mimetypes
from pypdf import PdfReader


def pdf_lines(filename):
    """Return the extracted text of a PDF as a flat list of lines.

    Pages are processed in document order; the text extracted from each
    page is split on newline characters and appended to the result.
    """
    collected = []
    for page in PdfReader(filename).pages:
        collected.extend(page.extract_text().split("\n"))
    return collected


class MollieInvoiceImporter(Importer):
    """Importer for PDF invoices issued by Mollie B.V.

    Every invoice produces two entries that share the invoice reference as
    a link: a transaction booking the invoice total, and a document entry
    pointing at the location where the PDF is filed.
    """

    def identify(self, filepath):
        """Tell whether *filepath* is a PDF with a line starting "Mollie B.V."."""
        guessed_type = mimetypes.guess_type(filepath)[0]
        if guessed_type == "application/pdf":
            for text in pdf_lines(filepath):
                if text.startswith("Mollie B.V."):
                    return True
        return False

    def account(self, filepath):
        """Return the expense account the invoice is booked on."""
        return "Uitgaven:Bankkosten"

    def tx_ref(self, filepath):
        """Return the ``MOL-...`` invoice reference found in the PDF text.

        Raises:
            Exception: if no line of the PDF carries an invoice reference.
        """
        candidates = (
            re.search(r"^Invoice reference (MOL-.+)$", text)
            for text in pdf_lines(filepath)
        )
        # Match objects are always truthy, so this picks the first hit.
        found = next(filter(None, candidates), None)
        if found is None:
            raise Exception("Mollie invoice reference not found")
        return found[1]

    def filename(self, filepath):
        """Return the name to file the PDF under: the invoice reference plus ".pdf"."""
        return f"{self.tx_ref(filepath)}.pdf"

    def date(self, filepath):
        """Return the invoice date parsed from the "Invoice date dd-mm-yyyy" line.

        Raises:
            Exception: if no line of the PDF carries an invoice date.
        """
        for text in pdf_lines(filepath):
            found = re.search(r"^Invoice date (\d{2})-(\d{2})-(\d{4})$", text)
            if found is None:
                continue
            day, month, year = (int(part) for part in found.groups())
            return date(year, month, day)
        raise Exception("Mollie invoice date not found")

    def extract(self, filepath, existing):
        """Build the transaction and document entries for one invoice.

        The total is read from the "Total including VAT ... EUR" line,
        booked on the expense account and balanced against
        ``Passiva:RevBank``.

        Returns:
            A two-element list: the transaction followed by the document.

        Raises:
            Exception: if the total, reference or date cannot be found.
        """
        total = None
        for text in pdf_lines(filepath):
            found = re.search(r"^Total including VAT ([\d.]+) EUR$", text)
            if found:
                total = D(found[1])
                break
        if total is None:
            raise Exception("Mollie invoice total not found")

        name = self.filename(filepath)
        date = self.date(filepath)
        link = self.tx_ref(filepath)
        units = amount.Amount(total, "EUR")

        doc = Document(
            meta=data.new_metadata(filepath, 0),
            date=date,
            account=self.account(filepath),
            filename=f"docs/Uitgaven/Bankkosten/{date}.{name}",
            tags=set(),
            links={link},
        )

        # Expense leg first, then the balancing leg with the negated amount.
        legs = [
            Posting(
                account=self.account(filepath),
                units=units,
                cost=None,
                price=None,
                flag=None,
                meta=None,
            ),
            Posting(
                account="Passiva:RevBank",
                units=-units,
                cost=None,
                price=None,
                flag=None,
                meta=None,
            ),
        ]
        tx = Transaction(
            meta=data.new_metadata(filepath, 0),
            date=date,
            flag=flags.FLAG_OKAY,
            payee="Mollie B.V.",
            narration="iDeal transactiekosten",
            tags=set(),
            links={link},
            postings=legs,
        )
        return [tx, doc]


class StatiegeldImporter(Importer):
    """Importer that files Statiegeld Nederland PDF invoices as documents.

    Only a ``Document`` entry is produced; no transaction is generated from
    the invoice contents.
    """

    def identify(self, filepath):
        """Return True when *filepath* is a PDF containing a line that starts
        with "Statiegeld Nederland", False otherwise."""
        mimetype, _encoding = mimetypes.guess_type(filepath)
        if mimetype != "application/pdf":
            return False

        return any(
            line.startswith("Statiegeld Nederland") for line in pdf_lines(filepath)
        )

    def account(self, filepath):
        """Return the income account the document is attached to."""
        return "Inkomsten:Statiegeld"

    def tx_ref(self, filepath):
        """Return the invoice reference, prefixed with ``SNL-``.

        The reference is taken from the first "Factuurnr : RP<digits>" line.

        Raises:
            Exception: if no line of the PDF carries an invoice number.
        """
        for line in pdf_lines(filepath):
            if m := re.search(r"^Factuurnr : (RP\d+)$", line):
                return f"SNL-{m[1]}"
        # This message previously named Mollie (copy-paste from the other
        # importer), which pointed at the wrong invoice type on failure.
        raise Exception("Statiegeld Nederland invoice reference not found")

    def filename(self, filepath):
        """Return the name to file the PDF under: the reference plus ".pdf"."""
        return f"{self.tx_ref(filepath)}.pdf"

    def date(self, filepath):
        """Return the invoice date parsed from the "Datum : dd/mm/yyyy" line.

        Raises:
            Exception: if no line of the PDF carries a date.
        """
        for line in pdf_lines(filepath):
            if m := re.search(r"^Datum : (\d{2})/(\d{2})/(\d{4})$", line):
                # Groups are day, month, year; date() wants year, month, day.
                return date(int(m[3]), int(m[2]), int(m[1]))
        raise Exception("Date not found")

    def extract(self, filepath, existing):
        """Build the document entry for one invoice.

        Returns:
            A single-element list holding the ``Document`` entry, linked to
            the invoice reference.

        Raises:
            Exception: if the reference or date cannot be found in the PDF.
        """
        name = self.filename(filepath)
        date = self.date(filepath)
        link = self.tx_ref(filepath)

        doc = Document(
            meta=data.new_metadata(filepath, 0),
            date=date,
            account=self.account(filepath),
            filename=f"docs/Inkomsten/Statiegeld/{date}.{name}",
            tags=set(),
            links={link},
        )
        return [doc]