import/pdf: Read text from all pages
This commit is contained in:
parent
fe53249bff
commit
5344c5b978
2 changed files with 17 additions and 29 deletions
|
@ -1,20 +1,16 @@
|
||||||
from pypdf import PdfReader
|
|
||||||
from beangulp import mimetypes, Importer
|
|
||||||
from beangulp.cache import cache
|
|
||||||
import re
|
import re
|
||||||
from datetime import date
|
from datetime import date
|
||||||
from beancount.core.data import Transaction, Posting, Document
|
|
||||||
from beancount.core import flags
|
from beancount.core import amount, data, flags
|
||||||
from beancount.core import data
|
from beancount.core.data import Document, Posting, Transaction
|
||||||
from beancount.core import amount
|
|
||||||
from beancount.core.number import D
|
from beancount.core.number import D
|
||||||
|
from beangulp import Importer, mimetypes
|
||||||
|
from pypdf import PdfReader
|
||||||
|
|
||||||
|
|
||||||
@cache
|
def pdf_lines(filename):
|
||||||
def pdf_to_text(filename):
|
|
||||||
reader = PdfReader(filename)
|
reader = PdfReader(filename)
|
||||||
page1 = reader.pages[0]
|
return [line for page in reader.pages for line in page.extract_text().split("\n")]
|
||||||
return page1.extract_text()
|
|
||||||
|
|
||||||
|
|
||||||
class MollieInvoiceImporter(Importer):
|
class MollieInvoiceImporter(Importer):
|
||||||
|
@ -23,15 +19,13 @@ class MollieInvoiceImporter(Importer):
|
||||||
if mimetype != "application/pdf":
|
if mimetype != "application/pdf":
|
||||||
return False
|
return False
|
||||||
|
|
||||||
lines = pdf_to_text(filepath).split("\n")
|
return any(line.startswith("Mollie B.V.") for line in pdf_lines(filepath))
|
||||||
return any(line.startswith("Mollie B.V.") for line in lines)
|
|
||||||
|
|
||||||
def account(self, filepath):
|
def account(self, filepath):
|
||||||
return "Uitgaven:Bankkosten"
|
return "Uitgaven:Bankkosten"
|
||||||
|
|
||||||
def tx_ref(self, filepath):
|
def tx_ref(self, filepath):
|
||||||
lines = pdf_to_text(filepath).split("\n")
|
for line in pdf_lines(filepath):
|
||||||
for line in lines:
|
|
||||||
if m := re.search(r"^Invoice reference (MOL-.+)$", line):
|
if m := re.search(r"^Invoice reference (MOL-.+)$", line):
|
||||||
return m[1]
|
return m[1]
|
||||||
raise Exception("Mollie invoice reference not found")
|
raise Exception("Mollie invoice reference not found")
|
||||||
|
@ -40,15 +34,13 @@ class MollieInvoiceImporter(Importer):
|
||||||
return f"{self.tx_ref(filepath)}.pdf"
|
return f"{self.tx_ref(filepath)}.pdf"
|
||||||
|
|
||||||
def date(self, filepath):
|
def date(self, filepath):
|
||||||
lines = pdf_to_text(filepath).split("\n")
|
for line in pdf_lines(filepath):
|
||||||
for line in lines:
|
|
||||||
if m := re.search(r"^Invoice date (\d{2})-(\d{2})-(\d{4})$", line):
|
if m := re.search(r"^Invoice date (\d{2})-(\d{2})-(\d{4})$", line):
|
||||||
return date(int(m[3]), int(m[2]), int(m[1]))
|
return date(int(m[3]), int(m[2]), int(m[1]))
|
||||||
raise Exception("Mollie invoice date not found")
|
raise Exception("Mollie invoice date not found")
|
||||||
|
|
||||||
def extract(self, filepath, existing):
|
def extract(self, filepath, existing):
|
||||||
lines = pdf_to_text(filepath).split("\n")
|
for line in pdf_lines(filepath):
|
||||||
for line in lines:
|
|
||||||
if m := re.search(r"^Total including VAT ([\d.]+) EUR$", line):
|
if m := re.search(r"^Total including VAT ([\d.]+) EUR$", line):
|
||||||
total = D(m[1])
|
total = D(m[1])
|
||||||
break
|
break
|
||||||
|
@ -92,15 +84,15 @@ class StatiegeldImporter(Importer):
|
||||||
if mimetype != "application/pdf":
|
if mimetype != "application/pdf":
|
||||||
return False
|
return False
|
||||||
|
|
||||||
lines = pdf_to_text(filepath).split("\n")
|
return any(
|
||||||
return any(line.startswith("Statiegeld Nederland") for line in lines)
|
line.startswith("Statiegeld Nederland") for line in pdf_lines(filepath)
|
||||||
|
)
|
||||||
|
|
||||||
def account(self, filepath):
|
def account(self, filepath):
|
||||||
return "Inkomsten:Statiegeld"
|
return "Inkomsten:Statiegeld"
|
||||||
|
|
||||||
def tx_ref(self, filepath):
|
def tx_ref(self, filepath):
|
||||||
lines = pdf_to_text(filepath).split("\n")
|
for line in pdf_lines(filepath):
|
||||||
for line in lines:
|
|
||||||
if m := re.search(r"^Factuurnr : (RP\d+)$", line):
|
if m := re.search(r"^Factuurnr : (RP\d+)$", line):
|
||||||
return f"SNL-{m[1]}"
|
return f"SNL-{m[1]}"
|
||||||
raise Exception("Mollie invoice reference not found")
|
raise Exception("Mollie invoice reference not found")
|
||||||
|
@ -109,8 +101,7 @@ class StatiegeldImporter(Importer):
|
||||||
return f"{self.tx_ref(filepath)}.pdf"
|
return f"{self.tx_ref(filepath)}.pdf"
|
||||||
|
|
||||||
def date(self, filepath):
|
def date(self, filepath):
|
||||||
lines = pdf_to_text(filepath).split("\n")
|
for line in pdf_lines(filepath):
|
||||||
for line in lines:
|
|
||||||
if m := re.search(r"^Datum : (\d{2})/(\d{2})/(\d{4})$", line):
|
if m := re.search(r"^Datum : (\d{2})/(\d{2})/(\d{4})$", line):
|
||||||
return date(int(m[3]), int(m[2]), int(m[1]))
|
return date(int(m[3]), int(m[2]), int(m[1]))
|
||||||
raise Exception("Date not found")
|
raise Exception("Date not found")
|
||||||
|
|
|
@ -22,12 +22,9 @@ set_cachier_params(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@cachier()
|
|
||||||
def pdf_lines(filename):
|
def pdf_lines(filename):
|
||||||
reader = PdfReader(filename)
|
reader = PdfReader(filename)
|
||||||
return list(
|
return [line for page in reader.pages for line in page.extract_text().split("\n")]
|
||||||
line for page in reader.pages for line in page.extract_text().split("\n")
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@cachier()
|
@cachier()
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue