From f19d9e0877e30670a8d6b87914ec6fc9f27445af Mon Sep 17 00:00:00 2001
From: polyfloyd
Date: Wed, 2 Jul 2025 22:57:13 +0200
Subject: [PATCH] Be more intelligent about PDFs without OCR layer

---
Notes (below the "---" separator, so not part of the commit message):
  * OllamaImporter.identify() still requires the guessed MIME type to be
    application/pdf. It now additionally requires that the strings returned
    by pdf_lines(filepath), joined together, are non-empty. Presumably this
    rejects PDFs that have no text/OCR layer (per the subject line);
    pdf_lines() is defined outside this patch, so confirm against it.
  * scripts/ocr-pdf.sh takes one argument (the input path) and runs ocrmypdf
    with languages eng+nld and tesseract page segmentation mode 11. The
    output is written next to the input as "<name without last extension>-ocr.pdf".
  * Python indentation in the hunk below was reconstructed as 4 spaces per
    level; verify against scripts/import-generic.py before applying.

 scripts/import-generic.py | 4 +++-
 scripts/ocr-pdf.sh        | 6 ++++++
 2 files changed, 9 insertions(+), 1 deletion(-)
 create mode 100755 scripts/ocr-pdf.sh

diff --git a/scripts/import-generic.py b/scripts/import-generic.py
index 47e3a6b..13761c2 100755
--- a/scripts/import-generic.py
+++ b/scripts/import-generic.py
@@ -47,7 +47,9 @@ class OllamaImporter(Importer):
 
     def identify(self, filepath):
         mimetype, encoding = mimetypes.guess_type(filepath)
-        return mimetype == "application/pdf"
+        if mimetype != "application/pdf":
+            return False
+        return "".join(pdf_lines(filepath)) != ""
 
     def account(self, filepath):
         return self.file_account
diff --git a/scripts/ocr-pdf.sh b/scripts/ocr-pdf.sh
new file mode 100755
index 0000000..7dfd520
--- /dev/null
+++ b/scripts/ocr-pdf.sh
@@ -0,0 +1,6 @@
+#!/bin/bash -eux
+
+in=$1
+out=$(dirname "$in")/$(basename -- "$in" ".${in##*.}")-ocr.pdf
+
+ocrmypdf --language eng+nld --tesseract-pagesegmode 11 "$in" "$out"