Be more intelligent about PDFs without OCR layer

2025-07-02 22:57:13 +02:00 · 2025-07-02 22:57:13 +02:00 · f19d9e0877
commit f19d9e0877
parent 0cb21ffae6
2 changed files with 9 additions and 1 deletion
--- a/scripts/import-generic.py
+++ b/scripts/import-generic.py
@@ -47,7 +47,9 @@ class OllamaImporter(Importer):
    def identify(self, filepath):
        mimetype, encoding = mimetypes.guess_type(filepath)
-        return mimetype == "application/pdf"
+        if mimetype != "application/pdf":
+            return False
+        return "".join(pdf_lines(filepath)) != ""
    def account(self, filepath):
        """Return the account stored on this importer.

        ``filepath`` is accepted but not consulted: the result is always
        the value of ``self.file_account``.
        """
        configured_account = self.file_account
        return configured_account
--- a/scripts/ocr-pdf.sh
+++ b/scripts/ocr-pdf.sh
@@ -0,0 +1,6 @@
 #!/bin/bash
 # Run OCR on a PDF and write the result next to it as "<name>-ocr.pdf".
 #
 # Usage: ocr-pdf.sh INPUT.pdf
 #
 # Shell options are enabled with `set` instead of on the shebang line so
 # they also apply when the script is started as `bash ocr-pdf.sh`.
 set -eux
 #
 # Fail with a usage message rather than a bare "unbound variable" error.
 in=${1:?usage: $0 INPUT.pdf}
 #
 # Drop the last extension from the file name and append "-ocr.pdf", keeping
 # the input's directory. `--` stops a name starting with "-" from being
 # parsed as an option by dirname/basename.
 out=$(dirname -- "$in")/$(basename -- "$in" ".${in##*.}")-ocr.pdf
 #
 # OCR using the English and Dutch language data; Tesseract page segmentation
 # mode 11 is "sparse text". `--` ends option parsing before the two paths.
 ocrmypdf --language eng+nld --tesseract-pagesegmode 11 -- "$in" "$out"