Be more intelligent about PDFs without OCR layer

This commit is contained in:
polyfloyd 2025-07-02 22:57:13 +02:00
parent 0cb21ffae6
commit f19d9e0877
2 changed files with 9 additions and 1 deletions

View file

@ -47,7 +47,9 @@ class OllamaImporter(Importer):
def identify(self, filepath): def identify(self, filepath):
mimetype, encoding = mimetypes.guess_type(filepath) mimetype, encoding = mimetypes.guess_type(filepath)
return mimetype == "application/pdf" if mimetype != "application/pdf":
return False
return "".join(pdf_lines(filepath)) != ""
def account(self, filepath): def account(self, filepath):
return self.file_account return self.file_account

6
scripts/ocr-pdf.sh Executable file
View file

@ -0,0 +1,6 @@
#!/bin/bash
# Add an OCR text layer to a PDF and write the result next to the input
# as "<name>-ocr.pdf".
#
# Usage: ocr-pdf.sh <input.pdf>
#
# The shell options are enabled with `set` instead of on the shebang line so
# that they also take effect when the script is started as
# `bash ocr-pdf.sh ...`, in which case the shebang is never interpreted.
set -eux

# Fail with a readable message instead of the "unbound variable" error that
# `set -u` would otherwise produce for a missing argument.
if [ "$#" -lt 1 ]; then
    echo "usage: $0 <input.pdf>" >&2
    exit 2
fi

in=$1
# Same directory as the input; the final extension of the input is stripped
# and replaced by "-ocr.pdf". `--` keeps a leading dash in the path from
# being parsed as an option by dirname/basename.
out=$(dirname -- "$in")/$(basename -- "$in" ".${in##*.}")-ocr.pdf
# Recognise English and Dutch text; page segmentation mode 11 is Tesseract's
# "sparse text" mode.
ocrmypdf --language eng+nld --tesseract-pagesegmode 11 "$in" "$out"