# Patch summary
#   scripts/import-generic.py: OllamaImporter.identify() now accepts a file
#     only if it is a PDF and pdf_lines() yields some text for it.
#   scripts/ocr-pdf.sh: new helper that runs ocrmypdf on FILE and writes
#     "<name>-ocr.pdf" next to it.
#
# NOTE(review): the post-image blob ids on the "index" lines are those of the
# originally submitted revision; regenerate the patch with `git diff` if exact
# ids matter (plain `git apply` / `patch -p1` do not check them).
diff --git a/scripts/import-generic.py b/scripts/import-generic.py
index 47e3a6b..13761c2 100755
--- a/scripts/import-generic.py
+++ b/scripts/import-generic.py
@@ -47,7 +47,11 @@ class OllamaImporter(Importer):
 
     def identify(self, filepath):
         mimetype, encoding = mimetypes.guess_type(filepath)
-        return mimetype == "application/pdf"
+        if mimetype != "application/pdf":
+            return False
+        # Only claim PDFs for which pdf_lines() yields some text. any() stops
+        # at the first non-empty line instead of joining the whole document.
+        return any(pdf_lines(filepath))
 
     def account(self, filepath):
         return self.file_account
diff --git a/scripts/ocr-pdf.sh b/scripts/ocr-pdf.sh
new file mode 100755
index 0000000..7dfd520
--- /dev/null
+++ b/scripts/ocr-pdf.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+#
+# Run OCR on a PDF with ocrmypdf, writing "<name>-ocr.pdf" next to the input.
+# Usage: ocr-pdf.sh FILE
+
+# Options are set here rather than on the shebang line so that they also
+# apply when the script is started as `bash ocr-pdf.sh FILE`.
+set -eux
+
+# Abort with a usage message instead of a bare "unbound variable" error.
+in=${1:?usage: ocr-pdf.sh FILE}
+out=$(dirname -- "$in")/$(basename -- "$in" ".${in##*.}")-ocr.pdf
+
+# Languages: English + Dutch; Tesseract page segmentation mode 11 (sparse text).
+ocrmypdf --language eng+nld --tesseract-pagesegmode 11 "$in" "$out"