Be more intelligent about PDFs without OCR layer
This commit is contained in:
parent
0cb21ffae6
commit
f19d9e0877
2 changed files with 9 additions and 1 deletion
|
@ -47,7 +47,9 @@ class OllamaImporter(Importer):
|
||||||
|
|
||||||
def identify(self, filepath):
|
def identify(self, filepath):
|
||||||
mimetype, encoding = mimetypes.guess_type(filepath)
|
mimetype, encoding = mimetypes.guess_type(filepath)
|
||||||
return mimetype == "application/pdf"
|
if mimetype != "application/pdf":
|
||||||
|
return False
|
||||||
|
return "".join(pdf_lines(filepath)) != ""
|
||||||
|
|
||||||
def account(self, filepath):
|
def account(self, filepath):
|
||||||
return self.file_account
|
return self.file_account
|
||||||
|
|
6
scripts/ocr-pdf.sh
Executable file
6
scripts/ocr-pdf.sh
Executable file
|
@ -0,0 +1,6 @@
|
||||||
|
#!/bin/bash -eux
# Run ocrmypdf on a PDF and write the result next to the input.
#
# Usage: ocr-pdf.sh INPUT
#
# The output path is INPUT's directory plus INPUT's file name with its last
# extension removed and "-ocr.pdf" appended (e.g. dir/scan.pdf ->
# dir/scan-ocr.pdf). The input file itself is passed to ocrmypdf unchanged.

# Under `set -u` a missing argument would abort with a bare "unbound
# variable" error; fail with a usage message instead.
in=${1:?usage: $0 INPUT.pdf}

# `--` mirrors the basename call so a dash-leading path is not parsed as an
# option by dirname either.
out=$(dirname -- "$in")/$(basename -- "$in" ".${in##*.}")-ocr.pdf

# eng+nld: recognise English and Dutch text.
# Page segmentation mode 11 is Tesseract's "sparse text" mode.
ocrmypdf --language eng+nld --tesseract-pagesegmode 11 "$in" "$out"
|
Loading…
Add table
Add a link
Reference in a new issue