Skip to content

Commit 511e4ea

Browse files
authored
Merge pull request #40 from PascalEgn/fix-is_pdf-function
fix is_pdf function
2 parents 3c758cf + 4fafb8a commit 511e4ea

File tree

1 file changed

+12
-2
lines changed

1 file changed

+12
-2
lines changed

invenio_classifier/extractor.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,18 @@
4040

4141
def is_pdf(document):
4242
"""Check if a document is a PDF file and return True if it is."""
43-
if not executable_exists('pdftotext'):
43+
if executable_exists("pdftotext"):
44+
try:
45+
out = subprocess.Popen(["pdftotext", "-q", document, "-"],
46+
universal_newlines=True,
47+
stdout=subprocess.PIPE)
48+
(stdoutdata, stderrdata) = out.communicate()
49+
if stdoutdata:
50+
return True
51+
except IOError as ex1:
52+
current_app.logger.error("Unable to read from file %s. (%s)"
53+
% (document, ex1.strerror))
54+
else:
4455
current_app.logger.warning(
4556
"GNU file was not found on the system. "
4657
"Switching to a weak file extension test."
@@ -82,7 +93,6 @@ def text_lines_from_local_file(document, remote=False):
8293
current_app.logger.error(
8394
"pdftotext is not available on the system."
8495
)
85-
cmd = "pdftotext -q -enc UTF-8 %s -" % re.escape(document)
8696
out = subprocess.Popen(["pdftotext", "-q", "-enc", "UTF-8",
8797
document, "-"],
8898
universal_newlines=True,

0 commit comments

Comments
 (0)