Skip to content

Commit 0c95c76

Browse files
authored
chore Use pdf library to check file without extension (#184)
### Summary Instead of manually checking whether the filename ends with the `.pdf` extension and returning is_pdf = false when it does not, use the PDF library that is already in use to read the file content and decide whether it is a valid PDF.
1 parent 4bb4369 commit 0c95c76

File tree

3 files changed

+32
-19
lines changed

3 files changed

+32
-19
lines changed

_test_unstructured_client/integration/test_decorators.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
],
3333
)
3434
def test_integration_split_pdf_has_same_output_as_non_split(
35-
concurrency_level: int, filename: str, expected_ok: bool, strategy: str, caplog
35+
concurrency_level: int, filename: str, expected_ok: bool, strategy: str
3636
):
3737
"""
3838
Tests that output that we get from the split-by-page pdf is the same as from non-split.
@@ -74,7 +74,6 @@ def test_integration_split_pdf_has_same_output_as_non_split(
7474
resp_split = client.general.partition(request=req)
7575
except (HTTPValidationError, AttributeError) as exc:
7676
if not expected_ok:
77-
assert "The file does not appear to be a valid PDF." in caplog.text
7877
assert "File does not appear to be a valid PDF" in str(exc)
7978
return
8079
else:

_test_unstructured_client/unit/test_split_pdf_hook.py

Lines changed: 29 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,7 @@ def test_unit_parse_form_data_none_filename_error():
223223

224224

225225
def test_unit_is_pdf_valid_pdf():
226-
"""Test is pdf method returns True for valid pdf file (has .pdf extension and can be read)."""
226+
"""Test is pdf method returns True for valid pdf file with filename."""
227227
filename = "_sample_docs/layout-parser-paper-fast.pdf"
228228

229229
with open(filename, "rb") as f:
@@ -237,28 +237,48 @@ def test_unit_is_pdf_valid_pdf():
237237
assert result is True
238238

239239

240-
def test_unit_is_pdf_invalid_extension(caplog):
240+
def test_unit_is_pdf_valid_pdf_without_file_extension():
241+
"""Test is pdf method returns True for file with valid pdf content without basing on file extension."""
242+
filename = "_sample_docs/layout-parser-paper-fast.pdf"
243+
244+
with open(filename, "rb") as f:
245+
file = shared.Files(
246+
content=f.read(),
247+
file_name="uuid1234",
248+
)
249+
250+
result = pdf_utils.is_pdf(file)
251+
252+
assert result is True
253+
254+
255+
def test_unit_is_pdf_invalid_extension():
241256
"""Test is pdf method returns False for file with invalid extension."""
242257
file = shared.Files(content=b"txt_content", file_name="test_file.txt")
243258

244-
with caplog.at_level(logging.INFO):
245-
result = pdf_utils.is_pdf(file)
259+
result = pdf_utils.is_pdf(file)
246260

247261
assert result is False
248-
assert "Given file doesn't have '.pdf' extension" in caplog.text
249262

250263

251-
def test_unit_is_pdf_invalid_pdf(caplog):
264+
def test_unit_is_pdf_invalid_pdf():
252265
"""Test is pdf method returns False for file with invalid pdf content."""
253266
file = shared.Files(content=b"invalid_pdf_content", file_name="test_file.pdf")
254267

255-
with caplog.at_level(logging.WARNING):
256-
result = pdf_utils.is_pdf(file)
268+
result = pdf_utils.is_pdf(file)
257269

258270
assert result is False
259-
assert "The file does not appear to be a valid PDF." in caplog.text
260271

261272

273+
def test_unit_is_pdf_invalid_pdf_without_file_extension():
274+
"""Test is pdf method returns False for file with invalid pdf content without basing on file extension."""
275+
file = shared.Files(content=b"invalid_pdf_content", file_name="uuid1234")
276+
277+
result = pdf_utils.is_pdf(file)
278+
279+
assert result is False
280+
281+
262282
def test_unit_get_starting_page_number_missing_key():
263283
"""Test _get_starting_page_number method with missing key."""
264284
form_data = {}

src/unstructured_client/_hooks/custom/pdf_utils.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -56,25 +56,19 @@ def get_pdf_pages(
5656
def is_pdf(file: shared.Files) -> bool:
5757
"""Checks if the given file is a PDF.
5858
59-
First it checks the file extension and if it is equal to `.pdf`, then
60-
it tries to read that file. If there is no error then we assume it is a proper PDF.
59+
Tries to read that file. If there is no error then we assume it is a proper PDF.
6160
6261
Args:
6362
file: The file to be checked.
6463
6564
Returns:
6665
True if the file is a PDF, False otherwise.
6766
"""
68-
if not file.file_name.endswith(".pdf"):
69-
logger.info("Given file doesn't have '.pdf' extension, so splitting is not enabled.")
70-
return False
7167

7268
try:
7369
content = cast(bytes, file.content)
7470
PdfReader(io.BytesIO(content), strict=True)
75-
except (PdfReadError, UnicodeDecodeError) as exc:
76-
logger.error(exc)
77-
logger.warning("The file does not appear to be a valid PDF.")
71+
except (PdfReadError, UnicodeDecodeError):
7872
return False
7973

8074
return True

0 commit comments

Comments
 (0)