diff --git a/_test_unstructured_client/integration/test_decorators.py b/_test_unstructured_client/integration/test_decorators.py
index c21fa587..4fc396d0 100644
--- a/_test_unstructured_client/integration/test_decorators.py
+++ b/_test_unstructured_client/integration/test_decorators.py
@@ -32,7 +32,7 @@
     ],
 )
 def test_integration_split_pdf_has_same_output_as_non_split(
-    concurrency_level: int, filename: str, expected_ok: bool, strategy: str, caplog
+    concurrency_level: int, filename: str, expected_ok: bool, strategy: str
 ):
     """
     Tests that output that we get from the split-by-page pdf is the same as from non-split.
@@ -74,7 +74,6 @@ def test_integration_split_pdf_has_same_output_as_non_split(
         resp_split = client.general.partition(request=req)
     except (HTTPValidationError, AttributeError) as exc:
         if not expected_ok:
-            assert "The file does not appear to be a valid PDF." in caplog.text
             assert "File does not appear to be a valid PDF" in str(exc)
             return
         else:
diff --git a/_test_unstructured_client/unit/test_split_pdf_hook.py b/_test_unstructured_client/unit/test_split_pdf_hook.py
index 2899f1e2..fa3dfc4b 100644
--- a/_test_unstructured_client/unit/test_split_pdf_hook.py
+++ b/_test_unstructured_client/unit/test_split_pdf_hook.py
@@ -223,7 +223,7 @@ def test_unit_parse_form_data_none_filename_error():
 
 
 def test_unit_is_pdf_valid_pdf():
-    """Test is pdf method returns True for valid pdf file (has .pdf extension and can be read)."""
+    """Test is pdf method returns True for valid pdf file with filename."""
     filename = "_sample_docs/layout-parser-paper-fast.pdf"
 
     with open(filename, "rb") as f:
@@ -237,28 +237,48 @@ def test_unit_is_pdf_valid_pdf():
     assert result is True
 
 
-def test_unit_is_pdf_invalid_extension(caplog):
+def test_unit_is_pdf_valid_pdf_without_file_extension():
+    """Test is pdf method returns True for file with valid pdf content without basing on file extension."""
+    filename = "_sample_docs/layout-parser-paper-fast.pdf"
+
+    with open(filename, "rb") as f:
+        file = shared.Files(
+            content=f.read(),
+            file_name="uuid1234",
+        )
+
+    result = pdf_utils.is_pdf(file)
+
+    assert result is True
+
+
+def test_unit_is_pdf_invalid_extension():
     """Test is pdf method returns False for file with invalid extension."""
     file = shared.Files(content=b"txt_content", file_name="test_file.txt")
 
-    with caplog.at_level(logging.INFO):
-        result = pdf_utils.is_pdf(file)
+    result = pdf_utils.is_pdf(file)
 
     assert result is False
-    assert "Given file doesn't have '.pdf' extension" in caplog.text
 
 
-def test_unit_is_pdf_invalid_pdf(caplog):
+def test_unit_is_pdf_invalid_pdf():
     """Test is pdf method returns False for file with invalid pdf content."""
     file = shared.Files(content=b"invalid_pdf_content", file_name="test_file.pdf")
 
-    with caplog.at_level(logging.WARNING):
-        result = pdf_utils.is_pdf(file)
+    result = pdf_utils.is_pdf(file)
 
     assert result is False
-    assert "The file does not appear to be a valid PDF." in caplog.text
 
 
+def test_unit_is_pdf_invalid_pdf_without_file_extension():
+    """Test is pdf method returns False for file with invalid pdf content without basing on file extension."""
+    file = shared.Files(content=b"invalid_pdf_content", file_name="uuid1234")
+
+    result = pdf_utils.is_pdf(file)
+
+    assert result is False
+
+
 def test_unit_get_starting_page_number_missing_key():
     """Test _get_starting_page_number method with missing key."""
     form_data = {}
diff --git a/src/unstructured_client/_hooks/custom/pdf_utils.py b/src/unstructured_client/_hooks/custom/pdf_utils.py
index 589e367b..288cbd6c 100644
--- a/src/unstructured_client/_hooks/custom/pdf_utils.py
+++ b/src/unstructured_client/_hooks/custom/pdf_utils.py
@@ -56,8 +56,7 @@ def get_pdf_pages(
 def is_pdf(file: shared.Files) -> bool:
     """Checks if the given file is a PDF.
 
-    First it checks the file extension and if it is equal to `.pdf`, then
-    it tries to read that file. If there is no error then we assume it is a proper PDF.
+    Tries to read that file. If there is no error then we assume it is a proper PDF.
 
     Args:
         file: The file to be checked.
@@ -65,16 +64,11 @@ def is_pdf(file: shared.Files) -> bool:
     Returns:
         True if the file is a PDF, False otherwise.
     """
-    if not file.file_name.endswith(".pdf"):
-        logger.info("Given file doesn't have '.pdf' extension, so splitting is not enabled.")
-        return False
 
     try:
         content = cast(bytes, file.content)
         PdfReader(io.BytesIO(content), strict=True)
-    except (PdfReadError, UnicodeDecodeError) as exc:
-        logger.error(exc)
-        logger.warning("The file does not appear to be a valid PDF.")
+    except (PdfReadError, UnicodeDecodeError):
         return False
 
     return True