From a48710404bf162081cb4583a6b7cbab76e7cba1e Mon Sep 17 00:00:00 2001 From: Yuming Long Date: Wed, 25 Sep 2024 11:55:48 -0700 Subject: [PATCH 1/5] without forcing file extension --- .../unit/test_split_pdf_hook.py | 31 +++++++++++++++++-- .../_hooks/custom/pdf_utils.py | 6 +--- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/_test_unstructured_client/unit/test_split_pdf_hook.py b/_test_unstructured_client/unit/test_split_pdf_hook.py index 2899f1e2..fd217ed0 100644 --- a/_test_unstructured_client/unit/test_split_pdf_hook.py +++ b/_test_unstructured_client/unit/test_split_pdf_hook.py @@ -223,7 +223,7 @@ def test_unit_parse_form_data_none_filename_error(): def test_unit_is_pdf_valid_pdf(): - """Test is pdf method returns True for valid pdf file (has .pdf extension and can be read).""" + """Test is pdf method returns True for valid pdf file with filename.""" filename = "_sample_docs/layout-parser-paper-fast.pdf" with open(filename, "rb") as f: @@ -237,15 +237,30 @@ def test_unit_is_pdf_valid_pdf(): assert result is True +def test_unit_is_pdf_valid_pdf_without_file_extension(caplog): + """Test is pdf method returns True for file with valid pdf content without basing on file extension.""" + filename = "_sample_docs/layout-parser-paper-fast.pdf" + + with open(filename, "rb") as f: + file = shared.Files( + content=f.read(), + file_name="uuid1234", + ) + + result = pdf_utils.is_pdf(file) + + assert result is True + + def test_unit_is_pdf_invalid_extension(caplog): """Test is pdf method returns False for file with invalid extension.""" file = shared.Files(content=b"txt_content", file_name="test_file.txt") - with caplog.at_level(logging.INFO): + with caplog.at_level(logging.WARNING): result = pdf_utils.is_pdf(file) assert result is False - assert "Given file doesn't have '.pdf' extension" in caplog.text + assert "The file does not appear to be a valid PDF." in caplog.text def test_unit_is_pdf_invalid_pdf(caplog): @@ -258,6 +273,16 @@ def test_unit_is_pdf_invalid_pdf(caplog): assert result is False assert "The file does not appear to be a valid PDF." in caplog.text +def test_unit_is_pdf_invalid_pdf_without_file_extension(caplog): + """Test is pdf method returns False for file with invalid pdf content without basing on file extension.""" + file = shared.Files(content=b"invalid_pdf_content", file_name="uuid1234") + + with caplog.at_level(logging.WARNING): + result = pdf_utils.is_pdf(file) + + assert result is False + assert "The file does not appear to be a valid PDF." in caplog.text + def test_unit_get_starting_page_number_missing_key(): """Test _get_starting_page_number method with missing key.""" diff --git a/src/unstructured_client/_hooks/custom/pdf_utils.py b/src/unstructured_client/_hooks/custom/pdf_utils.py index 589e367b..68ad11e0 100644 --- a/src/unstructured_client/_hooks/custom/pdf_utils.py +++ b/src/unstructured_client/_hooks/custom/pdf_utils.py @@ -56,8 +56,7 @@ def get_pdf_pages( def is_pdf(file: shared.Files) -> bool: """Checks if the given file is a PDF. - First it checks the file extension and if it is equal to `.pdf`, then - it tries to read that file. If there is no error then we assume it is a proper PDF. + Tries to read that file. If there is no error then we assume it is a proper PDF. Args: file: The file to be checked. @@ -65,9 +64,6 @@ def is_pdf(file: shared.Files) -> bool: Returns: True if the file is a PDF, False otherwise. """ - if not file.file_name.endswith(".pdf"): - logger.info("Given file doesn't have '.pdf' extension, so splitting is not enabled.") - return False try: content = cast(bytes, file.content) From 0a7370abe52bcc3496818c835f216fbb439da778 Mon Sep 17 00:00:00 2001 From: Yuming Long Date: Thu, 26 Sep 2024 15:07:33 -0700 Subject: [PATCH 2/5] remove unnecessary log --- .../unit/test_split_pdf_hook.py | 21 +++++++------------ .../_hooks/custom/pdf_utils.py | 4 +--- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/_test_unstructured_client/unit/test_split_pdf_hook.py b/_test_unstructured_client/unit/test_split_pdf_hook.py index fd217ed0..fa3dfc4b 100644 --- a/_test_unstructured_client/unit/test_split_pdf_hook.py +++ b/_test_unstructured_client/unit/test_split_pdf_hook.py @@ -237,7 +237,7 @@ def test_unit_is_pdf_valid_pdf(): assert result is True -def test_unit_is_pdf_valid_pdf_without_file_extension(caplog): +def test_unit_is_pdf_valid_pdf_without_file_extension(): """Test is pdf method returns True for file with valid pdf content without basing on file extension.""" filename = "_sample_docs/layout-parser-paper-fast.pdf" @@ -252,36 +252,31 @@ def test_unit_is_pdf_valid_pdf_without_file_extension(caplog): assert result is True -def test_unit_is_pdf_invalid_extension(caplog): +def test_unit_is_pdf_invalid_extension(): """Test is pdf method returns False for file with invalid extension.""" file = shared.Files(content=b"txt_content", file_name="test_file.txt") - with caplog.at_level(logging.WARNING): - result = pdf_utils.is_pdf(file) + result = pdf_utils.is_pdf(file) assert result is False - assert "The file does not appear to be a valid PDF." in caplog.text -def test_unit_is_pdf_invalid_pdf(caplog): +def test_unit_is_pdf_invalid_pdf(): """Test is pdf method returns False for file with invalid pdf content.""" file = shared.Files(content=b"invalid_pdf_content", file_name="test_file.pdf") - with caplog.at_level(logging.WARNING): - result = pdf_utils.is_pdf(file) + result = pdf_utils.is_pdf(file) assert result is False - assert "The file does not appear to be a valid PDF." in caplog.text -def test_unit_is_pdf_invalid_pdf_without_file_extension(caplog): + +def test_unit_is_pdf_invalid_pdf_without_file_extension(): """Test is pdf method returns False for file with invalid pdf content without basing on file extension.""" file = shared.Files(content=b"invalid_pdf_content", file_name="uuid1234") - with caplog.at_level(logging.WARNING): - result = pdf_utils.is_pdf(file) + result = pdf_utils.is_pdf(file) assert result is False - assert "The file does not appear to be a valid PDF." in caplog.text def test_unit_get_starting_page_number_missing_key(): diff --git a/src/unstructured_client/_hooks/custom/pdf_utils.py b/src/unstructured_client/_hooks/custom/pdf_utils.py index 68ad11e0..288cbd6c 100644 --- a/src/unstructured_client/_hooks/custom/pdf_utils.py +++ b/src/unstructured_client/_hooks/custom/pdf_utils.py @@ -68,9 +68,7 @@ def is_pdf(file: shared.Files) -> bool: try: content = cast(bytes, file.content) PdfReader(io.BytesIO(content), strict=True) - except (PdfReadError, UnicodeDecodeError) as exc: - logger.error(exc) - logger.warning("The file does not appear to be a valid PDF.") + except (PdfReadError, UnicodeDecodeError): return False return True From 44c7a0ecbe925f9b5ba8679c434e3eb998564a62 Mon Sep 17 00:00:00 2001 From: Yuming Long Date: Thu, 26 Sep 2024 15:19:38 -0700 Subject: [PATCH 3/5] move log to info --- src/unstructured_client/_hooks/custom/pdf_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/unstructured_client/_hooks/custom/pdf_utils.py b/src/unstructured_client/_hooks/custom/pdf_utils.py index 288cbd6c..0be178df 100644 --- a/src/unstructured_client/_hooks/custom/pdf_utils.py +++ b/src/unstructured_client/_hooks/custom/pdf_utils.py @@ -69,6 +69,7 @@ def is_pdf(file: shared.Files) -> bool: content = cast(bytes, file.content) PdfReader(io.BytesIO(content), strict=True) except (PdfReadError, UnicodeDecodeError): + logger.info("Loading PDF failed, so splitting is not enabled.") return False return True From f4a990ceb2202952cb6a877c3886522e175f0e31 Mon Sep 17 00:00:00 2001 From: Yuming Long Date: Thu, 26 Sep 2024 16:18:38 -0700 Subject: [PATCH 4/5] remove log at all --- src/unstructured_client/_hooks/custom/pdf_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/unstructured_client/_hooks/custom/pdf_utils.py b/src/unstructured_client/_hooks/custom/pdf_utils.py index 0be178df..288cbd6c 100644 --- a/src/unstructured_client/_hooks/custom/pdf_utils.py +++ b/src/unstructured_client/_hooks/custom/pdf_utils.py @@ -69,7 +69,6 @@ def is_pdf(file: shared.Files) -> bool: content = cast(bytes, file.content) PdfReader(io.BytesIO(content), strict=True) except (PdfReadError, UnicodeDecodeError): - logger.info("Loading PDF failed, so splitting is not enabled.") return False return True From f5a188cbffc949863c143f95f34955cda74375a7 Mon Sep 17 00:00:00 2001 From: Yuming Long Date: Wed, 2 Oct 2024 15:05:17 -0700 Subject: [PATCH 5/5] remove caplog check in integration test --- _test_unstructured_client/integration/test_decorators.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/_test_unstructured_client/integration/test_decorators.py b/_test_unstructured_client/integration/test_decorators.py index c21fa587..4fc396d0 100644 --- a/_test_unstructured_client/integration/test_decorators.py +++ b/_test_unstructured_client/integration/test_decorators.py @@ -32,7 +32,7 @@ ], ) def test_integration_split_pdf_has_same_output_as_non_split( - concurrency_level: int, filename: str, expected_ok: bool, strategy: str, caplog + concurrency_level: int, filename: str, expected_ok: bool, strategy: str ): """ Tests that output that we get from the split-by-page pdf is the same as from non-split. @@ -74,7 +74,6 @@ def test_integration_split_pdf_has_same_output_as_non_split( resp_split = client.general.partition(request=req) except (HTTPValidationError, AttributeError) as exc: if not expected_ok: - assert "The file does not appear to be a valid PDF." in caplog.text assert "File does not appear to be a valid PDF" in str(exc) return else: