Skip to content

Commit 96745f2

Browse files
committed
wip: test
1 parent 5deb5a7 commit 96745f2

File tree

3 files changed

+22
-17
lines changed

3 files changed

+22
-17
lines changed
46.7 KB
Binary file not shown.

_test_unstructured_client/unit/test_pdf_utils.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,15 @@ def test_check_pdf_with_valid_pdf():
2323
assert isinstance(result, PdfReader)
2424

2525

26+
# TODO(klaijan)
27+
def test_check_pdf_with_valid_pdf_multipart():
    """Check that the ``valid-multipart-wrapped.pdf`` sample passes ``check_pdf``.

    The sample document is opened through ``_open_pdf`` and handed to
    ``check_pdf``; the test passes when the returned object is a
    ``PdfReader`` instance.
    """
    sample_path = sample_docs_path("valid-multipart-wrapped.pdf")
    opened_pdf = _open_pdf(sample_path)

    checked = check_pdf(opened_pdf)

    assert isinstance(checked, PdfReader)
33+
34+
2635
@pytest.mark.parametrize(
2736
("pdf_name", "expected_error_message"),
2837
[

src/unstructured_client/_hooks/custom/pdf_utils.py

Lines changed: 13 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -49,24 +49,13 @@ def read_pdf(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
4949
else:
5050
raise IOError("Expected bytes or a file-like object with 'read()' method")
5151

52-
# This looks for multipart extraction
53-
try:
54-
msg = BytesParser(policy=cast(Policy, default)).parsebytes(raw)
55-
for part in msg.walk():
56-
if part.get_content_type() == "application/pdf":
57-
pdf_bytes = part.get_payload(decode=True)
58-
if not isinstance(pdf_bytes, bytes):
59-
continue
60-
pdf = PdfReader(io.BytesIO(pdf_bytes), strict=False)
61-
return check_pdf(pdf)
62-
except Exception as e:
63-
pdf_logger.debug("Multipart extraction failed: %s", e)
64-
52+
# breakpoint()
6553
# This looks for %PDF-
6654
try:
6755
start = raw.find(b"%PDF-")
56+
end = raw.find(b"%%EOF") + len(b"%%EOF")
6857
if start != -1:
69-
sliced = raw[start:]
58+
sliced = raw[start:end]
7059
pdf = PdfReader(io.BytesIO(sliced), strict=False)
7160
return check_pdf(pdf)
7261
except Exception as e:
@@ -84,13 +73,20 @@ def read_pdf_raw(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
8473
Returns:
8574
The PdfReader object if the file is a PDF, None otherwise.
8675
"""
87-
8876
try:
8977
if isinstance(pdf_file, bytes):
9078
content = cast(bytes, pdf_file)
9179
pdf_file = io.BytesIO(content)
92-
return PdfReader(pdf_file, strict=False)
93-
except (PdfReadError, UnicodeDecodeError):
80+
reader = PdfReader(pdf_file, strict=False)
81+
return check_pdf(reader)
82+
except (PdfReadError, UnicodeDecodeError) as e:
83+
pdf_logger.debug("Read pdf failed: %s", e)
84+
return None
85+
except PDFValidationError as e:
86+
pdf_logger.debug("Check pdf failed: %s", e)
87+
return None
88+
except Exception as e:
89+
pdf_logger.debug("An unexpected error occurred: %s", e)
9490
return None
9591

9692

0 commit comments

Comments
 (0)