Skip to content

Commit 5c70d08

Browse files
committed
feat: tiered PDF-parsing fallback that raises PDFValidationError when every attempt fails
1 parent 72ce8ca commit 5c70d08

File tree

1 file changed

+16
-21
lines changed

1 file changed

+16
-21
lines changed

src/unstructured_client/_hooks/custom/pdf_utils.py

Lines changed: 16 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,7 @@ def read_pdf(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
3333
return reader
3434

3535
# TODO(klaijan) - remove once debugged
36-
print("reader is None")
36+
pdf_logger.debug("Initial PdfReader parsing failed, attempting fallbacks.")
3737

3838
# load raw bytes
3939
# case bytes
@@ -55,21 +55,27 @@ def read_pdf(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
5555
for part in msg.walk():
5656
if part.get_content_type() == "application/pdf":
5757
pdf_bytes = part.get_payload(decode=True)
58-
_check_pdf_bytes(pdf_bytes)
59-
except Exception:
60-
# TODO(klaijan)
61-
pass
58+
if not isinstance(pdf_bytes, bytes):
59+
continue
60+
pdf_bytes = cast(bytes, pdf_bytes)
61+
pdf = PdfReader(io.BytesIO(pdf_bytes), strict=False)
62+
return check_pdf(pdf)
63+
except Exception as e:
64+
pdf_logger.debug(f"Multipart extraction failed: {e}")
6265

6366
# look for %PDF-
6467
try:
6568
start = raw.find(b"%PDF-")
6669
if start != -1:
6770
sliced = raw[start:]
68-
_check_pdf_bytes(sliced)
69-
return PdfReader(io.BytesIO(sliced), strict=False)
70-
except Exception:
71-
# TODO(klaijan)
72-
pass
71+
pdf = PdfReader(io.BytesIO(sliced), strict=False)
72+
return check_pdf(pdf)
73+
except Exception as e:
74+
pdf_logger.debug(f"%PDF- slicing fallback failed: {e}")
75+
76+
raise PDFValidationError(
77+
"File does not appear to be a valid PDF after all fallback attempts."
78+
)
7379

7480

7581
def read_pdf_raw(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
@@ -120,14 +126,3 @@ def check_pdf(pdf: PdfReader) -> PdfReader:
120126
raise PDFValidationError(
121127
f"File does not appear to be a valid PDF. Error: {e}",
122128
) from e
123-
124-
125-
def _check_pdf_bytes(pdf_bytes) -> PdfReader:
126-
try:
127-
pdf = PdfReader(io.BytesIO(pdf_bytes), strict=True)
128-
pdf.root_object
129-
list(pdf.pages)
130-
return pdf
131-
except:
132-
# TODO(klaijan) exception
133-
pass

0 commit comments

Comments
 (0)