Skip to content

Commit 72ce8ca

Browse files
committed
wip: add tiered fallback for pdf
1 parent 1e3f9e4 commit 72ce8ca

File tree

1 file changed

+58
-0
lines changed

1 file changed

+58
-0
lines changed

src/unstructured_client/_hooks/custom/pdf_utils.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
import logging
55
from typing import cast, Optional, BinaryIO, Union
66

7+
from email.parser import BytesParser
8+
from email.policy import default
79
from pypdf import PdfReader
810
from pypdf.errors import FileNotDecryptedError, PdfReadError
911

@@ -26,6 +28,51 @@ def __init__(self, message: str):
2628

2729

2830
def read_pdf(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
    """Read a PDF, falling back to more lenient strategies if parsing fails.

    The strategies are tried in order and the first one that yields a reader
    wins:

    1. ``read_pdf_raw`` on the input as given.
    2. Treat the raw bytes as a MIME message and return the first
       ``application/pdf`` part that passes ``_check_pdf_bytes``.
    3. Locate the first ``%PDF-`` marker in the raw bytes and parse everything
       from there on with a non-strict ``PdfReader``.

    Args:
        pdf_file: The PDF content, either as ``bytes`` or as a seekable
            binary file-like object exposing ``read()``.

    Returns:
        A ``PdfReader`` if any strategy succeeded, otherwise ``None``.

    Raises:
        IOError: If ``pdf_file`` is file-like but seeking/reading it fails.
        TypeError: If ``pdf_file`` is neither ``bytes`` nor file-like.
    """
    reader = read_pdf_raw(pdf_file=pdf_file)
    if reader:
        return reader

    log = logging.getLogger(__name__)
    log.debug("read_pdf_raw returned no reader; trying fallback strategies")

    # Load the raw bytes so the fallbacks can inspect the payload directly.
    if isinstance(pdf_file, bytes):
        raw = pdf_file
    elif hasattr(pdf_file, "read"):
        try:
            # Rewind first: the earlier parse attempt may have moved the cursor.
            pdf_file.seek(0)
            raw = pdf_file.read()
        except Exception as e:
            raise IOError(f"Failed to read file stream: {e}") from e
    else:
        raise TypeError("Expected bytes or a file-like object with 'read()' method")

    # Fallback: the payload may be a MIME/multipart envelope wrapping the PDF.
    try:
        msg = BytesParser(policy=default).parsebytes(raw)
        for part in msg.walk():
            if part.get_content_type() != "application/pdf":
                continue
            pdf_bytes = part.get_payload(decode=True)
            if not pdf_bytes:
                continue
            candidate = _check_pdf_bytes(pdf_bytes)
            if candidate is not None:
                return candidate
    except Exception as e:  # best-effort tier: fall through to the next one
        log.debug("Multipart PDF extraction failed: %s", e)

    # Fallback: skip any leading junk before the PDF header and parse leniently.
    # (A strict pre-check here would be redundant: the lenient reader is what
    # gets returned whether or not the strict parse succeeds.)
    try:
        start = raw.find(b"%PDF-")
        if start != -1:
            return PdfReader(io.BytesIO(raw[start:]), strict=False)
    except Exception as e:  # best-effort tier: report "no reader" to the caller
        log.debug("Lenient parse from %%PDF- marker failed: %s", e)

    return None
74+
75+
def read_pdf_raw(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
2976
"""Reads the given PDF file.
3077
3178
Args:
@@ -73,3 +120,14 @@ def check_pdf(pdf: PdfReader) -> PdfReader:
73120
raise PDFValidationError(
74121
f"File does not appear to be a valid PDF. Error: {e}",
75122
) from e
123+
124+
125+
def _check_pdf_bytes(pdf_bytes) -> Optional[PdfReader]:
    """Strictly parse ``pdf_bytes`` and return a reader only if it is usable.

    Args:
        pdf_bytes: Raw bytes expected to contain a complete PDF document.

    Returns:
        A strict-mode ``PdfReader`` when the bytes parse, the root object is
        accessible and every page can be enumerated; ``None`` otherwise.
    """
    try:
        pdf = PdfReader(io.BytesIO(pdf_bytes), strict=True)
        # Touch the root object and materialize the page list so that parse
        # errors surface here rather than later in the caller.
        _ = pdf.root_object
        list(pdf.pages)
    except Exception as e:
        # Deliberately best-effort: any parse failure means "not a usable PDF".
        # Unlike a bare ``except:``, this lets KeyboardInterrupt/SystemExit
        # propagate.
        logging.getLogger(__name__).debug("Strict PDF validation failed: %s", e)
        return None
    return pdf

0 commit comments

Comments
 (0)