Skip to content

Commit 4de9a67

Browse files
fix: return appropriate error for failing pdf, before sending the request
1 parent c380230 commit 4de9a67

File tree

2 files changed

+50
-1
lines changed

2 files changed

+50
-1
lines changed

src/unstructured_client/_hooks/custom/pdf_utils.py

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from typing import cast, Optional, BinaryIO, Union
66

77
from pypdf import PdfReader
8-
from pypdf.errors import PdfReadError
8+
from pypdf.errors import FileNotDecryptedError, PdfReadError
99

1010
from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME
1111

@@ -16,6 +16,15 @@
1616
pdf_logger = logging.getLogger("pypdf")
1717
pdf_logger.setLevel(logging.ERROR)
1818

19+
20+
class PDFValidationError(Exception):
    """Base exception for PDF validation errors.

    Attributes:
        message: Human-readable description of why validation failed. The
            same text is passed to ``Exception`` and is therefore what
            ``str(error)`` returns.
    """

    def __init__(self, message: str):
        # Initialise the base Exception first so ``args`` carries the message,
        # then expose it as a named attribute for callers that read it directly.
        super().__init__(message)
        self.message = message
26+
27+
1928
def read_pdf(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
2029
"""Reads the given PDF file.
2130
@@ -33,3 +42,34 @@ def read_pdf(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
3342
return PdfReader(pdf_file, strict=False)
3443
except (PdfReadError, UnicodeDecodeError):
3544
return None
45+
46+
47+
def check_pdf(pdf: PdfReader) -> PdfReader:
    """Validate that an already-parsed PDF can actually be read.

    Touches the document metadata, the root object and every page so that
    problems surface here instead of later in processing. The checks cover:
    - Encrypted files
    - Corrupted root object
    - Corrupted pages

    Args:
        pdf: The reader to validate.

    Returns:
        The same ``pdf`` object, unmodified, when every check passes.

    Raises:
        PDFValidationError: If the file is encrypted or pypdf reports a read
            error while accessing metadata, the root object or the pages.
    """
    try:
        # This will raise if the file is encrypted
        _ = pdf.metadata

        # This will raise if the file's root object is corrupted
        _ = pdf.root_object

        # This will raise if the file's pages are corrupted. Every page is
        # visited, but there is no need to keep them in a list.
        for _ in pdf.pages:
            pass
    except FileNotDecryptedError as e:
        # Must come before the PdfReadError handler: pypdf defines
        # FileNotDecryptedError as a subclass of PdfReadError.
        raise PDFValidationError(
            "File is encrypted. Please decrypt it with password.",
        ) from e
    except PdfReadError as e:
        raise PDFValidationError(
            f"File does not appear to be a valid PDF. Error: {e}",
        ) from e

    return pdf

src/unstructured_client/_hooks/custom/split_pdf_hook.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
import aiofiles
1717
import httpx
18+
from httpx import RequestError
1819
import nest_asyncio # type: ignore
1920
from httpx import AsyncClient
2021
from pypdf import PdfReader, PdfWriter
@@ -303,6 +304,14 @@ def before_request(
303304
if pdf is None:
304305
return request
305306

307+
try:
308+
pdf = pdf_utils.check_pdf(pdf)
309+
except pdf_utils.PDFValidationError as e:
310+
raise RequestError(
311+
message=e.message,
312+
request=request,
313+
) from e
314+
306315
starting_page_number = form_utils.get_starting_page_number(
307316
form_data,
308317
key=PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,

0 commit comments

Comments
 (0)