55from typing import cast , Optional , BinaryIO , Union
66
77from pypdf import PdfReader
8- from pypdf .errors import PdfReadError
8+ from pypdf .errors import FileNotDecryptedError , PdfReadError
99
1010from unstructured_client ._hooks .custom .common import UNSTRUCTURED_CLIENT_LOGGER_NAME
1111
1616pdf_logger = logging .getLogger ("pypdf" )
1717pdf_logger .setLevel (logging .ERROR )
1818
19+
class PDFValidationError(Exception):
    """Base class for errors raised while validating a PDF.

    The human-readable description is available both through ``str(exc)``
    and through the ``message`` attribute.
    """

    def __init__(self, message: str):
        """Create the error.

        Args:
            message: Description of why validation failed.
        """
        super().__init__(message)
        # Kept as a plain attribute so callers can read the text directly.
        self.message = message
26+
27+
1928def read_pdf (pdf_file : Union [BinaryIO , bytes ]) -> Optional [PdfReader ]:
2029 """Reads the given PDF file.
2130
@@ -33,3 +42,34 @@ def read_pdf(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
3342 return PdfReader (pdf_file , strict = False )
3443 except (PdfReadError , UnicodeDecodeError ):
3544 return None
45+
46+
def check_pdf(pdf: PdfReader) -> PdfReader:
    """Validate that the given PDF reader can be used.

    Accesses the reader's metadata, its root object and its full page list,
    in that order, and converts the pypdf errors raised along the way into
    ``PDFValidationError``.

    Args:
        pdf: The reader to validate.

    Returns:
        The same reader that was passed in, when every access succeeds.

    Raises:
        PDFValidationError: If a ``FileNotDecryptedError`` is raised (reported
            as an encrypted file) or any other ``PdfReadError`` is raised
            (reported as an invalid PDF). The pypdf exception is chained as
            the cause.
    """
    try:
        # Reading the metadata is expected to fail for an encrypted file.
        _ = pdf.metadata

        # Reading the root object is expected to fail when it is corrupted.
        _ = pdf.root_object

        # Materialising every page is expected to fail on corrupted pages.
        _ = list(pdf.pages)
    except FileNotDecryptedError as err:
        # Checked before the broader PdfReadError clause so that encrypted
        # files get their own message.
        reason = "File is encrypted. Please decrypt it with password."
        raise PDFValidationError(reason) from err
    except PdfReadError as err:
        reason = f"File does not appear to be a valid PDF. Error: {err} "
        raise PDFValidationError(reason) from err
    else:
        return pdf
0 commit comments