@@ -33,7 +33,7 @@ def read_pdf(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
3333 return reader
3434
3535 # TODO(klaijan) - remove once debugged
36- print ( "reader is None " )
36+ pdf_logger . debug ( "Initial PdfReader parsing failed, attempting fallbacks. " )
3737
3838 # load raw bytes
3939 # case bytes
@@ -55,21 +55,27 @@ def read_pdf(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
5555 for part in msg .walk ():
5656 if part .get_content_type () == "application/pdf" :
5757 pdf_bytes = part .get_payload (decode = True )
58- _check_pdf_bytes (pdf_bytes )
59- except Exception :
60- # TODO(klaijan)
61- pass
58+ if not isinstance (pdf_bytes , bytes ):
59+ continue
60+ pdf_bytes = cast (bytes , pdf_bytes )
61+ pdf = PdfReader (io .BytesIO (pdf_bytes ), strict = False )
62+ return check_pdf (pdf )
63+ except Exception as e :
64+ pdf_logger .debug (f"Multipart extraction failed: { e } " )
6265
6366 # look for %PDF-
6467 try :
6568 start = raw .find (b"%PDF-" )
6669 if start != - 1 :
6770 sliced = raw [start :]
68- _check_pdf_bytes (sliced )
69- return PdfReader (io .BytesIO (sliced ), strict = False )
70- except Exception :
71- # TODO(klaijan)
72- pass
71+ pdf = PdfReader (io .BytesIO (sliced ), strict = False )
72+ return check_pdf (pdf )
73+ except Exception as e :
74+ pdf_logger .debug (f"%PDF- slicing fallback failed: { e } " )
75+
76+ raise PDFValidationError (
77+ "File does not appear to be a valid PDF after all fallback attempts."
78+ )
7379
7480
7581def read_pdf_raw (pdf_file : Union [BinaryIO , bytes ]) -> Optional [PdfReader ]:
@@ -120,14 +126,3 @@ def check_pdf(pdf: PdfReader) -> PdfReader:
120126 raise PDFValidationError (
121127 f"File does not appear to be a valid PDF. Error: { e } " ,
122128 ) from e
123-
124-
125- def _check_pdf_bytes (pdf_bytes ) -> PdfReader :
126- try :
127- pdf = PdfReader (io .BytesIO (pdf_bytes ), strict = True )
128- pdf .root_object
129- list (pdf .pages )
130- return pdf
131- except :
132- # TODO(klaijan) exception
133- pass
0 commit comments