@@ -49,24 +49,13 @@ def read_pdf(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
4949 else :
5050 raise IOError ("Expected bytes or a file-like object with 'read()' method" )
5151
52- # This looks for multipart extraction
53- try :
54- msg = BytesParser (policy = cast (Policy , default )).parsebytes (raw )
55- for part in msg .walk ():
56- if part .get_content_type () == "application/pdf" :
57- pdf_bytes = part .get_payload (decode = True )
58- if not isinstance (pdf_bytes , bytes ):
59- continue
60- pdf = PdfReader (io .BytesIO (pdf_bytes ), strict = False )
61- return check_pdf (pdf )
62- except Exception as e :
63- pdf_logger .debug ("Multipart extraction failed: %s" , e )
64-
52+ # breakpoint()
6553 # This looks for %PDF-
6654 try :
6755 start = raw .find (b"%PDF-" )
56+ end = raw .find (b"%%EOF" ) + len (b"%%EOF" )
6857 if start != - 1 :
69- sliced = raw [start :]
58+ sliced = raw [start :end ]
7059 pdf = PdfReader (io .BytesIO (sliced ), strict = False )
7160 return check_pdf (pdf )
7261 except Exception as e :
@@ -84,13 +73,20 @@ def read_pdf_raw(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
8473 Returns:
8574 The PdfReader object if the file is a PDF, None otherwise.
8675 """
87-
8876 try :
8977 if isinstance (pdf_file , bytes ):
9078 content = cast (bytes , pdf_file )
9179 pdf_file = io .BytesIO (content )
92- return PdfReader (pdf_file , strict = False )
93- except (PdfReadError , UnicodeDecodeError ):
80+ reader = PdfReader (pdf_file , strict = False )
81+ return check_pdf (reader )
82+ except (PdfReadError , UnicodeDecodeError ) as e :
83+ pdf_logger .debug ("Read pdf failed: %s" , e )
84+ return None
85+ except PDFValidationError as e :
86+ pdf_logger .debug ("Check pdf failed: %s" , e )
87+ return None
88+ except Exception as e :
89+ pdf_logger .debug ("An unexpected error occurred: %s" , e )
9490 return None
9591
9692
0 commit comments