44import logging
55from typing import cast , Optional , BinaryIO , Union
66
7+ from email .message import Message
78from email .parser import BytesParser
8- from email .policy import default
9+ from email .policy import ( default , Policy )
910from pypdf import PdfReader
1011from pypdf .errors import FileNotDecryptedError , PdfReadError
1112
@@ -45,23 +46,22 @@ def read_pdf(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
4546 pdf_file .seek (0 )
4647 raw = pdf_file .read ()
4748 except Exception as e :
48- raise IOError (f"Failed to read file stream: { e } " )
49+ raise IOError (f"Failed to read file stream: { e } " ) from e
4950 else :
50- raise TypeError ("Expected bytes or a file-like object with 'read()' method" )
51+ raise IOError ("Expected bytes or a file-like object with 'read()' method" )
5152
5253 # This looks for multipart extraction
5354 try :
54- msg = BytesParser (policy = default ).parsebytes (raw )
55+ msg = BytesParser (policy = cast ( Policy [ Message ], default ) ).parsebytes (raw )
5556 for part in msg .walk ():
5657 if part .get_content_type () == "application/pdf" :
5758 pdf_bytes = part .get_payload (decode = True )
5859 if not isinstance (pdf_bytes , bytes ):
5960 continue
60- pdf_bytes = cast (bytes , pdf_bytes )
6161 pdf = PdfReader (io .BytesIO (pdf_bytes ), strict = False )
6262 return check_pdf (pdf )
6363 except Exception as e :
64- pdf_logger .debug (f "Multipart extraction failed: { e } " )
64+ pdf_logger .debug ("Multipart extraction failed: %s" , e )
6565
6666 # This looks for %PDF-
6767 try :
@@ -71,7 +71,7 @@ def read_pdf(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
7171 pdf = PdfReader (io .BytesIO (sliced ), strict = False )
7272 return check_pdf (pdf )
7373 except Exception as e :
74- pdf_logger .debug (f"% PDF- slicing fallback failed: { e } " )
74+ pdf_logger .debug ("%% PDF- slicing fallback failed: %s" , e )
7575
7676 return None
7777
0 commit comments