44import logging
55from typing import cast , Optional , BinaryIO , Union
66
7+ from email .parser import BytesParser
8+ from email .policy import default
79from pypdf import PdfReader
810from pypdf .errors import FileNotDecryptedError , PdfReadError
911
@@ -26,6 +28,51 @@ def __init__(self, message: str):
2628
2729
def read_pdf(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
    """Read a PDF, falling back to recovery strategies when direct parsing fails.

    The input is first handed to ``read_pdf_raw``. If that yields no reader,
    the raw bytes are loaded and two best-effort recovery attempts are made,
    in order:

    1. Parse the bytes as a MIME message and return the first
       ``application/pdf`` part that ``_check_pdf_bytes`` accepts.
    2. Find the first ``%PDF-`` marker, drop everything before it, and open
       the remainder with a non-strict ``PdfReader``.

    Args:
        pdf_file: The PDF content, either as bytes or as a binary file-like
            object supporting ``seek()`` and ``read()``.

    Returns:
        A ``PdfReader`` if any strategy succeeds, otherwise ``None``.

    Raises:
        IOError: If the file-like object cannot be rewound or read.
        TypeError: If ``pdf_file`` is neither bytes nor a file-like object.
    """
    logger = logging.getLogger(__name__)

    reader = read_pdf_raw(pdf_file=pdf_file)
    if reader is not None:
        return reader

    logger.debug("read_pdf_raw returned no reader; attempting recovery")

    # Load the raw bytes so the recovery strategies can inspect them.
    if isinstance(pdf_file, bytes):
        raw = pdf_file
    elif hasattr(pdf_file, "read"):
        try:
            # Rewind first: the initial parse attempt may have consumed the stream.
            pdf_file.seek(0)
            raw = pdf_file.read()
        except Exception as e:
            raise IOError(f"Failed to read file stream: {e}") from e
    else:
        raise TypeError("Expected bytes or a file-like object with 'read()' method")

    # Strategy 1: the payload may be a MIME/multipart body wrapping the PDF.
    try:
        message = BytesParser(policy=default).parsebytes(raw)
        for part in message.walk():
            if part.get_content_type() != "application/pdf":
                continue
            candidate = _check_pdf_bytes(part.get_payload(decode=True))
            if candidate is not None:
                return candidate
    except Exception:
        # Recovery is best-effort; record the failure and try the next strategy.
        logger.debug("Multipart PDF extraction failed", exc_info=True)

    # Strategy 2: skip any leading junk before the PDF header marker.
    try:
        start = raw.find(b"%PDF-")
        if start != -1:
            return PdfReader(io.BytesIO(raw[start:]), strict=False)
    except Exception:
        logger.debug("Reading from the %%PDF- marker failed", exc_info=True)

    return None
73+
74+
75+ def read_pdf_raw (pdf_file : Union [BinaryIO , bytes ]) -> Optional [PdfReader ]:
2976 """Reads the given PDF file.
3077
3178 Args:
@@ -73,3 +120,14 @@ def check_pdf(pdf: PdfReader) -> PdfReader:
73120 raise PDFValidationError (
74121 f"File does not appear to be a valid PDF. Error: { e } " ,
75122 ) from e
123+
124+
def _check_pdf_bytes(pdf_bytes: Optional[bytes]) -> Optional[PdfReader]:
    """Strictly parse ``pdf_bytes`` and return the reader, or ``None`` on failure.

    The document is opened with ``strict=True`` and its root object and page
    list are accessed so that problems surface here rather than later in the
    caller.

    Args:
        pdf_bytes: Candidate PDF content. Empty or ``None`` input is treated
            as a failed check.

    Returns:
        The ``PdfReader`` when every access succeeds, otherwise ``None``.
    """
    if not pdf_bytes:
        return None

    try:
        pdf = PdfReader(io.BytesIO(pdf_bytes), strict=True)
        # Touch the root object and materialize the pages to force parsing now.
        _ = pdf.root_object
        list(pdf.pages)
    except Exception:
        # Deliberately broad: any parsing failure means "not usable", but
        # KeyboardInterrupt/SystemExit must still propagate.
        logging.getLogger(__name__).debug(
            "Candidate bytes are not a valid PDF", exc_info=True
        )
        return None
    return pdf
0 commit comments