11import io
22import ast
33
4- import PyPDF2
4+ import pypdf
55
66from pdfminer .pdfinterp import PDFResourceManager
77from pdfminer .pdfinterp import PDFPageInterpreter
@@ -69,9 +69,9 @@ def get_toc(self, filepath, password=None):
6969 return output_toc
7070
7171
72- class PyPDF2DocParser (object ):
72+ class PyPDFDocParser (object ):
7373 """
74- PyPdf2 Doc Parser:
74+ pypdf Doc Parser:
7575
7676 methods:
7777 1. get_metadata : get metadata of pdf file
@@ -89,25 +89,25 @@ def get_text(self, filepath, password=None, maxpages=0):
8989 """ function to read all the text from pdf file """
9090 pdf_data = ""
9191 with open (filepath , "rb" ) as fp :
92- pdfReader = PyPDF2 . PdfFileReader (fp )
92+ pdfReader = pypdf . PdfReader (fp )
9393 if password :
9494 pdfReader .decrypt (password )
95- num_pages = pdfReader .numPages
95+ num_pages = len ( pdfReader .pages )
9696 if maxpages :
9797 num_pages = min (num_pages , maxpages )
9898 for i in range (num_pages ):
99- pageObj = pdfReader .getPage ( i )
100- pdf_data += pageObj .extractText ()
99+ pageObj = pdfReader .pages [ i ]
100+ pdf_data += pageObj .extract_text ()
101101 return pdf_data
102102
103103 def get_toc (self , filepath , password = None ):
104104 outlines = []
105105
106106 with open (filepath , "rb" ) as fp :
107- pdfReader = PyPDF2 . PdfFileReader (fp , strict = False )
107+ pdfReader = pypdf . PdfReader (fp )
108108 if password :
109109 pdfReader .decrypt (password )
110- outlines = pdfReader .getOutlines ()
110+ outlines = pdfReader .outline
111111 if outlines :
112112 outlines = str (outlines ).replace ("IndirectObject(" , "[" )
113113 outlines = outlines .replace (")" , "]" ).replace ("/" , "" )
0 commit comments