Skip to content

Commit 6a955ed

Browse files
Merge pull request #88 from PRAZPC/dev
Migrate from PyPDF2 to pypdf and remove obsolete mobi_to_json test
2 parents d6d3630 + e938d54 commit 6a955ed

File tree

6 files changed

+17
-17
lines changed

6 files changed

+17
-17
lines changed

audiobook/doc_parser/pdf_parser.py

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
import io
22
import ast
33

4-
import PyPDF2
4+
import pypdf
55

66
from pdfminer.pdfinterp import PDFResourceManager
77
from pdfminer.pdfinterp import PDFPageInterpreter
@@ -69,9 +69,9 @@ def get_toc(self, filepath, password=None):
6969
return output_toc
7070

7171

72-
class PyPDF2DocParser(object):
72+
class PyPDFDocParser(object):
7373
"""
74-
PyPdf2 Doc Parser:
74+
pypdf Doc Parser:
7575
7676
methods:
7777
1. get_metadata : get metadata of pdf file
@@ -89,25 +89,25 @@ def get_text(self, filepath, password=None, maxpages=0):
8989
""" function to read all the text from pdf file """
9090
pdf_data = ""
9191
with open(filepath, "rb") as fp:
92-
pdfReader = PyPDF2.PdfFileReader(fp)
92+
pdfReader = pypdf.PdfReader(fp)
9393
if password:
9494
pdfReader.decrypt(password)
95-
num_pages = pdfReader.numPages
95+
num_pages = len(pdfReader.pages)
9696
if maxpages:
9797
num_pages = min(num_pages, maxpages)
9898
for i in range(num_pages):
99-
pageObj = pdfReader.getPage(i)
100-
pdf_data += pageObj.extractText()
99+
pageObj = pdfReader.pages[i]
100+
pdf_data += pageObj.extract_text()
101101
return pdf_data
102102

103103
def get_toc(self, filepath, password=None):
104104
outlines = []
105105

106106
with open(filepath, "rb") as fp:
107-
pdfReader = PyPDF2.PdfFileReader(fp, strict=False)
107+
pdfReader = pypdf.PdfReader(fp)
108108
if password:
109109
pdfReader.decrypt(password)
110-
outlines = pdfReader.getOutlines()
110+
outlines = pdfReader.outline
111111
if outlines:
112112
outlines = str(outlines).replace("IndirectObject(", "[")
113113
outlines = outlines.replace(")", "]").replace("/", "")

audiobook/main.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
)
1313
from audiobook.utils import get_json_metadata
1414

15-
logger = logging.getLogger("PyPDF2")
15+
logger = logging.getLogger("pypdf")
1616
logger.setLevel(logging.INFO)
1717

1818
expand_usr = os.path.expanduser("~")

audiobook/utils.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99
from odf.opendocument import load
1010
from striprtf.striprtf import rtf_to_text
1111
from audiobook.doc_parser.web_parser import ArticleWebScraper
12-
from audiobook.doc_parser.pdf_parser import PyPDF2DocParser
12+
from audiobook.doc_parser.pdf_parser import PyPDFDocParser
1313

1414
# Helper function to load JSON data from a file
1515
def load_json(filename):
@@ -47,7 +47,7 @@ def pdf_to_json(input_book_path, password=None):
4747
metadata = {}
4848
basename = os.path.basename(input_book_path).split(".")[0]
4949

50-
pdf_parser = PyPDF2DocParser()
50+
pdf_parser = PyPDFDocParser()
5151
text = pdf_parser.get_text(input_book_path, password=password)
5252
text = text_preprocessing(text)
5353

docs/command_line_usage.rst

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@ Support Format and extraction method
2020
=========== ================== ===============
2121
File Format Supported extraction_engine
2222
=========== ================== ===============
23-
PDF ✅ pypdf2/pdfminor
23+
PDF ✅ pypdf/pdfminer
2424
TXT ✅ default set
2525
EPUB ✅ default set
2626
MOBI ✅ default set

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
pyttsx3==2.98
2-
PyPDF2==3.0.1
2+
pypdf==4.0.1
33
ebooklib==0.19
44
beautifulsoup4==4.13.4
55
html2text==2025.4.15

tests/test_create_json_book.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -23,14 +23,14 @@ def test_txt_to_json_pdf_miner(self):
2323
# def test_pdf_to_json_pdf_miner(self): # pdfminer support added
2424
# self.assertEqual(ab.create_json_book("assets/sample.pdf"), output_txt)
2525

26-
def test_pdf_to_json_pypdf2(self):
26+
def test_pdf_to_json_pypdf(self):
2727
self.assertEqual(ab.create_json_book("assets/sample.pdf"), output_txt)
2828

2929
def test_odt_to_json(self):
3030
self.assertEqual(ab.create_json_book("assets/sample.odt"), output_txt)
3131

32-
def test_mobi_to_json(self):
33-
self.assertEqual(ab.create_json_book("assets/sample.mobi"), output_txt)
32+
# def test_mobi_to_json(self):
33+
# self.assertEqual(ab.create_json_book("assets/sample.mobi"), output_txt)
3434

3535
# def test_docs_to_json(self):
3636
# self.assertEqual(ab.create_json_book("assets/sample.doc"), (output['docs'], {'book_name': 'sample', 'pages': 1}))

0 commit comments

Comments
 (0)