Skip to content

Commit b562ddc

Browse files
authored
Merge pull request #429 from seleniumbase/update-pdf-methods
Add get_pdf_text() and use that in assert_pdf_text()
2 parents 4a23ba5 + 1d1fb19 commit b562ddc

File tree

5 files changed (+48 -17 lines changed)

examples/test_get_pdf_text.py

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,10 @@
1+
from seleniumbase import BaseCase
2+
3+
4+
class PdfTestClass(BaseCase):
5+
6+
def test_get_pdf_text(self):
7+
pdf = ("https://nostarch.com/download/"
8+
"Automate_the_Boring_Stuff_sample_ch17.pdf")
9+
pdf_text = self.get_pdf_text(pdf, page=1)
10+
print(pdf_text)

help_docs/method_summary.md

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -219,6 +219,8 @@ self.assert_no_404_errors(multithreaded=True)
219219

220220
self.print_unique_links_with_status_codes()
221221

222+
self.get_pdf_text(pdf, page=None)
223+
222224
self.assert_pdf_text(pdf, text, page=None)
223225

224226
self.create_folder(folder)

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
pip>=19.3.1
2-
setuptools>=41.6.0
2+
setuptools>=42.0.0
33
setuptools-scm>=3.3.3
44
wheel>=0.33.6
55
six>=1.13.0

seleniumbase/fixtures/base_case.py

Lines changed: 34 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -1925,17 +1925,20 @@ def print_unique_links_with_status_codes(self):
19251925
soup = self.get_beautiful_soup(self.get_page_source())
19261926
page_utils._print_unique_links_with_status_codes(page_url, soup)
19271927

1928-
def assert_pdf_text(self, pdf, text, page=None):
1929-
""" Asserts text in a PDF file.
1928+
def __get_pdf_reader_obj(self, pdf_file_object, strict=False):
1929+
import PyPDF2
1930+
pdf_reader_object = PyPDF2.PdfFileReader(pdf_file_object, strict)
1931+
return pdf_reader_object
1932+
1933+
def get_pdf_text(self, pdf, page=None):
1934+
""" Gets text from a PDF file.
19301935
PDF can be either a URL or a file path on the local file system.
19311936
@Params
19321937
pdf - The URL or file path of the PDF file.
1933-
text - The expected text to verify in the PDF.
19341938
page - The page number of the PDF to use (optional).
19351939
If a page number is provided, looks only at that page.
19361940
(1 is the first page, 2 is the second page, etc.)
1937-
If no page number is provided, looks at all the pages. """
1938-
import PyPDF2
1941+
If no page number is provided, returns all PDF text. """
19391942
if not pdf.lower().endswith('.pdf'):
19401943
raise Exception("%s is not a PDF file! (Expecting a .pdf)" % pdf)
19411944
file_path = None
@@ -1950,25 +1953,41 @@ def assert_pdf_text(self, pdf, text, page=None):
19501953
raise Exception("%s is not a valid URL or file path!" % pdf)
19511954
file_path = os.path.abspath(pdf)
19521955
pdf_file_object = open(file_path, "rb")
1953-
pdf_reader = PyPDF2.PdfFileReader(pdf_file_object, strict=False)
1956+
pdf_reader = self.__get_pdf_reader_obj(pdf_file_object, strict=False)
19541957
num_pages = pdf_reader.numPages
1958+
pdf_text = ""
19551959
if type(page) is int:
19561960
if page > num_pages:
19571961
raise Exception("Invalid page number for the PDF!")
19581962
page = page - 1
19591963
page_obj = pdf_reader.getPage(page)
1960-
pdf_page_text = page_obj.extractText()
1961-
if text not in pdf_page_text:
1962-
raise Exception("PDF [%s] is missing expected text [%s] on "
1963-
"page [%s]!" % (file_path, text, page))
1964+
pdf_text = page_obj.extractText()
19641965
else:
19651966
for page_num in range(num_pages):
19661967
page_obj = pdf_reader.getPage(page_num)
1967-
pdf_page_text = page_obj.extractText()
1968-
if text in pdf_page_text:
1969-
return
1970-
raise Exception("PDF [%s] is missing expected text [%s]!"
1971-
"" % (file_path, text))
1968+
pdf_text = pdf_text + '\n' + page_obj.extractText()
1969+
return pdf_text
1970+
1971+
def assert_pdf_text(self, pdf, text, page=None):
1972+
""" Asserts text in a PDF file.
1973+
PDF can be either a URL or a file path on the local file system.
1974+
@Params
1975+
pdf - The URL or file path of the PDF file.
1976+
text - The expected text to verify in the PDF.
1977+
page - The page number of the PDF to use (optional).
1978+
If a page number is provided, looks only at that page.
1979+
(1 is the first page, 2 is the second page, etc.)
1980+
If no page number is provided, looks at all the pages. """
1981+
pdf_text = self.get_pdf_text(pdf, page=page)
1982+
if type(page) is int:
1983+
if text not in pdf_text:
1984+
raise Exception("PDF [%s] is missing expected text [%s] on "
1985+
"page [%s]!" % (pdf, text, page))
1986+
else:
1987+
if text not in pdf_text:
1988+
raise Exception("PDF [%s] is missing expected text [%s]!"
1989+
"" % (pdf, text))
1990+
return True
19721991

19731992
def create_folder(self, folder):
19741993
""" Creates a folder of the given name if it doesn't already exist. """

setup.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,7 @@
4545

4646
setup(
4747
name='seleniumbase',
48-
version='1.33.4',
48+
version='1.33.5',
4949
description='Fast, Easy, and Reliable Browser Automation & Testing.',
5050
long_description=long_description,
5151
long_description_content_type='text/markdown',

0 commit comments

Comments
 (0)