Skip to content

Commit b478eb3

Browse files
authored
Merge pull request #425 from seleniumbase/pdf-testing
Add the assert_pdf_text() method for asserting PDF text
2 parents acd0b6b + 530ac80 commit b478eb3

File tree

5 files changed

+67
-2
lines changed

5 files changed

+67
-2
lines changed

examples/test_pdf_asserts.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
from seleniumbase import BaseCase
2+
3+
4+
class PdfTestClass(BaseCase):
    """ Example test showing how to assert text inside a PDF file. """

    def test_assert_pdf_text(self):
        """ Checks for text on one specific page, then on any page. """
        # Both assertions below target the same PDF document.
        pdf_url = (
            "https://nostarch.com/download/Automate_the_Boring_Stuff_dTOC.pdf"
        )

        # Restricting the search to Page 1 of the PDF
        first_page_text = "Programming Is a Creative Activity"
        self.assert_pdf_text(pdf_url, first_page_text, page=1)

        # No page given, so the text may appear on any of the pages
        any_page_text = "Extracting Text from PDFs"
        self.assert_pdf_text(pdf_url, any_page_text)

help_docs/method_summary.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,8 @@ self.assert_no_404_errors(multithreaded=True)
211211

212212
self.print_unique_links_with_status_codes()
213213

214+
self.assert_pdf_text(pdf, text, page=None)
215+
214216
self.create_folder(folder)
215217

216218
self.choose_file(selector, file_path, by=By.CSS_SELECTOR, timeout=None)
@@ -223,7 +225,7 @@ self.save_file_as(file_url, new_file_name, destination_folder=None)
223225

224226
self.save_data_as(data, file_name, destination_folder=None)
225227

226-
self.get_downloads_folder(file)
228+
self.get_downloads_folder()
227229

228230
self.get_path_of_downloaded_file(file)
229231

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ asn1crypto>=1.2.0
3131
pyopenssl>=19.1.0
3232
colorama>=0.4.1
3333
pymysql>=0.9.3
34+
pypdf2>=1.26.0
3435
pyotp>=2.3.0
3536
boto>=2.49.0
3637
cffi>=1.13.2

seleniumbase/fixtures/base_case.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1856,6 +1856,51 @@ def print_unique_links_with_status_codes(self):
18561856
soup = self.get_beautiful_soup(self.get_page_source())
18571857
page_utils._print_unique_links_with_status_codes(page_url, soup)
18581858

1859+
def assert_pdf_text(self, pdf, text, page=None):
    """ Asserts text in a PDF file.
        PDF can be either a URL or a file path on the local file system.
        @Params
        pdf - The URL or file path of the PDF file.
        text - The expected text to verify in the PDF.
        page - The page number of the PDF to use (optional).
                If a page number is provided, looks only at that page.
                    (1 is the first page, 2 is the second page, etc.)
                If no page number is provided, looks at all the pages.
        @Raises
        Exception - If "pdf" does not end in ".pdf", if it is neither a
                    valid URL nor an existing file path, if the page
                    number is less than 1 or greater than the number of
                    pages, or if the expected text is not found. """
    if not pdf.lower().endswith('.pdf'):
        raise Exception("%s is not a PDF file! (Expecting a .pdf)" % pdf)

    # Only a real integer selects a single page. bool is a subclass of int,
    # so it is excluded to keep treating True/False like "no page given".
    single_page = isinstance(page, int) and not isinstance(page, bool)
    # Reject page numbers below 1 up front: after the 1-based -> 0-based
    # conversion they would turn into a negative index rather than an error.
    if single_page and page < 1:
        raise Exception("Invalid page number for the PDF!")

    # Imported here (after argument validation) so that invalid arguments
    # are reported before the PDF library or the browser is touched.
    import PyPDF2

    if page_utils.is_valid_url(pdf):
        if self.get_current_url() != pdf:
            self.open(pdf)
        self.download_file(pdf)
        # The downloaded file is looked up by the last URL path segment.
        file_name = pdf.split('/')[-1]
        file_path = os.path.join(self.get_downloads_folder(), file_name)
    else:
        if not os.path.exists(pdf):
            raise Exception("%s is not a valid URL or file path!" % pdf)
        file_path = os.path.abspath(pdf)

    # The context manager guarantees the file handle is closed on every
    # exit path, including the early return and the raised exceptions.
    with open(file_path, "rb") as pdf_file_object:
        pdf_reader = PyPDF2.PdfFileReader(pdf_file_object, strict=False)
        num_pages = pdf_reader.numPages
        if single_page:
            if page > num_pages:
                raise Exception("Invalid page number for the PDF!")
            # getPage() takes a 0-based index; "page" itself is left
            # untouched so the error message shows the caller's number.
            page_obj = pdf_reader.getPage(page - 1)
            if text not in page_obj.extractText():
                raise Exception("PDF [%s] is missing expected text [%s] on "
                                "page [%s]!" % (file_path, text, page))
            return
        for page_num in range(num_pages):
            page_obj = pdf_reader.getPage(page_num)
            if text in page_obj.extractText():
                return
    raise Exception("PDF [%s] is missing expected text [%s]!"
                    "" % (file_path, text))
1903+
18591904
def create_folder(self, folder):
18601905
""" Creates a folder of the given name if it doesn't already exist. """
18611906
if folder.endswith("/"):

setup.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545

4646
setup(
4747
name='seleniumbase',
48-
version='1.33.1',
48+
version='1.33.2',
4949
description='Fast, Easy, and Reliable Browser Automation & Testing.',
5050
long_description=long_description,
5151
long_description_content_type='text/markdown',
@@ -114,6 +114,7 @@
114114
'pyopenssl>=19.1.0',
115115
'colorama>=0.4.1',
116116
'pymysql>=0.9.3',
117+
'pypdf2>=1.26.0',
117118
'pyotp>=2.3.0',
118119
'boto>=2.49.0',
119120
'cffi>=1.13.2',

0 commit comments

Comments
 (0)