Skip to content

Commit ee5ec9d

Browse files
committed
Add get_pdf_text() and use that in assert_pdf_text()
1 parent 4a23ba5 commit ee5ec9d

File tree

1 file changed

+34
-15
lines changed

1 file changed

+34
-15
lines changed

seleniumbase/fixtures/base_case.py

Lines changed: 34 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1925,17 +1925,20 @@ def print_unique_links_with_status_codes(self):
19251925
soup = self.get_beautiful_soup(self.get_page_source())
19261926
page_utils._print_unique_links_with_status_codes(page_url, soup)
19271927

1928-
def assert_pdf_text(self, pdf, text, page=None):
1929-
""" Asserts text in a PDF file.
1928+
def __get_pdf_reader_obj(self, pdf_file_object, strict=False):
    """ Creates a PyPDF2 reader for the given PDF file object.
        @Params
        pdf_file_object - The PDF file object to hand to PyPDF2.
        strict - Passed through as PdfFileReader's second argument.
        @Returns
        The PyPDF2.PdfFileReader object that was created. """
    # PyPDF2 is imported inside the method, so it is only loaded on use.
    from PyPDF2 import PdfFileReader
    return PdfFileReader(pdf_file_object, strict)
1932+
1933+
def get_pdf_text(self, pdf, page=None):
1934+
""" Gets text from a PDF file.
19301935
PDF can be either a URL or a file path on the local file system.
19311936
@Params
19321937
pdf - The URL or file path of the PDF file.
1933-
text - The expected text to verify in the PDF.
19341938
page - The page number of the PDF to use (optional).
19351939
If a page number is provided, looks only at that page.
19361940
(1 is the first page, 2 is the second page, etc.)
1937-
If no page number is provided, looks at all the pages. """
1938-
import PyPDF2
1941+
If no page number is provided, returns all PDF text. """
19391942
if not pdf.lower().endswith('.pdf'):
19401943
raise Exception("%s is not a PDF file! (Expecting a .pdf)" % pdf)
19411944
file_path = None
@@ -1950,25 +1953,41 @@ def assert_pdf_text(self, pdf, text, page=None):
19501953
raise Exception("%s is not a valid URL or file path!" % pdf)
19511954
file_path = os.path.abspath(pdf)
19521955
pdf_file_object = open(file_path, "rb")
1953-
pdf_reader = PyPDF2.PdfFileReader(pdf_file_object, strict=False)
1956+
pdf_reader = self.__get_pdf_reader_obj(pdf_file_object, strict=False)
19541957
num_pages = pdf_reader.numPages
1958+
pdf_text = ""
19551959
if type(page) is int:
19561960
if page > num_pages:
19571961
raise Exception("Invalid page number for the PDF!")
19581962
page = page - 1
19591963
page_obj = pdf_reader.getPage(page)
1960-
pdf_page_text = page_obj.extractText()
1961-
if text not in pdf_page_text:
1962-
raise Exception("PDF [%s] is missing expected text [%s] on "
1963-
"page [%s]!" % (file_path, text, page))
1964+
pdf_text = page_obj.extractText()
19641965
else:
19651966
for page_num in range(num_pages):
19661967
page_obj = pdf_reader.getPage(page_num)
1967-
pdf_page_text = page_obj.extractText()
1968-
if text in pdf_page_text:
1969-
return
1970-
raise Exception("PDF [%s] is missing expected text [%s]!"
1971-
"" % (file_path, text))
1968+
pdf_text = pdf_text + '\n' + page_obj.extractText()
1969+
return pdf_text
1970+
1971+
def assert_pdf_text(self, pdf, text, page=None):
    """ Verifies that the expected text appears in a PDF file.
        The PDF may be given as a URL or as a local file path.
        @Params
        pdf - The URL or file path of the PDF file.
        text - The expected text to verify in the PDF.
        page - The page number of the PDF to use (optional).
                If a page number is provided, looks only at that page.
                    (1 is the first page, 2 is the second page, etc.)
                If no page number is provided, looks at all the pages.
        @Returns
        True if the text is found.
        @Raises
        Exception if the text is missing from the extracted PDF text. """
    extracted_text = self.get_pdf_text(pdf, page=page)
    if text in extracted_text:
        return True
    # Only an int page is named in the failure message; any other value
    # (including None) produces the whole-document message.
    if type(page) is int:
        raise Exception("PDF [%s] is missing expected text [%s] on "
                        "page [%s]!" % (pdf, text, page))
    raise Exception("PDF [%s] is missing expected text [%s]!"
                    "" % (pdf, text))
19721991

19731992
def create_folder(self, folder):
19741993
""" Creates a folder of the given name if it doesn't already exist. """

0 commit comments

Comments
 (0)