@@ -1925,17 +1925,20 @@ def print_unique_links_with_status_codes(self):
1925
1925
soup = self .get_beautiful_soup (self .get_page_source ())
1926
1926
page_utils ._print_unique_links_with_status_codes (page_url , soup )
1927
1927
1928
- def assert_pdf_text (self , pdf , text , page = None ):
1929
- """ Asserts text in a PDF file.
1928
def __get_pdf_reader_obj(self, pdf_file_object, strict=False):
    """ Builds a PyPDF2 reader object from a PDF file object.
        @Params
        pdf_file_object - The file object that is handed to PyPDF2.
        strict - Forwarded to PyPDF2.PdfFileReader as its second
                 positional argument (Default: False).
        @Returns
        The PyPDF2.PdfFileReader object created for the file object. """
    # PyPDF2 is imported inside the method, not at module level,
    # so it is only loaded when a PDF reader is actually requested.
    from PyPDF2 import PdfFileReader
    return PdfFileReader(pdf_file_object, strict)
1932
+
1933
+ def get_pdf_text (self , pdf , page = None ):
1934
+ """ Gets text from a PDF file.
1930
1935
PDF can be either a URL or a file path on the local file system.
1931
1936
@Params
1932
1937
pdf - The URL or file path of the PDF file.
1933
- text - The expected text to verify in the PDF.
1934
1938
page - The page number of the PDF to use (optional).
1935
1939
If a page number is provided, looks only at that page.
1936
1940
(1 is the first page, 2 is the second page, etc.)
1937
- If no page number is provided, looks at all the pages. """
1938
- import PyPDF2
1941
+ If no page number is provided, returns all PDF text. """
1939
1942
if not pdf .lower ().endswith ('.pdf' ):
1940
1943
raise Exception ("%s is not a PDF file! (Expecting a .pdf)" % pdf )
1941
1944
file_path = None
@@ -1950,25 +1953,41 @@ def assert_pdf_text(self, pdf, text, page=None):
1950
1953
raise Exception ("%s is not a valid URL or file path!" % pdf )
1951
1954
file_path = os .path .abspath (pdf )
1952
1955
pdf_file_object = open (file_path , "rb" )
1953
- pdf_reader = PyPDF2 . PdfFileReader (pdf_file_object , strict = False )
1956
+ pdf_reader = self . __get_pdf_reader_obj (pdf_file_object , strict = False )
1954
1957
num_pages = pdf_reader .numPages
1958
+ pdf_text = ""
1955
1959
if type (page ) is int :
1956
1960
if page > num_pages :
1957
1961
raise Exception ("Invalid page number for the PDF!" )
1958
1962
page = page - 1
1959
1963
page_obj = pdf_reader .getPage (page )
1960
- pdf_page_text = page_obj .extractText ()
1961
- if text not in pdf_page_text :
1962
- raise Exception ("PDF [%s] is missing expected text [%s] on "
1963
- "page [%s]!" % (file_path , text , page ))
1964
+ pdf_text = page_obj .extractText ()
1964
1965
else :
1965
1966
for page_num in range (num_pages ):
1966
1967
page_obj = pdf_reader .getPage (page_num )
1967
- pdf_page_text = page_obj .extractText ()
1968
- if text in pdf_page_text :
1969
- return
1970
- raise Exception ("PDF [%s] is missing expected text [%s]!"
1971
- "" % (file_path , text ))
1968
+ pdf_text = pdf_text + '\n ' + page_obj .extractText ()
1969
+ return pdf_text
1970
+
1971
def assert_pdf_text(self, pdf, text, page=None):
    """ Verifies that the expected text appears in a PDF file.
        The PDF can be either a URL or a file path on the local file system.
        @Params
        pdf - The URL or file path of the PDF file.
        text - The expected text to verify in the PDF.
        page - The page number of the PDF to use (optional).
               If a page number is provided, looks only at that page.
                   (1 is the first page, 2 is the second page, etc.)
               If no page number is provided, looks at all the pages.
        @Returns
        True when the expected text is found.
        @Raises
        Exception when the expected text is missing from the PDF text. """
    # All PDF reading is delegated to get_pdf_text(); this method only
    # checks the returned text and picks the matching error message.
    extracted_text = self.get_pdf_text(pdf, page=page)
    if text in extracted_text:
        return True
    # Only a real int page number gets the page-specific message.
    if type(page) is int:
        raise Exception("PDF [%s] is missing expected text [%s] on "
                        "page [%s]!" % (pdf, text, page))
    raise Exception("PDF [%s] is missing expected text [%s]!"
                    "" % (pdf, text))
1972
1991
1973
1992
def create_folder (self , folder ):
1974
1993
""" Creates a folder of the given name if it doesn't already exist. """
0 commit comments