@@ -1856,6 +1856,51 @@ def print_unique_links_with_status_codes(self):
1856
1856
soup = self .get_beautiful_soup (self .get_page_source ())
1857
1857
page_utils ._print_unique_links_with_status_codes (page_url , soup )
1858
1858
1859
def assert_pdf_text(self, pdf, text, page=None):
    """ Asserts text in a PDF file.
        PDF can be either a URL or a file path on the local file system.
        @Params
        pdf - The URL or file path of the PDF file.
        text - The expected text to verify in the PDF.
        page - The page number of the PDF to use (optional).
                If a page number is provided, looks only at that page.
                    (1 is the first page, 2 is the second page, etc.)
                If no page number is provided, looks at all the pages.
        @Raises
        Exception - If "pdf" does not end with ".pdf", if "pdf" is neither
                    a valid URL nor an existing file path, if the page
                    number is below 1 or above the number of pages, or if
                    the expected text is not found. """
    if not pdf.lower().endswith('.pdf'):
        raise Exception("%s is not a PDF file! (Expecting a .pdf)" % pdf)
    # Only an actual int selects a single page; any other value
    # (including None) means that every page gets searched.
    single_page = type(page) is int
    if single_page and page < 1:
        # Pages are 1-based. Without this check, page 0 would turn into
        # index -1 below instead of being reported as invalid.
        raise Exception("Invalid page number for the PDF!")
    # Imported only after the arguments pass the cheap checks above.
    import PyPDF2
    if page_utils.is_valid_url(pdf):
        if self.get_current_url() != pdf:
            self.open(pdf)
        self.download_file(pdf)
        file_name = pdf.split('/')[-1]
        file_path = os.path.join(self.get_downloads_folder(), file_name)
    else:
        if not os.path.exists(pdf):
            raise Exception("%s is not a valid URL or file path!" % pdf)
        file_path = os.path.abspath(pdf)
    # All page reads stay inside the "with" block so that the reader never
    # touches a closed file, and the handle is released on every exit path.
    with open(file_path, "rb") as pdf_file_object:
        pdf_reader = PyPDF2.PdfFileReader(pdf_file_object, strict=False)
        num_pages = pdf_reader.numPages
        if single_page:
            if page > num_pages:
                raise Exception("Invalid page number for the PDF!")
            # getPage() takes a 0-based index; "page" is kept 1-based so
            # that the failure message matches what the caller passed in.
            page_obj = pdf_reader.getPage(page - 1)
            if text not in page_obj.extractText():
                raise Exception("PDF [%s] is missing expected text [%s] on "
                                "page [%s]!" % (file_path, text, page))
            return
        for page_num in range(num_pages):
            page_obj = pdf_reader.getPage(page_num)
            if text in page_obj.extractText():
                return
    raise Exception("PDF [%s] is missing expected text [%s]!"
                    "" % (file_path, text))
1903
+
1859
1904
def create_folder (self , folder ):
1860
1905
""" Creates a folder of the given name if it doesn't already exist. """
1861
1906
if folder .endswith ("/" ):
0 commit comments