Merge pull request #435 from seleniumbase/handle-pdfs-with-chinese-characters

mdmintz · web-flow · commit 360aba3d3814 · 2019-11-29T00:55:53.000-05:00
Allow the use of Chinese characters in PDF testing
diff --git a/examples/test_chinese_pdf.py b/examples/test_chinese_pdf.py
@@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+from seleniumbase import BaseCase
+
+
+class ChinesePdfTestClass(BaseCase):
+
+    def test_chinese_pdf(self):
+
+        pdf = ('https://github.com/seleniumbase/SeleniumBase/'
+               'files/3895614/unittest.pdf')
+
+        # Get and print PDF text
+        pdf_text = self.get_pdf_text(pdf, page=2)
+        print("\n" + pdf_text)
+
+        # Assert PDF contains the expected text on Page 2
+        self.assert_pdf_text(pdf, "个测试类", page=2)
+
+        # Assert PDF contains the expected text on any of the pages
+        self.assert_pdf_text(pdf, "运行单元测试")
+        self.assert_pdf_text(pdf, "等待测试结束后显示所有结果")
+        self.assert_pdf_text(pdf, "测试的执行跟方法的顺序没有关系")
diff --git a/examples/test_get_pdf_text.py b/examples/test_get_pdf_text.py
@@ -7,4 +7,4 @@ def test_get_pdf_text(self):
         pdf = ("https://nostarch.com/download/"
                "Automate_the_Boring_Stuff_sample_ch17.pdf")
         pdf_text = self.get_pdf_text(pdf, page=1)
-        print(pdf_text)
+        print("\n" + pdf_text)
diff --git a/help_docs/method_summary.md b/help_docs/method_summary.md
@@ -221,9 +221,11 @@ self.assert_no_404_errors(multithreaded=True)
 
 self.print_unique_links_with_status_codes()
 
-self.get_pdf_text(pdf, page=None)
+self.get_pdf_text(pdf, page=None, maxpages=None, password=None,
+                  codec='utf-8', wrap=False, nav=False, override=False)
 
-self.assert_pdf_text(pdf, text, page=None)
+self.assert_pdf_text(pdf, text, page=None, maxpages=None, password=None,
+                     codec='utf-8', wrap=True, nav=False, override=False)
 
 self.create_folder(folder)
 
diff --git a/pytest.ini b/pytest.ini
@@ -6,6 +6,9 @@ addopts = --capture=no --ignore conftest.py -p no:cacheprovider
 # Ignore warnings such as DeprecationWarning and pytest.PytestUnknownMarkWarning
 filterwarnings = ignore::pytest.PytestWarning
 
+# Configure the junit_family option explicitly:
+junit_family = legacy
+
 # Set pytest discovery rules:
 # (Most of the rules here are similar to the default rules.)
 # (unittest.TestCase rules override the rules here for classes and functions.)
diff --git a/requirements.txt b/requirements.txt
@@ -16,7 +16,8 @@ pytest>=4.6.6;python_version<"3"
 pytest>=5.3.1;python_version>="3"
 pytest-cov>=2.8.1
 pytest-forked>=1.1.3
-pytest-html==1.22.0
+pytest-html==1.22.1;python_version<"3.6"
+pytest-html==2.0.1;python_version>="3.6"
 pytest-metadata>=1.8.0
 pytest-ordering>=0.6
 pytest-rerunfailures>=8.0
@@ -31,10 +32,10 @@ asn1crypto>=1.2.0
 pyopenssl>=19.1.0
 colorama>=0.4.1
 pymysql>=0.9.3
-pypdf2>=1.26.0
 pyotp>=2.3.0
 boto>=2.49.0
 cffi>=1.13.2
 tqdm>=4.39.0
 flake8>=3.7.9
 certifi>=2019.9.11
+pdfminer.six==20191110
diff --git a/seleniumbase/console_scripts/sb_mkdir.py b/seleniumbase/console_scripts/sb_mkdir.py
@@ -55,6 +55,7 @@ def main():
         data.append("addopts = --capture=no --ignore conftest.py "
                     "-p no:cacheprovider")
         data.append("filterwarnings = ignore::pytest.PytestWarning")
+        data.append("junit_family = legacy")
         data.append("python_files = test_*.py *_test.py *_tests.py *_suite.py")
         data.append("python_classes = Test* *Test* *Test *Tests *Suite")
         data.append("python_functions = test_*")
diff --git a/seleniumbase/fixtures/base_case.py b/seleniumbase/fixtures/base_case.py
@@ -1949,50 +1949,84 @@ def print_unique_links_with_status_codes(self):
         soup = self.get_beautiful_soup(self.get_page_source())
         page_utils._print_unique_links_with_status_codes(page_url, soup)
 
-    def __get_pdf_reader_obj(self, pdf_file_object, strict=False):
-        import PyPDF2
-        pdf_reader_object = PyPDF2.PdfFileReader(pdf_file_object, strict)
-        return pdf_reader_object
-
-    def get_pdf_text(self, pdf, page=None):
+    def __fix_unicode_conversion(self, text):
+        """ Fixes Chinese characters when converting from PDF to text. """
+        text = text.replace(u'\u2f8f', u'\u884c')
+        text = text.replace(u'\u2f45', u'\u65b9')
+        text = text.replace(u'\u2f08', u'\u4eba')
+        text = text.replace(u'\u2f70', u'\u793a')
+        return text
+
+    def get_pdf_text(self, pdf, page=None, maxpages=None,
+                     password=None, codec='utf-8', wrap=False, nav=False,
+                     override=False):
         """ Gets text from a PDF file.
             PDF can be either a URL or a file path on the local file system.
             @Params
             pdf - The URL or file path of the PDF file.
-            page - The page number of the PDF to use (optional).
+            page - The page number (or a list of page numbers) of the PDF.
                     If a page number is provided, looks only at that page.
                         (1 is the first page, 2 is the second page, etc.)
-                    If no page number is provided, returns all PDF text. """
+                    If no page number is provided, returns all PDF text.
+            maxpages - Instead of providing a page number, you can provide
+                       the number of pages to use from the beginning.
+            password - If the PDF is password-protected, enter it here.
+            codec - The character encoding used when extracting the text.
+                    (The default codec used by this method is 'utf-8'.)
+            wrap - Replaces ' \n' with ' ' so that individual sentences
+                   from a PDF don't get broken up into separate lines when
+                   getting converted into text format.
+            nav - If PDF is a URL, navigates to the URL in the browser first.
+                  (Not needed because the PDF will be downloaded anyway.)
+            override - If the PDF file to be downloaded already exists in the
+                       downloaded_files/ folder, that PDF is reused unless
+                       override is True, which downloads it again. """
+        from pdfminer.high_level import extract_text
+        if not password:
+            password = ''
+        if not maxpages:
+            maxpages = 0
         if not pdf.lower().endswith('.pdf'):
             raise Exception("%s is not a PDF file! (Expecting a .pdf)" % pdf)
         file_path = None
         if page_utils.is_valid_url(pdf):
-            if self.get_current_url() != pdf:
-                self.open(pdf)
-            self.download_file(pdf)
+            if nav:
+                if self.get_current_url() != pdf:
+                    self.open(pdf)
             file_name = pdf.split('/')[-1]
             file_path = self.get_downloads_folder() + '/' + file_name
+            if not os.path.exists(file_path):
+                self.download_file(pdf)
+            elif override:
+                self.download_file(pdf)
         else:
             if not os.path.exists(pdf):
                 raise Exception("%s is not a valid URL or file path!" % pdf)
             file_path = os.path.abspath(pdf)
-        pdf_file_object = open(file_path, "rb")
-        pdf_reader = self.__get_pdf_reader_obj(pdf_file_object, strict=False)
-        num_pages = pdf_reader.numPages
-        pdf_text = ""
-        if type(page) is int:
-            if page > num_pages:
-                raise Exception("Invalid page number for the PDF!")
+        page_search = None  # (Pages are delimited by '\x0c')
+        if type(page) is list:
+            pages = page
+            page_search = []
+            for page in pages:
+                page_search.append(page - 1)
+        elif type(page) is int:
             page = page - 1
-            page_obj = pdf_reader.getPage(page)
-            pdf_text = page_obj.extractText()
+            if page < 0:
+                page = 0
+            page_search = [page]
         else:
-            for page_num in range(num_pages):
-                page_obj = pdf_reader.getPage(page_num)
-                pdf_text = pdf_text + '\n' + page_obj.extractText()
+            page_search = None
+        pdf_text = extract_text(
+            file_path, password='', page_numbers=page_search,
+            maxpages=maxpages, caching=False, codec=codec)
+        pdf_text = self.__fix_unicode_conversion(pdf_text)
+        if wrap:
+            pdf_text = pdf_text.replace(' \n', ' ')
         return pdf_text
 
-    def assert_pdf_text(self, pdf, text, page=None):
+    def assert_pdf_text(self, pdf, text, page=None, maxpages=None,
+                        password=None, codec='utf-8', wrap=True, nav=False,
+                        override=False):
         """ Asserts text in a PDF file.
             PDF can be either a URL or a file path on the local file system.
             @Params
@@ -2001,8 +2035,26 @@ def assert_pdf_text(self, pdf, text, page=None):
             page - The page number of the PDF to use (optional).
                     If a page number is provided, looks only at that page.
                         (1 is the first page, 2 is the second page, etc.)
-                    If no page number is provided, looks at all the pages. """
-        pdf_text = self.get_pdf_text(pdf, page=page)
+                    If no page number is provided, looks at all the pages.
+            maxpages - Instead of providing a page number, you can provide
+                       the number of pages to use from the beginning.
+            password - If the PDF is password-protected, enter it here.
+            codec - The character encoding used when extracting the text.
+                    (The default codec used by this method is 'utf-8'.)
+            wrap - Replaces ' \n' with ' ' so that individual sentences
+                   from a PDF don't get broken up into separate lines when
+                   getting converted into text format.
+            nav - If PDF is a URL, navigates to the URL in the browser first.
+                  (Not needed because the PDF will be downloaded anyway.)
+            override - If the PDF file to be downloaded already exists in the
+                       downloaded_files/ folder, that PDF is reused unless
+                       override is True, which downloads it again. """
+        text = self.__fix_unicode_conversion(text)
+        if not codec:
+            codec = 'utf-8'
+        pdf_text = self.get_pdf_text(
+            pdf, page=page, maxpages=maxpages, password=password, codec=codec,
+            wrap=wrap, nav=nav, override=override)
         if type(page) is int:
             if text not in pdf_text:
                 raise Exception("PDF [%s] is missing expected text [%s] on "
diff --git a/setup.py b/setup.py
@@ -45,7 +45,7 @@
 
 setup(
     name='seleniumbase',
-    version='1.33.7',
+    version='1.33.8',
     description='Fast, Easy, and Reliable Browser Automation & Testing.',
     long_description=long_description,
     long_description_content_type='text/markdown',
@@ -99,7 +99,8 @@
         'pytest>=5.3.1;python_version>="3"',
         'pytest-cov>=2.8.1',
         'pytest-forked>=1.1.3',
-        'pytest-html==1.22.0',  # Keep at 1.22.0 unless tested on Windows
+        'pytest-html==1.22.1;python_version<"3.6"',
+        'pytest-html==2.0.1;python_version>="3.6"',
         'pytest-metadata>=1.8.0',
         'pytest-ordering>=0.6',
         'pytest-rerunfailures>=8.0',
@@ -114,13 +115,13 @@
         'pyopenssl>=19.1.0',
         'colorama>=0.4.1',
         'pymysql>=0.9.3',
-        'pypdf2>=1.26.0',
         'pyotp>=2.3.0',
         'boto>=2.49.0',
         'cffi>=1.13.2',
         'tqdm>=4.39.0',
         'flake8>=3.7.9',
         'certifi>=2019.9.11',
+        'pdfminer.six==20191110',
     ],
     packages=[
         'seleniumbase',