Skip to content

Commit 360aba3

Browse files
authored
Merge pull request #435 from seleniumbase/handle-pdfs-with-chinese-characters
Allow the use of Chinese characters in PDF testing
2 parents 1441669 + 7452015 commit 360aba3

File tree

8 files changed

+116
-34
lines changed

8 files changed

+116
-34
lines changed

examples/test_chinese_pdf.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# -*- coding: utf-8 -*-
2+
from seleniumbase import BaseCase
3+
4+
5+
class ChinesePdfTestClass(BaseCase):
6+
7+
def test_chinese_pdf(self):
8+
9+
pdf = ('https://github.com/seleniumbase/SeleniumBase/'
10+
'files/3895614/unittest.pdf')
11+
12+
# Get and print PDF text
13+
pdf_text = self.get_pdf_text(pdf, page=2)
14+
print("\n" + pdf_text)
15+
16+
# Assert PDF contains the expected text on Page 2
17+
self.assert_pdf_text(pdf, "个测试类", page=2)
18+
19+
# Assert PDF contains the expected text on any of the pages
20+
self.assert_pdf_text(pdf, "运行单元测试")
21+
self.assert_pdf_text(pdf, "等待测试结束后显示所有结果")
22+
self.assert_pdf_text(pdf, "测试的执行跟方法的顺序没有关系")

examples/test_get_pdf_text.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,4 @@ def test_get_pdf_text(self):
77
pdf = ("https://nostarch.com/download/"
88
"Automate_the_Boring_Stuff_sample_ch17.pdf")
99
pdf_text = self.get_pdf_text(pdf, page=1)
10-
print(pdf_text)
10+
print("\n" + pdf_text)

help_docs/method_summary.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -221,9 +221,11 @@ self.assert_no_404_errors(multithreaded=True)
221221

222222
self.print_unique_links_with_status_codes()
223223

224-
self.get_pdf_text(pdf, page=None)
224+
self.get_pdf_text(pdf, page=None, maxpages=None, password=None,
225+
codec='utf-8', wrap=False, nav=False, override=False)
225226

226-
self.assert_pdf_text(pdf, text, page=None)
227+
self.assert_pdf_text(pdf, text, page=None, maxpages=None, password=None,
228+
codec='utf-8', wrap=True, nav=False, override=False)
227229

228230
self.create_folder(folder)
229231

pytest.ini

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ addopts = --capture=no --ignore conftest.py -p no:cacheprovider
66
# Ignore warnings such as DeprecationWarning and pytest.PytestUnknownMarkWarning
77
filterwarnings = ignore::pytest.PytestWarning
88

9+
# Configure the junit_family option explicitly:
10+
junit_family = legacy
11+
912
# Set pytest discovery rules:
1013
# (Most of the rules here are similar to the default rules.)
1114
# (unittest.TestCase rules override the rules here for classes and functions.)

requirements.txt

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@ pytest>=4.6.6;python_version<"3"
1616
pytest>=5.3.1;python_version>="3"
1717
pytest-cov>=2.8.1
1818
pytest-forked>=1.1.3
19-
pytest-html==1.22.0
19+
pytest-html==1.22.1;python_version<"3.6"
20+
pytest-html==2.0.1;python_version>="3.6"
2021
pytest-metadata>=1.8.0
2122
pytest-ordering>=0.6
2223
pytest-rerunfailures>=8.0
@@ -31,10 +32,10 @@ asn1crypto>=1.2.0
3132
pyopenssl>=19.1.0
3233
colorama>=0.4.1
3334
pymysql>=0.9.3
34-
pypdf2>=1.26.0
3535
pyotp>=2.3.0
3636
boto>=2.49.0
3737
cffi>=1.13.2
3838
tqdm>=4.39.0
3939
flake8>=3.7.9
4040
certifi>=2019.9.11
41+
pdfminer.six==20191110

seleniumbase/console_scripts/sb_mkdir.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ def main():
5555
data.append("addopts = --capture=no --ignore conftest.py "
5656
"-p no:cacheprovider")
5757
data.append("filterwarnings = ignore::pytest.PytestWarning")
58+
data.append("junit_family = legacy")
5859
data.append("python_files = test_*.py *_test.py *_tests.py *_suite.py")
5960
data.append("python_classes = Test* *Test* *Test *Tests *Suite")
6061
data.append("python_functions = test_*")

seleniumbase/fixtures/base_case.py

Lines changed: 78 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1949,50 +1949,84 @@ def print_unique_links_with_status_codes(self):
19491949
soup = self.get_beautiful_soup(self.get_page_source())
19501950
page_utils._print_unique_links_with_status_codes(page_url, soup)
19511951

1952-
def __get_pdf_reader_obj(self, pdf_file_object, strict=False):
1953-
import PyPDF2
1954-
pdf_reader_object = PyPDF2.PdfFileReader(pdf_file_object, strict)
1955-
return pdf_reader_object
1956-
1957-
def get_pdf_text(self, pdf, page=None):
1952+
def __fix_unicode_conversion(self, text):
1953+
""" Fixing Chinese characters when converting from PDF to HTML. """
1954+
text = text.replace(u'\u2f8f', u'\u884c')
1955+
text = text.replace(u'\u2f45', u'\u65b9')
1956+
text = text.replace(u'\u2f08', u'\u4eba')
1957+
text = text.replace(u'\u2f70', u'\u793a')
1958+
return text
1959+
1960+
def get_pdf_text(self, pdf, page=None, maxpages=None,
1961+
password=None, codec='utf-8', wrap=False, nav=False,
1962+
override=False):
19581963
""" Gets text from a PDF file.
19591964
PDF can be either a URL or a file path on the local file system.
19601965
@Params
19611966
pdf - The URL or file path of the PDF file.
1962-
page - The page number of the PDF to use (optional).
1967+
page - The page number (or a list of page numbers) of the PDF.
19631968
If a page number is provided, looks only at that page.
19641969
(1 is the first page, 2 is the second page, etc.)
1965-
If no page number is provided, returns all PDF text. """
1970+
If no page number is provided, returns all PDF text.
1971+
maxpages - Instead of providing a page number, you can provide
1972+
the number of pages to use from the beginning.
1973+
password - If the PDF is password-protected, enter it here.
1974+
codec - The character encoding used for the extracted text.
1975+
(The default codec used by this method is 'utf-8'.)
1976+
wrap - Replaces ' \n' with ' ' so that individual sentences
1977+
from a PDF don't get broken up into separate lines when
1978+
getting converted into text format.
1979+
nav - If PDF is a URL, navigates to the URL in the browser first.
1980+
(Not needed because the PDF will be downloaded anyway.)
1981+
override - If the PDF file to be downloaded already exists in the
1982+
downloaded_files/ folder, that PDF will be used
1983+
instead of downloading it again, unless override is True. """
1984+
from pdfminer.high_level import extract_text
1985+
if not password:
1986+
password = ''
1987+
if not maxpages:
1988+
maxpages = 0
19661989
if not pdf.lower().endswith('.pdf'):
19671990
raise Exception("%s is not a PDF file! (Expecting a .pdf)" % pdf)
19681991
file_path = None
19691992
if page_utils.is_valid_url(pdf):
1970-
if self.get_current_url() != pdf:
1971-
self.open(pdf)
1972-
self.download_file(pdf)
1993+
if nav:
1994+
if self.get_current_url() != pdf:
1995+
self.open(pdf)
19731996
file_name = pdf.split('/')[-1]
19741997
file_path = self.get_downloads_folder() + '/' + file_name
1998+
if not os.path.exists(file_path):
1999+
self.download_file(pdf)
2000+
elif override:
2001+
self.download_file(pdf)
19752002
else:
19762003
if not os.path.exists(pdf):
19772004
raise Exception("%s is not a valid URL or file path!" % pdf)
19782005
file_path = os.path.abspath(pdf)
1979-
pdf_file_object = open(file_path, "rb")
1980-
pdf_reader = self.__get_pdf_reader_obj(pdf_file_object, strict=False)
1981-
num_pages = pdf_reader.numPages
1982-
pdf_text = ""
1983-
if type(page) is int:
1984-
if page > num_pages:
1985-
raise Exception("Invalid page number for the PDF!")
2006+
page_search = None # (Pages are delimited by '\x0c')
2007+
if type(page) is list:
2008+
pages = page
2009+
page_search = []
2010+
for page in pages:
2011+
page_search.append(page - 1)
2012+
elif type(page) is int:
19862013
page = page - 1
1987-
page_obj = pdf_reader.getPage(page)
1988-
pdf_text = page_obj.extractText()
2014+
if page < 0:
2015+
page = 0
2016+
page_search = [page]
19892017
else:
1990-
for page_num in range(num_pages):
1991-
page_obj = pdf_reader.getPage(page_num)
1992-
pdf_text = pdf_text + '\n' + page_obj.extractText()
2018+
page_search = None
2019+
pdf_text = extract_text(
2020+
file_path, password='', page_numbers=page_search,
2021+
maxpages=maxpages, caching=False, codec=codec)
2022+
pdf_text = self.__fix_unicode_conversion(pdf_text)
2023+
if wrap:
2024+
pdf_text = pdf_text.replace(' \n', ' ')
19932025
return pdf_text
19942026

1995-
def assert_pdf_text(self, pdf, text, page=None):
2027+
def assert_pdf_text(self, pdf, text, page=None, maxpages=None,
2028+
password=None, codec='utf-8', wrap=True, nav=False,
2029+
override=False):
19962030
""" Asserts text in a PDF file.
19972031
PDF can be either a URL or a file path on the local file system.
19982032
@Params
@@ -2001,8 +2035,26 @@ def assert_pdf_text(self, pdf, text, page=None):
20012035
page - The page number of the PDF to use (optional).
20022036
If a page number is provided, looks only at that page.
20032037
(1 is the first page, 2 is the second page, etc.)
2004-
If no page number is provided, looks at all the pages. """
2005-
pdf_text = self.get_pdf_text(pdf, page=page)
2038+
If no page number is provided, looks at all the pages.
2039+
maxpages - Instead of providing a page number, you can provide
2040+
the number of pages to use from the beginning.
2041+
password - If the PDF is password-protected, enter it here.
2042+
codec - The character encoding used for the extracted text.
2043+
(The default codec used by this method is 'utf-8'.)
2044+
wrap - Replaces ' \n' with ' ' so that individual sentences
2045+
from a PDF don't get broken up into separate lines when
2046+
getting converted into text format.
2047+
nav - If PDF is a URL, navigates to the URL in the browser first.
2048+
(Not needed because the PDF will be downloaded anyway.)
2049+
override - If the PDF file to be downloaded already exists in the
2050+
downloaded_files/ folder, that PDF will be used
2051+
instead of downloading it again, unless override is True. """
2052+
text = self.__fix_unicode_conversion(text)
2053+
if not codec:
2054+
codec = 'utf-8'
2055+
pdf_text = self.get_pdf_text(
2056+
pdf, page=page, maxpages=maxpages, password=password, codec=codec,
2057+
wrap=wrap, nav=nav, override=override)
20062058
if type(page) is int:
20072059
if text not in pdf_text:
20082060
raise Exception("PDF [%s] is missing expected text [%s] on "

setup.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545

4646
setup(
4747
name='seleniumbase',
48-
version='1.33.7',
48+
version='1.33.8',
4949
description='Fast, Easy, and Reliable Browser Automation & Testing.',
5050
long_description=long_description,
5151
long_description_content_type='text/markdown',
@@ -99,7 +99,8 @@
9999
'pytest>=5.3.1;python_version>="3"',
100100
'pytest-cov>=2.8.1',
101101
'pytest-forked>=1.1.3',
102-
'pytest-html==1.22.0', # Keep at 1.22.0 unless tested on Windows
102+
'pytest-html==1.22.1;python_version<"3.6"',
103+
'pytest-html==2.0.1;python_version>="3.6"',
103104
'pytest-metadata>=1.8.0',
104105
'pytest-ordering>=0.6',
105106
'pytest-rerunfailures>=8.0',
@@ -114,13 +115,13 @@
114115
'pyopenssl>=19.1.0',
115116
'colorama>=0.4.1',
116117
'pymysql>=0.9.3',
117-
'pypdf2>=1.26.0',
118118
'pyotp>=2.3.0',
119119
'boto>=2.49.0',
120120
'cffi>=1.13.2',
121121
'tqdm>=4.39.0',
122122
'flake8>=3.7.9',
123123
'certifi>=2019.9.11',
124+
'pdfminer.six==20191110',
124125
],
125126
packages=[
126127
'seleniumbase',

0 commit comments

Comments
 (0)