Skip to content

Commit c64665b

Browse files
committed
Allow the use of Chinese characters in PDF testing
1 parent 1441669 commit c64665b

File tree

1 file changed

+78
-26
lines changed

1 file changed

+78
-26
lines changed

seleniumbase/fixtures/base_case.py

Lines changed: 78 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1949,50 +1949,84 @@ def print_unique_links_with_status_codes(self):
19491949
soup = self.get_beautiful_soup(self.get_page_source())
19501950
page_utils._print_unique_links_with_status_codes(page_url, soup)
19511951

1952-
def __get_pdf_reader_obj(self, pdf_file_object, strict=False):
1953-
import PyPDF2
1954-
pdf_reader_object = PyPDF2.PdfFileReader(pdf_file_object, strict)
1955-
return pdf_reader_object
1956-
1957-
def get_pdf_text(self, pdf, page=None):
1952+
def __fix_unicode_conversion(self, text):
    """ Normalizes Chinese characters found in text extracted from a PDF.
        Each code point from the Kangxi Radicals block listed below is
        swapped for the CJK ideograph that looks the same, so that the
        extracted text can be compared against normally-typed text.
        @Params
        text - The text to normalize.
        @Returns
        The text with every listed radical replaced. """
    # (radical as it may appear in extracted text, regular ideograph)
    substitutions = (
        (u'\u2f8f', u'\u884c'),
        (u'\u2f45', u'\u65b9'),
        (u'\u2f08', u'\u4eba'),
        (u'\u2f70', u'\u793a'),
    )
    for radical, ideograph in substitutions:
        text = text.replace(radical, ideograph)
    return text
1959+
1960+
def get_pdf_text(self, pdf, page=None, maxpages=None,
                 password=None, codec='utf-8', wrap=False, nav=False,
                 override=False):
    """ Gets text from a PDF file.
        PDF can be either a URL or a file path on the local file system.
        @Params
        pdf - The URL or file path of the PDF file.
        page - The page number (or a list of page numbers) of the PDF.
                If a page number is provided, looks only at that page.
                    (1 is the first page, 2 is the second page, etc.)
                Page numbers below 1 are treated as the first page.
                If no page number is provided, returns all PDF text.
        maxpages - Instead of providing a page number, you can provide
                   the number of pages to use from the beginning.
                   (None or 0 is passed to pdfminer as 0.)
        password - If the PDF is password-protected, enter it here.
        codec - The name of the character encoding passed to pdfminer.
                (The default codec used by this method is 'utf-8'.)
        wrap - Replaces ' \\n' with ' ' so that individual sentences
               from a PDF don't get broken up into separate lines when
               getting converted into text format.
        nav - If PDF is a URL, navigates to the URL in the browser first.
              (Not needed because the PDF will be downloaded anyway.)
        override - If True, the PDF is downloaded again even if a file
                   with the same name already exists in the downloads
                   folder. Otherwise an existing file is reused.
        @Returns
        The extracted text, after self.__fix_unicode_conversion() has
        been applied (and after line-wrapping, if "wrap" is set).
        @Raises
        Exception - If "pdf" does not end in ".pdf", or if it is neither
                    a valid URL nor an existing file path. """
    if not pdf.lower().endswith('.pdf'):
        raise Exception("%s is not a PDF file! (Expecting a .pdf)" % pdf)
    if not password:
        password = ''
    if not maxpages:
        maxpages = 0

    if page_utils.is_valid_url(pdf):
        if nav and self.get_current_url() != pdf:
            self.open(pdf)
        file_name = pdf.split('/')[-1]
        file_path = os.path.join(self.get_downloads_folder(), file_name)
        # Only hit the network when there is no local copy yet,
        # or when the caller explicitly asks for a fresh download.
        if override or not os.path.exists(file_path):
            self.download_file(pdf)
    else:
        if not os.path.exists(pdf):
            raise Exception("%s is not a valid URL or file path!" % pdf)
        file_path = os.path.abspath(pdf)

    # pdfminer expects zero-based page indexes; callers pass one-based.
    page_search = None
    if isinstance(page, (list, tuple)):
        page_search = [max(number - 1, 0) for number in page]
    elif isinstance(page, int) and not isinstance(page, bool):
        page_search = [max(page - 1, 0)]

    # Imported here so that input validation errors surface first.
    from pdfminer.high_level import extract_text
    pdf_text = extract_text(
        file_path, password=password, page_numbers=page_search,
        maxpages=maxpages, caching=False, codec=codec)
    pdf_text = self.__fix_unicode_conversion(pdf_text)
    if wrap:
        pdf_text = pdf_text.replace(' \n', ' ')
    return pdf_text
19942026

1995-
def assert_pdf_text(self, pdf, text, page=None):
2027+
def assert_pdf_text(self, pdf, text, page=None, maxpages=None,
2028+
password=None, codec='utf-8', wrap=True, nav=False,
2029+
override=False):
19962030
""" Asserts text in a PDF file.
19972031
PDF can be either a URL or a file path on the local file system.
19982032
@Params
@@ -2001,8 +2035,26 @@ def assert_pdf_text(self, pdf, text, page=None):
20012035
page - The page number of the PDF to use (optional).
20022036
If a page number is provided, looks only at that page.
20032037
(1 is the first page, 2 is the second page, etc.)
2004-
If no page number is provided, looks at all the pages. """
2005-
pdf_text = self.get_pdf_text(pdf, page=page)
2038+
If no page number is provided, looks at all the pages.
2039+
maxpages - Instead of providing a page number, you can provide
2040+
the number of pages to use from the beginning.
2041+
password - If the PDF is password-protected, enter it here.
2042+
codec - The name of the character encoding used for the text.
2043+
(The default codec used by this method is 'utf-8'.)
2044+
wrap - Replaces ' \n' with ' ' so that individual sentences
2045+
from a PDF don't get broken up into separate lines when
2046+
getting converted into text format.
2047+
nav - If PDF is a URL, navigates to the URL in the browser first.
2048+
(Not needed because the PDF will be downloaded anyway.)
2049+
override - If True, the PDF is downloaded again even if it already
2050+
exists in the downloaded_files/ folder. Otherwise,
2051+
the existing copy is used instead of re-downloading. """
2052+
text = self.__fix_unicode_conversion(text)
2053+
if not codec:
2054+
codec = 'utf-8'
2055+
pdf_text = self.get_pdf_text(
2056+
pdf, page=page, maxpages=maxpages, password=password, codec=codec,
2057+
wrap=wrap, nav=nav, override=override)
20062058
if type(page) is int:
20072059
if text not in pdf_text:
20082060
raise Exception("PDF [%s] is missing expected text [%s] on "

0 commit comments

Comments
 (0)