@@ -1949,50 +1949,84 @@ def print_unique_links_with_status_codes(self):
1949
1949
soup = self .get_beautiful_soup (self .get_page_source ())
1950
1950
page_utils ._print_unique_links_with_status_codes (page_url , soup )
1951
1951
1952
- def __get_pdf_reader_obj (self , pdf_file_object , strict = False ):
1953
- import PyPDF2
1954
- pdf_reader_object = PyPDF2 .PdfFileReader (pdf_file_object , strict )
1955
- return pdf_reader_object
1956
-
1957
- def get_pdf_text (self , pdf , page = None ):
1952
def __fix_unicode_conversion(self, text):
    """ Fixing Chinese characters when converting from PDF to HTML.
        Text extracted from a PDF can contain code points from the
        Unicode "Kangxi Radicals" block in place of the ordinary CJK
        ideographs that look the same. Those look-alikes break plain
        substring comparisons, so each one handled here is swapped for
        its regular ideograph.
        @Params
        text - The text to normalize.
        @Returns
        The text with the four known radicals replaced. """
    # (radical code point, ordinary ideograph) pairs. Each literal is
    # exactly one character so a radical is replaced no matter which
    # character follows it.
    replacements = (
        (u'\u2f8f', u'\u884c'),
        (u'\u2f45', u'\u65b9'),
        (u'\u2f08', u'\u4eba'),
        (u'\u2f70', u'\u793a'),
    )
    for radical, ideograph in replacements:
        text = text.replace(radical, ideograph)
    return text
1959
+
1960
def get_pdf_text(self, pdf, page=None, maxpages=None,
                 password=None, codec='utf-8', wrap=False, nav=False,
                 override=False):
    """ Gets text from a PDF file.
        PDF can be either a URL or a file path on the local file system.
        @Params
        pdf - The URL or file path of the PDF file.
        page - The page number (or a list of page numbers) of the PDF.
                If a page number is provided, looks only at that page.
                    (1 is the first page, 2 is the second page, etc.)
                If no page number is provided, returns all PDF text.
        maxpages - Instead of providing a page number, you can provide
                   the number of pages to use from the beginning.
        password - If the PDF is password-protected, enter it here.
        codec - The text codec handed to pdfminer's extract_text().
                (The default codec used by this method is 'utf-8'.)
        wrap - Replaces ' \\n' with ' ' so that individual sentences
               from a PDF don't get broken up into separate lines when
               getting converted into text format.
        nav - If PDF is a URL, navigates to the URL in the browser first.
              (Not needed because the PDF will be downloaded anyway.)
        override - If the PDF file to be downloaded already exists in the
                   downloads folder, that copy is reused by default.
                   Set this to True to download the PDF again anyway.
        @Returns
        The extracted text of the requested page(s).
        @Raises
        Exception - If `pdf` does not end with ".pdf", or if it is
                    neither a valid URL nor an existing file path. """
    # Validate the argument first so that a bad call fails with the
    # intended message before any third-party import is attempted.
    if not pdf.lower().endswith('.pdf'):
        raise Exception("%s is not a PDF file! (Expecting a .pdf)" % pdf)
    password = password or ''
    maxpages = maxpages or 0
    codec = codec or 'utf-8'
    if page_utils.is_valid_url(pdf):
        if nav and self.get_current_url() != pdf:
            self.open(pdf)
        file_name = pdf.split('/')[-1]
        file_path = self.get_downloads_folder() + '/' + file_name
        # Reuse an already-downloaded copy unless the caller overrides.
        if override or not os.path.exists(file_path):
            self.download_file(pdf)
    else:
        if not os.path.exists(pdf):
            raise Exception("%s is not a valid URL or file path!" % pdf)
        file_path = os.path.abspath(pdf)
    # Callers count pages from 1; extract_text() takes zero-indexed ones.
    if isinstance(page, (list, tuple)):
        page_search = [number - 1 for number in page]
    elif isinstance(page, int) and not isinstance(page, bool):
        # Anything below the first page falls back to the first page.
        page_search = [max(page - 1, 0)]
    else:
        page_search = None  # None means "every page"
    from pdfminer.high_level import extract_text
    pdf_text = extract_text(
        file_path, password=password, page_numbers=page_search,
        maxpages=maxpages, caching=False, codec=codec)
    pdf_text = self.__fix_unicode_conversion(pdf_text)
    if wrap:
        pdf_text = pdf_text.replace(' \n', ' ')
    return pdf_text
1994
2026
1995
- def assert_pdf_text (self , pdf , text , page = None ):
2027
+ def assert_pdf_text (self , pdf , text , page = None , maxpages = None ,
2028
+ password = None , codec = 'utf-8' , wrap = True , nav = False ,
2029
+ override = False ):
1996
2030
""" Asserts text in a PDF file.
1997
2031
PDF can be either a URL or a file path on the local file system.
1998
2032
@Params
@@ -2001,8 +2035,26 @@ def assert_pdf_text(self, pdf, text, page=None):
2001
2035
page - The page number of the PDF to use (optional).
2002
2036
If a page number is provided, looks only at that page.
2003
2037
(1 is the first page, 2 is the second page, etc.)
2004
- If no page number is provided, looks at all the pages. """
2005
- pdf_text = self .get_pdf_text (pdf , page = page )
2038
+ If no page number is provided, looks at all the pages.
2039
+ maxpages - Instead of providing a page number, you can provide
2040
+ the number of pages to use from the beginning.
2041
+ password - If the PDF is password-protected, enter it here.
2042
+ codec - The compression format for character encoding.
2043
+ (The default codec used by this method is 'utf-8'.)
2044
+ wrap - Replaces ' \n ' with ' ' so that individual sentences
2045
+ from a PDF don't get broken up into seperate lines when
2046
+ getting converted into text format.
2047
+ nav - If PDF is a URL, navigates to the URL in the browser first.
2048
+ (Not needed because the PDF will be downloaded anyway.)
2049
+ override - If the PDF file to be downloaded already exists in the
2050
+ downloaded_files/ folder, that PDF will be used
2051
+ instead of downloading it again. """
2052
+ text = self .__fix_unicode_conversion (text )
2053
+ if not codec :
2054
+ codec = 'utf-8'
2055
+ pdf_text = self .get_pdf_text (
2056
+ pdf , page = page , maxpages = maxpages , password = password , codec = codec ,
2057
+ wrap = wrap , nav = nav , override = override )
2006
2058
if type (page ) is int :
2007
2059
if text not in pdf_text :
2008
2060
raise Exception ("PDF [%s] is missing expected text [%s] on "
0 commit comments