 from .encoding import get_encoding
 from .compat import str_
 
-utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
+utf8_parser = lxml.html.HTMLParser(encoding="utf-8")
 
 
 def build_doc(page):
     if isinstance(page, str_):
         encoding = None
         decoded_page = page
     else:
-        encoding = get_encoding(page) or 'utf-8'
-        decoded_page = page.decode(encoding, 'replace')
+        encoding = get_encoding(page) or "utf-8"
+        decoded_page = page.decode(encoding, "replace")
 
     # XXX: we have to do .decode and .encode even for utf-8 pages to remove bad characters
-    doc = lxml.html.document_fromstring(decoded_page.encode('utf-8', 'replace'), parser=utf8_parser)
+    doc = lxml.html.document_fromstring(
+        decoded_page.encode("utf-8", "replace"), parser=utf8_parser
+    )
     return doc, encoding
 
 
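For orientation, `build_doc` accepts either an already-decoded string (no charset detection) or raw bytes (charset guessed, undecodable bytes replaced). A minimal usage sketch; the `readability.htmls` import path is an assumption:

```python
# Hypothetical usage; module path assumed, adjust to where htmls.py lives.
from readability.htmls import build_doc

# str input: skips detection, the returned encoding is None
doc, encoding = build_doc(u"<html><title>Hello</title></html>")
assert encoding is None

# bytes input: get_encoding() guesses the charset (utf-8 fallback), and
# bad byte sequences are replaced rather than raising
doc, encoding = build_doc(b"<html><title>Caf\xc3\xa9</title></html>")
print(encoding, doc.findtext(".//title"))
```
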
 def js_re(src, pattern, flags, repl):
-    return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
+    return re.compile(pattern, flags).sub(src, repl.replace("$", "\\"))
 
 
 def normalize_entities(cur_title):
     entities = {
-        u'\u2014': '-',
-        u'\u2013': '-',
-        u'&mdash;': '-',
-        u'&ndash;': '-',
-        u'\u00A0': ' ',
-        u'\u00AB': '"',
-        u'\u00BB': '"',
-        u'&quot;': '"',
+        u"\u2014": "-",
+        u"\u2013": "-",
+        u"&mdash;": "-",
+        u"&ndash;": "-",
+        u"\u00A0": " ",
+        u"\u00AB": '"',
+        u"\u00BB": '"',
+        u"&quot;": '"',
     }
     for c, r in entities.items():
         if c in cur_title:
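
The tail of `normalize_entities` is elided by the hunk boundary; a minimal sketch of the intended effect, assuming the function is imported from this module:

```python
# Illustrative only: typographic dashes/quotes (and their literal HTML
# entity spellings) are folded to plain ASCII before titles are compared.
print(normalize_entities(u"Breaking \u2014 \u00abQuoted\u00bb headline"))
# -> 'Breaking - "Quoted" headline'
```
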
@@ -49,35 +51,44 @@ def norm_title(title):
 
 
 def get_title(doc):
-    title = doc.find('.//title')
+    title = doc.find(".//title")
     if title is None or title.text is None or len(title.text) == 0:
-        return '[no-title]'
+        return "[no-title]"
 
     return norm_title(title.text)
 
 
 def add_match(collection, text, orig):
     text = norm_title(text)
     if len(text.split()) >= 2 and len(text) >= 15:
-        if text.replace('"', '') in orig.replace('"', ''):
+        if text.replace('"', "") in orig.replace('"', ""):
             collection.add(text)
 
 
-TITLE_CSS_HEURISTICS = ['#title', '#head', '#heading', '.pageTitle',
-                        '.news_title', '.title', '.head', '.heading',
-                        '.contentheading', '.small_header_red']
+TITLE_CSS_HEURISTICS = [
+    "#title",
+    "#head",
+    "#heading",
+    ".pageTitle",
+    ".news_title",
+    ".title",
+    ".head",
+    ".heading",
+    ".contentheading",
+    ".small_header_red",
+]
 
 
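This selector list feeds the candidate search in `shorten_title`; the loop that consumes it falls in a stretch elided by the diff. A rough illustration of how the heuristics match, assuming the cssselect package (which lxml delegates CSS selectors to) is installed:

```python
import lxml.html

# Rough illustration: match the heuristic selectors against a fragment.
fragment = lxml.html.fromstring(
    '<div class="contentheading">Actual Headline Of The Story</div>'
)
for selector in TITLE_CSS_HEURISTICS:
    for e in fragment.cssselect(selector):
        print(selector + " -> " + e.text)
# -> .contentheading -> Actual Headline Of The Story
```
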
 def shorten_title(doc):
-    title = doc.find('.//title')
+    title = doc.find(".//title")
     if title is None or title.text is None or len(title.text) == 0:
-        return ''
+        return ""
 
     title = orig = norm_title(title.text)
 
     candidates = set()
 
-    for item in ['.//h1', './/h2', './/h3']:
+    for item in [".//h1", ".//h2", ".//h3"]:
         for e in list(doc.iterfind(item)):
             if e.text:
                 add_match(candidates, e.text, orig)
@@ -94,7 +105,7 @@ def shorten_title(doc):
     if candidates:
         title = sorted(candidates, key=len)[-1]
     else:
-        for delimiter in [' | ', ' - ', ' :: ', ' / ']:
+        for delimiter in [" | ", " - ", " :: ", " / "]:
             if delimiter in title:
                 parts = orig.split(delimiter)
                 if len(parts[0].split()) >= 4:
@@ -104,12 +115,12 @@ def shorten_title(doc):
                     title = parts[-1]
                     break
         else:
-            if ': ' in title:
-                parts = orig.split(': ')
+            if ": " in title:
+                parts = orig.split(": ")
                 if len(parts[-1].split()) >= 4:
                     title = parts[-1]
                 else:
-                    title = orig.split(': ', 1)[1]
+                    title = orig.split(": ", 1)[1]
 
     if not 15 < len(title) < 150:
         return orig
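
A worked sketch of the delimiter fallback, under the assumption that `shorten_title` is importable from this module and cssselect is available:

```python
import lxml.html

# No h1/h2/h3 or CSS-heuristic candidate matches, so the title splits on
# " | " and the left part wins because it has at least four words. The
# length gate above then accepts it: len("Four Word Long Headline") == 23,
# and 15 < 23 < 150.
doc = lxml.html.fromstring(
    "<html><head><title>Four Word Long Headline | Example Site</title>"
    "</head><body></body></html>"
)
print(shorten_title(doc))  # -> "Four Word Long Headline"
```
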
@@ -119,15 +130,15 @@ def shorten_title(doc):
 
 # is it necessary? Cleaner from LXML is initialized correctly in cleaners.py
 def get_body(doc):
-    for elem in doc.xpath('.//script | .//link | .//style'):
+    for elem in doc.xpath(".//script | .//link | .//style"):
         elem.drop_tree()
     # tostring() always return utf-8 encoded string
     # FIXME: isn't better to use tounicode?
     raw_html = str_(tostring(doc.body or doc))
     cleaned = clean_attributes(raw_html)
     try:
-        #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
+        # BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
         return cleaned
-    except Exception: #FIXME find the equivalent lxml error
-        #logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
+    except Exception:  # FIXME find the equivalent lxml error
+        # logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
         return raw_html
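
Finally, a short sketch of `get_body`'s effect, with the same assumed import path as above; the exact attribute filtering is decided by `clean_attributes` in cleaners.py:

```python
# Hypothetical usage: get_body drops script/link/style subtrees, then
# clean_attributes filters attributes on the serialized body.
doc, _ = build_doc(
    b"<html><body><script>x = 1</script><p>kept</p></body></html>"
)
print(get_body(doc))  # no <script> in the output; <p>kept</p> survives
```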