2323log = logging .getLogger ("readability.readability" )
2424
2525REGEXES = {
26- 'unlikelyCandidatesRe' : re .compile ('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter' , re .I ),
27- 'okMaybeItsACandidateRe' : re .compile ('and|article|body|column|main|shadow' , re .I ),
28- 'positiveRe' : re .compile ('article|body|content|entry|hentry|main|page|pagination|post|text|blog|story' , re .I ),
29- 'negativeRe' : re .compile ('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget' , re .I ),
30- 'divToPElementsRe' : re .compile ('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)' , re .I ),
31- #'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
32- #'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
33- #'trimRe': re.compile('^\s+|\s+$/'),
34- #'normalizeRe': re.compile('\s{2,}/'),
35- #'killBreaksRe': re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
36- 'videoRe' : re .compile ('https?:\/\/(www\.)?(youtube|vimeo)\.com' , re .I ),
26+ 'unlikelyCandidatesRe' : re .compile (r'combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter' , re .I ),
27+ 'okMaybeItsACandidateRe' : re .compile (r'and|article|body|column|main|shadow' , re .I ),
28+ 'positiveRe' : re .compile (r'article|body|content|entry|hentry|main|page|pagination|post|text|blog|story' , re .I ),
29+ 'negativeRe' : re .compile (r'combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget' , re .I ),
30+ 'divToPElementsRe' : re .compile (r'<(a|blockquote|dl|div|img|ol|p|pre|table|ul)' , re .I ),
31+ #'replaceBrsRe': re.compile(r'(<br[^>]*>[ \n\r\t]*){2,}',re.I),
32+ #'replaceFontsRe': re.compile(r'<(\/?)font[^>]*>',re.I),
33+ #'trimRe': re.compile(r'^\s+|\s+$/'),
34+ #'normalizeRe': re.compile(r'\s{2,}/'),
35+ #'killBreaksRe': re.compile(r'(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
36+ 'videoRe' : re .compile (r'https?:\/\/(www\.)?(youtube|vimeo)\.com' , re .I ),
3737 #skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
3838}
3939
@@ -57,8 +57,8 @@ def clean(text):
5757 # Many spaces make the following regexes run forever
5858 text = re .sub (r'\s{255,}' , ' ' * 255 , text )
5959
60- text = re .sub ('\s*\n\s*' , '\n' , text )
61- text = re .sub ('\t|[ \t]{2,}' , ' ' , text )
60+ text = re .sub (r'\s*\n\s*' , '\n' , text )
61+ text = re .sub (r'\t|[ \t]{2,}' , ' ' , text )
6262 return text .strip ()
6363
6464
@@ -271,7 +271,7 @@ def get_article(self, candidates, best_candidate, html_partial=False):
271271 append = True
272272 elif node_length <= 80 \
273273 and link_density == 0 \
274- and re .search ('\.( |$)' , node_content ):
274+ and re .search (r'\.( |$)' , node_content ):
275275 append = True
276276
277277 if append :
0 commit comments