 from .encoding import get_encoding
 from .compat import str_
 
-utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
+utf8_parser = lxml.html.HTMLParser(encoding="utf-8")
 
 
 def build_doc(page):
     if isinstance(page, str_):
         encoding = None
         decoded_page = page
     else:
-        encoding = get_encoding(page) or 'utf-8'
-        decoded_page = page.decode(encoding, 'replace')
+        encoding = get_encoding(page) or "utf-8"
+        decoded_page = page.decode(encoding, "replace")
 
     # XXX: we have to do .decode and .encode even for utf-8 pages to remove bad characters
-    doc = lxml.html.document_fromstring(decoded_page.encode('utf-8', 'replace'), parser=utf8_parser)
+    doc = lxml.html.document_fromstring(
+        decoded_page.encode("utf-8", "replace"), parser=utf8_parser
+    )
     return doc, encoding
 
 
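For orientation, `build_doc` accepts either an already-decoded string (no charset detection) or raw bytes (charset guessed, undecodable bytes replaced). A minimal usage sketch; the `readability.htmls` import path is an assumption:

```python
# Hypothetical usage; module path assumed, adjust to where htmls.py lives.
from readability.htmls import build_doc

# str input: skips detection, the returned encoding is None
doc, encoding = build_doc(u"<html><title>Hello</title></html>")
assert encoding is None

# bytes input: get_encoding() guesses the charset (utf-8 fallback), and
# bad byte sequences are replaced rather than raising
doc, encoding = build_doc(b"<html><title>Caf\xc3\xa9</title></html>")
print(encoding, doc.findtext(".//title"))
```
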
 def js_re(src, pattern, flags, repl):
-    return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
+    return re.compile(pattern, flags).sub(src, repl.replace("$", "\\"))
 
 
 def normalize_entities(cur_title):
     entities = {
-        u'\u2014': '-',
-        u'\u2013': '-',
-        u'&mdash;': '-',
-        u'&ndash;': '-',
-        u'\u00A0': ' ',
-        u'\u00AB': '"',
-        u'\u00BB': '"',
-        u'&quot;': '"',
+        u"\u2014": "-",
+        u"\u2013": "-",
+        u"&mdash;": "-",
+        u"&ndash;": "-",
+        u"\u00A0": " ",
+        u"\u00AB": '"',
+        u"\u00BB": '"',
+        u"&quot;": '"',
     }
     for c, r in entities.items():
         if c in cur_title:
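
The tail of `normalize_entities` is elided by the hunk boundary; a minimal sketch of the intended effect, assuming the function is imported from this module:

```python
# Illustrative only: typographic dashes/quotes (and their literal HTML
# entity spellings) are folded to plain ASCII before titles are compared.
print(normalize_entities(u"Breaking \u2014 \u00abQuoted\u00bb headline"))
# -> 'Breaking - "Quoted" headline'
```
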
@@ -49,35 +51,44 @@ def norm_title(title):
 
 
 def get_title(doc):
-    title = doc.find('.//title')
+    title = doc.find(".//title")
     if title is None or title.text is None or len(title.text) == 0:
-        return '[no-title]'
+        return "[no-title]"
 
     return norm_title(title.text)
 
 
 def add_match(collection, text, orig):
     text = norm_title(text)
     if len(text.split()) >= 2 and len(text) >= 15:
-        if text.replace('"', '') in orig.replace('"', ''):
+        if text.replace('"', "") in orig.replace('"', ""):
             collection.add(text)
 
 
-TITLE_CSS_HEURISTICS = ['#title', '#head', '#heading', '.pageTitle',
-                        '.news_title', '.title', '.head', '.heading',
-                        '.contentheading', '.small_header_red']
+TITLE_CSS_HEURISTICS = [
+    "#title",
+    "#head",
+    "#heading",
+    ".pageTitle",
+    ".news_title",
+    ".title",
+    ".head",
+    ".heading",
+    ".contentheading",
+    ".small_header_red",
+]
 
 
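This selector list feeds the candidate search in `shorten_title`; the loop that consumes it falls in a stretch elided by the diff. A rough illustration of how the heuristics match, assuming the cssselect package (which lxml delegates CSS selectors to) is installed:

```python
import lxml.html

# Rough illustration: match the heuristic selectors against a fragment.
fragment = lxml.html.fromstring(
    '<div class="contentheading">Actual Headline Of The Story</div>'
)
for selector in TITLE_CSS_HEURISTICS:
    for e in fragment.cssselect(selector):
        print(selector + " -> " + e.text)
# -> .contentheading -> Actual Headline Of The Story
```
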
 def shorten_title(doc):
-    title = doc.find('.//title')
+    title = doc.find(".//title")
     if title is None or title.text is None or len(title.text) == 0:
-        return ''
+        return ""
 
     title = orig = norm_title(title.text)
 
     candidates = set()
 
-    for item in ['.//h1', './/h2', './/h3']:
+    for item in [".//h1", ".//h2", ".//h3"]:
         for e in list(doc.iterfind(item)):
             if e.text:
                 add_match(candidates, e.text, orig)
@@ -94,7 +105,7 @@ def shorten_title(doc):
     if candidates:
         title = sorted(candidates, key=len)[-1]
     else:
-        for delimiter in [' | ', ' - ', ' :: ', ' / ']:
+        for delimiter in [" | ", " - ", " :: ", " / "]:
             if delimiter in title:
                 parts = orig.split(delimiter)
                 if len(parts[0].split()) >= 4:
@@ -104,12 +115,12 @@ def shorten_title(doc):
                     title = parts[-1]
                     break
         else:
-            if ': ' in title:
-                parts = orig.split(': ')
+            if ": " in title:
+                parts = orig.split(": ")
                 if len(parts[-1].split()) >= 4:
                     title = parts[-1]
                 else:
-                    title = orig.split(': ', 1)[1]
+                    title = orig.split(": ", 1)[1]
 
     if not 15 < len(title) < 150:
         return orig
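
A worked sketch of the delimiter fallback, under the assumption that `shorten_title` is importable from this module and cssselect is available:

```python
import lxml.html

# No h1/h2/h3 or CSS-heuristic candidate matches, so the title splits on
# " | " and the left part wins because it has at least four words. The
# length gate above then accepts it: len("Four Word Long Headline") == 23,
# and 15 < 23 < 150.
doc = lxml.html.fromstring(
    "<html><head><title>Four Word Long Headline | Example Site</title>"
    "</head><body></body></html>"
)
print(shorten_title(doc))  # -> "Four Word Long Headline"
```
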
@@ -119,15 +130,15 @@ def shorten_title(doc):
 
 # is it necessary? Cleaner from LXML is initialized correctly in cleaners.py
 def get_body(doc):
-    for elem in doc.xpath('.//script | .//link | .//style'):
+    for elem in doc.xpath(".//script | .//link | .//style"):
         elem.drop_tree()
     # tostring() always return utf-8 encoded string
     # FIXME: isn't better to use tounicode?
     raw_html = str_(tostring(doc.body or doc))
     cleaned = clean_attributes(raw_html)
     try:
-        #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
+        # BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
         return cleaned
-    except Exception: #FIXME find the equivalent lxml error
-        #logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
+    except Exception:  # FIXME find the equivalent lxml error
+        # logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
         return raw_html
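
Finally, a short sketch of `get_body`'s effect, with the same assumed import path as above; the exact attribute filtering is decided by `clean_attributes` in cleaners.py:

```python
# Hypothetical usage: get_body drops script/link/style subtrees, then
# clean_attributes filters attributes on the serialized body.
doc, _ = build_doc(
    b"<html><body><script>x = 1</script><p>kept</p></body></html>"
)
print(get_body(doc))  # no <script> in the output; <p>kept</p> survives
```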