1111from w3lib .url import safe_url_string
1212from w3lib ._types import StrOrBytes
1313
14- _ent_re = re .compile (r'&((?P<named>[a-z\d]+)|#(?P<dec>\d+)|#x(?P<hex>[a-f\d]+))(?P<semicolon>;?)' , re .IGNORECASE )
15- _tag_re = re .compile (r'<[a-zA-Z\/!].*?>' , re .DOTALL )
16- _baseurl_re = re .compile (r'<base\s[^>]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']' , re .I )
17- _meta_refresh_re = re .compile (r'<meta\s[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)' , re .DOTALL | re .IGNORECASE )
18- _cdata_re = re .compile (r'((?P<cdata_s><!\[CDATA\[)(?P<cdata_d>.*?)(?P<cdata_e>\]\]>))' , re .DOTALL )
19-
20- HTML5_WHITESPACE = ' \t \n \r \x0c '
21-
22-
23- def replace_entities (text : AnyStr , keep : Iterable [str ] = (), remove_illegal : bool = True , encoding : str = 'utf-8' ) -> str :
14+ _ent_re = re .compile (
15+ r"&((?P<named>[a-z\d]+)|#(?P<dec>\d+)|#x(?P<hex>[a-f\d]+))(?P<semicolon>;?)" ,
16+ re .IGNORECASE ,
17+ )
18+ _tag_re = re .compile (r"<[a-zA-Z\/!].*?>" , re .DOTALL )
19+ _baseurl_re = re .compile (r"<base\s[^>]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']" , re .I )
20+ _meta_refresh_re = re .compile (
21+ r'<meta\s[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)' ,
22+ re .DOTALL | re .IGNORECASE ,
23+ )
24+ _cdata_re = re .compile (
25+ r"((?P<cdata_s><!\[CDATA\[)(?P<cdata_d>.*?)(?P<cdata_e>\]\]>))" , re .DOTALL
26+ )
27+
28+ HTML5_WHITESPACE = " \t \n \r \x0c "
29+
30+
31+ def replace_entities (
32+ text : AnyStr ,
33+ keep : Iterable [str ] = (),
34+ remove_illegal : bool = True ,
35+ encoding : str = "utf-8" ,
36+ ) -> str :
2437 """Remove entities from the given `text` by converting them to their
2538 corresponding unicode character.
2639
@@ -51,12 +64,12 @@ def replace_entities(text: AnyStr, keep: Iterable[str] = (), remove_illegal: boo
5164 def convert_entity (m : Match ) -> str :
5265 groups = m .groupdict ()
5366 number = None
54- if groups .get (' dec' ):
55- number = int (groups [' dec' ], 10 )
56- elif groups .get (' hex' ):
57- number = int (groups [' hex' ], 16 )
58- elif groups .get (' named' ):
59- entity_name = groups [' named' ]
67+ if groups .get (" dec" ):
68+ number = int (groups [" dec" ], 10 )
69+ elif groups .get (" hex" ):
70+ number = int (groups [" hex" ], 16 )
71+ elif groups .get (" named" ):
72+ entity_name = groups [" named" ]
6073 if entity_name .lower () in keep :
6174 return m .group (0 )
6275 else :
@@ -80,11 +93,12 @@ def convert_entity(m: Match) -> str:
8093
8194 return _ent_re .sub (convert_entity , to_unicode (text , encoding ))
8295
96+
def has_entities(text: AnyStr, encoding: Optional[str] = None) -> bool:
    """Return ``True`` if *text* contains at least one HTML character
    reference (named, decimal, or hexadecimal), ``False`` otherwise.

    *text* may be ``str`` or ``bytes``; bytes are decoded with *encoding*
    (presumably defaulting to UTF-8 inside ``to_unicode`` — defined
    elsewhere in this module's package).
    """
    return bool(_ent_re.search(to_unicode(text, encoding)))
8599
86100
87- def replace_tags (text : AnyStr , token : str = '' , encoding : Optional [str ] = None ) -> str :
101+ def replace_tags (text : AnyStr , token : str = "" , encoding : Optional [str ] = None ) -> str :
88102 """Replace all markup tags found in the given `text` by the given token.
89103 By default `token` is an empty string so it just removes all tags.
90104
@@ -107,11 +121,11 @@ def replace_tags(text: AnyStr, token: str = '', encoding: Optional[str] = None)
107121 return _tag_re .sub (token , to_unicode (text , encoding ))
108122
109123
110- _REMOVECOMMENTS_RE = re .compile (' <!--.*?(?:-->|$)' , re .DOTALL )
124+ _REMOVECOMMENTS_RE = re .compile (" <!--.*?(?:-->|$)" , re .DOTALL )
111125
112126
def remove_comments(text: AnyStr, encoding: Optional[str] = None) -> str:
    """Remove HTML comments.

    *text* may be ``str`` or ``bytes``; bytes are decoded with *encoding*
    via ``to_unicode`` (defined elsewhere in this module's package).

    >>> import w3lib.html
    >>> w3lib.html.remove_comments(b"test <!--textcoment--> whatever")
    'test  whatever'

    """

    utext = to_unicode(text, encoding)
    # Strips every comment, including an unterminated trailing one.
    return _REMOVECOMMENTS_RE.sub("", utext)
139+
125140
126- def remove_tags (text : AnyStr , which_ones : Iterable [str ] = (), keep : Iterable [str ] = (), encoding : Optional [str ] = None ) -> str :
127- """ Remove HTML Tags only.
141+ def remove_tags (
142+ text : AnyStr ,
143+ which_ones : Iterable [str ] = (),
144+ keep : Iterable [str ] = (),
145+ encoding : Optional [str ] = None ,
146+ ) -> str :
147+ """Remove HTML Tags only.
128148
129149 `which_ones` and `keep` are both tuples, there are four cases:
130150
@@ -190,7 +210,9 @@ def remove_tag(m: Match) -> str:
190210 return retags .sub (remove_tag , to_unicode (text , encoding ))
191211
192212
193- def remove_tags_with_content (text : AnyStr , which_ones : Iterable [str ] = (), encoding : Optional [str ] = None ) -> str :
213+ def remove_tags_with_content (
214+ text : AnyStr , which_ones : Iterable [str ] = (), encoding : Optional [str ] = None
215+ ) -> str :
194216 """Remove tags and their content.
195217
196218 `which_ones` is a tuple of which tags to remove including their content.
@@ -210,12 +232,16 @@ def remove_tags_with_content(text: AnyStr, which_ones: Iterable[str] = (), encod
210232 [r"<%s\b.*?</%s>|<%s\s*/>" % (tag , tag , tag ) for tag in which_ones ]
211233 )
212234 retags = re .compile (tags , re .DOTALL | re .IGNORECASE )
213- utext = retags .sub ('' , utext )
235+ utext = retags .sub ("" , utext )
214236 return utext
215237
216238
217- def replace_escape_chars (text : AnyStr , which_ones : Iterable [str ] = ('\n ' , '\t ' , '\r ' ), replace_by : StrOrBytes = '' , \
218- encoding : Optional [str ] = None ) -> str :
239+ def replace_escape_chars (
240+ text : AnyStr ,
241+ which_ones : Iterable [str ] = ("\n " , "\t " , "\r " ),
242+ replace_by : StrOrBytes = "" ,
243+ encoding : Optional [str ] = None ,
244+ ) -> str :
219245 """Remove escape characters.
220246
221247 `which_ones` is a tuple of which escape characters we want to remove.
@@ -232,7 +258,12 @@ def replace_escape_chars(text: AnyStr, which_ones: Iterable[str] = ('\n', '\t',
232258 return utext
233259
234260
235- def unquote_markup (text : AnyStr , keep : Iterable [str ] = (), remove_illegal : bool = True , encoding : Optional [str ] = None ) -> str :
261+ def unquote_markup (
262+ text : AnyStr ,
263+ keep : Iterable [str ] = (),
264+ remove_illegal : bool = True ,
265+ encoding : Optional [str ] = None ,
266+ ) -> str :
236267 """
237268 This function receives markup as a text (always a unicode string or
238269 a UTF-8 encoded string) and does the following:
@@ -254,7 +285,7 @@ def _get_fragments(txt: str, pattern: Pattern) -> Iterable[Union[str, Match]]:
254285 yield txt [offset :]
255286
256287 utext = to_unicode (text , encoding )
257- ret_text = ''
288+ ret_text = ""
258289 for fragment in _get_fragments (utext , _cdata_re ):
259290 if isinstance (fragment , str ):
260291 # it's not a CDATA (so we try to remove its entities)
@@ -266,7 +297,10 @@ def _get_fragments(txt: str, pattern: Pattern) -> Iterable[Union[str, Match]]:
266297 ret_text += fragment .group ("cdata_d" )
267298 return ret_text
268299
269- def get_base_url (text : AnyStr , baseurl : StrOrBytes = '' , encoding : str = 'utf-8' ) -> str :
300+
301+ def get_base_url (
302+ text : AnyStr , baseurl : StrOrBytes = "" , encoding : str = "utf-8"
303+ ) -> str :
270304 """Return the base url if declared in the given HTML `text`,
271305 relative to the given base url.
272306
@@ -284,7 +318,12 @@ def get_base_url(text: AnyStr, baseurl: StrOrBytes = '', encoding: str = 'utf-8'
284318 return safe_url_string (baseurl )
285319
286320
287- def get_meta_refresh (text : AnyStr , baseurl : str = '' , encoding : str = 'utf-8' , ignore_tags : Iterable [str ] = ('script' , 'noscript' )) -> Tuple [Optional [float ], Optional [str ]]:
321+ def get_meta_refresh (
322+ text : AnyStr ,
323+ baseurl : str = "" ,
324+ encoding : str = "utf-8" ,
325+ ignore_tags : Iterable [str ] = ("script" , "noscript" ),
326+ ) -> Tuple [Optional [float ], Optional [str ]]:
288327 """Return the http-equiv parameter of the HTML meta element from the given
289328 HTML text and return a tuple ``(interval, url)`` where interval is an integer
290329 containing the delay in seconds (or zero if not present) and url is a
0 commit comments