
Commit 5fae2b9 ("black")

1 parent: 78c82fd

8 files changed: +195, -102 lines

tests/test_encoding.py
Lines changed: 1 addition & 1 deletion

@@ -40,7 +40,7 @@ def test_bom(self):
         bom_encoding, bom = read_bom(string)
         assert bom_encoding is not None
         assert bom is not None
-        decoded = string[len(bom):].decode(bom_encoding)
+        decoded = string[len(bom) :].decode(bom_encoding)
         self.assertEqual(water_unicode, decoded)
         # Body without BOM
         enc, bom = read_bom(b"foo")
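
The only change here is Black's whitespace inside the slice; behavior is untouched. As a doctest-style sketch of what the test exercises (the exact return values are an assumption based on the BOM handling shown in w3lib/encoding.py below):

    >>> import codecs
    >>> from w3lib.encoding import read_bom
    >>> read_bom(codecs.BOM_UTF8 + b"water")  # returns (encoding name, raw BOM bytes)
    ('utf-8', b'\xef\xbb\xbf')
    >>> read_bom(b"foo")  # no BOM present
    (None, None)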

tests/test_http.py
Lines changed: 14 additions & 12 deletions

@@ -1,7 +1,11 @@
 import unittest
 from collections import OrderedDict
-from w3lib.http import (HeadersDictInput, basic_auth_header,
-                        headers_dict_to_raw, headers_raw_to_dict)
+from w3lib.http import (
+    HeadersDictInput,
+    basic_auth_header,
+    headers_dict_to_raw,
+    headers_raw_to_dict,
+)
 
 __doctests__ = ["w3lib.http"]  # for trial support
 
@@ -48,10 +52,9 @@ def test_headers_dict_to_raw(self):
         )
 
     def test_headers_dict_to_raw_listtuple(self):
-        dct: HeadersDictInput = OrderedDict([
-            (b'Content-type', [b'text/html']),
-            (b'Accept', [b'gzip'])
-        ])
+        dct: HeadersDictInput = OrderedDict(
+            [(b"Content-type", [b"text/html"]), (b"Accept", [b"gzip"])]
+        )
         self.assertEqual(
             headers_dict_to_raw(dct), b"Content-type: text/html\r\nAccept: gzip"
         )
@@ -74,14 +77,13 @@ def test_headers_dict_to_raw_listtuple(self):
         )
 
     def test_headers_dict_to_raw_wrong_values(self):
-        dct: HeadersDictInput = OrderedDict([
-            (b'Content-type', 0),
-        ])
-        self.assertEqual(
-            headers_dict_to_raw(dct),
-            b''
+        dct: HeadersDictInput = OrderedDict(
+            [
+                (b"Content-type", 0),
+            ]
         )
         self.assertEqual(headers_dict_to_raw(dct), b"")
+        self.assertEqual(headers_dict_to_raw(dct), b"")
 
         dct = OrderedDict([(b"Content-type", 1), (b"Accept", [b"gzip"])])
         self.assertEqual(headers_dict_to_raw(dct), b"Accept: gzip")

tests/test_url.py
Lines changed: 19 additions & 11 deletions

@@ -506,10 +506,10 @@ def test_add_or_replace_parameters(self):
         )
 
     def test_add_or_replace_parameters_does_not_change_input_param(self):
-        url = 'http://domain/test?arg=original'
-        input_param = {'arg': 'value'}
+        url = "http://domain/test?arg=original"
+        input_param = {"arg": "value"}
         add_or_replace_parameters(url, input_param)  # noqa
-        self.assertEqual(input_param, {'arg': 'value'})
+        self.assertEqual(input_param, {"arg": "value"})
 
     def test_url_query_cleaner(self):
         self.assertEqual("product.html", url_query_cleaner("product.html?"))
@@ -814,17 +814,25 @@ def test_normalize_percent_encoding_in_query_arguments(self):
         )
 
     def test_non_ascii_percent_encoding_in_paths(self):
-        self.assertEqual(canonicalize_url("http://www.example.com/a do?a=1"),
-                         "http://www.example.com/a%20do?a=1")
+        self.assertEqual(
+            canonicalize_url("http://www.example.com/a do?a=1"),
+            "http://www.example.com/a%20do?a=1",
+        )
 
-        self.assertEqual(canonicalize_url("http://www.example.com/a %20do?a=1"),
-                         "http://www.example.com/a%20%20do?a=1")
+        self.assertEqual(
+            canonicalize_url("http://www.example.com/a %20do?a=1"),
+            "http://www.example.com/a%20%20do?a=1",
+        )
 
-        self.assertEqual(canonicalize_url("http://www.example.com/a do£.html?a=1"),
-                         "http://www.example.com/a%20do%C2%A3.html?a=1")
+        self.assertEqual(
+            canonicalize_url("http://www.example.com/a do£.html?a=1"),
+            "http://www.example.com/a%20do%C2%A3.html?a=1",
+        )
 
-        self.assertEqual(canonicalize_url(b"http://www.example.com/a do\xc2\xa3.html?a=1"),
-                         "http://www.example.com/a%20do%C2%A3.html?a=1")
+        self.assertEqual(
+            canonicalize_url(b"http://www.example.com/a do\xc2\xa3.html?a=1"),
+            "http://www.example.com/a%20do%C2%A3.html?a=1",
+        )
 
     def test_non_ascii_percent_encoding_in_query_arguments(self):
         self.assertEqual(
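
The canonicalization behavior covered by the rewrapped assertions, doctest-style (expected values taken directly from the test above): spaces and non-ASCII path characters are percent-encoded, with non-ASCII bytes encoded as UTF-8 first.

    >>> from w3lib.url import canonicalize_url
    >>> canonicalize_url("http://www.example.com/a do?a=1")
    'http://www.example.com/a%20do?a=1'
    >>> canonicalize_url("http://www.example.com/a do£.html?a=1")
    'http://www.example.com/a%20do%C2%A3.html?a=1'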

w3lib/encoding.py
Lines changed: 15 additions & 7 deletions

@@ -26,6 +26,7 @@ def http_content_type_encoding(content_type: Optional[str]) -> Optional[str]:
 
     return None
 
+
 # regexp for parsing HTTP meta tags
 _TEMPLATE = r"""%s\s*=\s*["']?\s*%s\s*["']?"""
 _SKIP_ATTRS = """(?:\\s+
@@ -124,6 +125,7 @@ def html_body_declared_encoding(html_body_str: StrOrBytes) -> Optional[str]:
     "zh_cn": "gb18030",
 }
 
+
 def _c18n_encoding(encoding: str) -> str:
     """Canonicalize an encoding name
 
@@ -195,7 +197,9 @@ def read_bom(data: bytes) -> Union[Tuple[None, None], Tuple[str, bytes]]:
 
 # Python decoder doesn't follow unicode standard when handling
 # bad utf-8 encoded strings. see http://bugs.python.org/issue8271
-codecs.register_error('w3lib_replace', lambda exc: ('\ufffd', cast(AnyUnicodeError, exc).end))
+codecs.register_error(
+    "w3lib_replace", lambda exc: ("\ufffd", cast(AnyUnicodeError, exc).end)
+)
 
 
 def to_unicode(data_str: bytes, encoding: str) -> str:
@@ -209,8 +213,12 @@ def to_unicode(data_str: bytes, encoding: str) -> str:
     )
 
 
-def html_to_unicode(content_type_header: Optional[str], html_body_str: bytes,
-                    default_encoding: str = 'utf8', auto_detect_fun: Optional[Callable[[bytes], str]] = None) -> Tuple[str, str]:
+def html_to_unicode(
+    content_type_header: Optional[str],
+    html_body_str: bytes,
+    default_encoding: str = "utf8",
+    auto_detect_fun: Optional[Callable[[bytes], str]] = None,
+) -> Tuple[str, str]:
     r'''Convert raw html bytes to unicode
 
     This attempts to make a reasonable guess at the content encoding of the
@@ -279,20 +287,20 @@ def html_to_unicode(content_type_header: Optional[str], html_body_str: bytes,
         # remove BOM if it agrees with the encoding
         if enc == bom_enc:
             bom = cast(bytes, bom)
-            html_body_str = html_body_str[len(bom):]
-        elif enc == 'utf-16' or enc == 'utf-32':
+            html_body_str = html_body_str[len(bom) :]
+        elif enc == "utf-16" or enc == "utf-32":
             # read endianness from BOM, or default to big endian
             # tools.ietf.org/html/rfc2781 section 4.3
             if bom_enc is not None and bom_enc.startswith(enc):
                 enc = bom_enc
                 bom = cast(bytes, bom)
-                html_body_str = html_body_str[len(bom):]
+                html_body_str = html_body_str[len(bom) :]
             else:
                 enc += "-be"
         return enc, to_unicode(html_body_str, enc)
     if bom_enc is not None:
         bom = cast(bytes, bom)
-        return bom_enc, to_unicode(html_body_str[len(bom):], bom_enc)
+        return bom_enc, to_unicode(html_body_str[len(bom) :], bom_enc)
     enc = html_body_declared_encoding(html_body_str)
     if enc is None and (auto_detect_fun is not None):
         enc = auto_detect_fun(html_body_str)
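
All hunks here are Black line splits; in particular the w3lib_replace error handler is unchanged: per the lambda above, it substitutes U+FFFD for undecodable input and resumes at exc.end. A minimal doctest-style sketch, assuming the handler is registered as a side effect of importing the module and that the BOM branch shown above returns the BOM-declared encoding with the BOM stripped:

    >>> import codecs
    >>> import w3lib.encoding  # registers the "w3lib_replace" error handler at import time
    >>> b"a\xffb".decode("utf-8", errors="w3lib_replace")
    'a\ufffdb'
    >>> w3lib.encoding.html_to_unicode(None, codecs.BOM_UTF8 + b"body")
    ('utf-8', 'body')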

w3lib/html.py
Lines changed: 69 additions & 30 deletions

@@ -11,16 +11,29 @@
 from w3lib.url import safe_url_string
 from w3lib._types import StrOrBytes
 
-_ent_re = re.compile(r'&((?P<named>[a-z\d]+)|#(?P<dec>\d+)|#x(?P<hex>[a-f\d]+))(?P<semicolon>;?)', re.IGNORECASE)
-_tag_re = re.compile(r'<[a-zA-Z\/!].*?>', re.DOTALL)
-_baseurl_re = re.compile(r'<base\s[^>]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']', re.I)
-_meta_refresh_re = re.compile(r'<meta\s[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)', re.DOTALL | re.IGNORECASE)
-_cdata_re = re.compile(r'((?P<cdata_s><!\[CDATA\[)(?P<cdata_d>.*?)(?P<cdata_e>\]\]>))', re.DOTALL)
-
-HTML5_WHITESPACE = ' \t\n\r\x0c'
-
-
-def replace_entities(text: AnyStr, keep: Iterable[str] = (), remove_illegal: bool = True, encoding: str = 'utf-8') -> str:
+_ent_re = re.compile(
+    r"&((?P<named>[a-z\d]+)|#(?P<dec>\d+)|#x(?P<hex>[a-f\d]+))(?P<semicolon>;?)",
+    re.IGNORECASE,
+)
+_tag_re = re.compile(r"<[a-zA-Z\/!].*?>", re.DOTALL)
+_baseurl_re = re.compile(r"<base\s[^>]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']", re.I)
+_meta_refresh_re = re.compile(
+    r'<meta\s[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)',
+    re.DOTALL | re.IGNORECASE,
+)
+_cdata_re = re.compile(
+    r"((?P<cdata_s><!\[CDATA\[)(?P<cdata_d>.*?)(?P<cdata_e>\]\]>))", re.DOTALL
+)
+
+HTML5_WHITESPACE = " \t\n\r\x0c"
+
+
+def replace_entities(
+    text: AnyStr,
+    keep: Iterable[str] = (),
+    remove_illegal: bool = True,
+    encoding: str = "utf-8",
+) -> str:
     """Remove entities from the given `text` by converting them to their
     corresponding unicode character.
 
@@ -51,12 +64,12 @@ def replace_entities(text: AnyStr, keep: Iterable[str] = (), remove_illegal: boo
     def convert_entity(m: Match) -> str:
         groups = m.groupdict()
         number = None
-        if groups.get('dec'):
-            number = int(groups['dec'], 10)
-        elif groups.get('hex'):
-            number = int(groups['hex'], 16)
-        elif groups.get('named'):
-            entity_name = groups['named']
+        if groups.get("dec"):
+            number = int(groups["dec"], 10)
+        elif groups.get("hex"):
+            number = int(groups["hex"], 16)
+        elif groups.get("named"):
+            entity_name = groups["named"]
             if entity_name.lower() in keep:
                 return m.group(0)
             else:
@@ -80,11 +93,12 @@ def convert_entity(m: Match) -> str:
 
     return _ent_re.sub(convert_entity, to_unicode(text, encoding))
 
+
 def has_entities(text: AnyStr, encoding: Optional[str] = None) -> bool:
     return bool(_ent_re.search(to_unicode(text, encoding)))
 
 
-def replace_tags(text: AnyStr, token: str = '', encoding: Optional[str] = None) -> str:
+def replace_tags(text: AnyStr, token: str = "", encoding: Optional[str] = None) -> str:
     """Replace all markup tags found in the given `text` by the given token.
     By default `token` is an empty string so it just removes all tags.
 
@@ -107,11 +121,11 @@ def replace_tags(text: AnyStr, token: str = '', encoding: Optional[str] = None)
     return _tag_re.sub(token, to_unicode(text, encoding))
 
 
-_REMOVECOMMENTS_RE = re.compile('<!--.*?(?:-->|$)', re.DOTALL)
+_REMOVECOMMENTS_RE = re.compile("<!--.*?(?:-->|$)", re.DOTALL)
 
 
 def remove_comments(text: AnyStr, encoding: Optional[str] = None) -> str:
-    """ Remove HTML Comments.
+    """Remove HTML Comments.
 
     >>> import w3lib.html
     >>> w3lib.html.remove_comments(b"test <!--textcoment--> whatever")
@@ -121,10 +135,16 @@ def remove_comments(text: AnyStr, encoding: Optional[str] = None) -> str:
     """
 
     utext = to_unicode(text, encoding)
-    return _REMOVECOMMENTS_RE.sub('', utext)
+    return _REMOVECOMMENTS_RE.sub("", utext)
+
 
-def remove_tags(text: AnyStr, which_ones: Iterable[str] = (), keep: Iterable[str] = (), encoding: Optional[str] = None) -> str:
-    """ Remove HTML Tags only.
+def remove_tags(
+    text: AnyStr,
+    which_ones: Iterable[str] = (),
+    keep: Iterable[str] = (),
+    encoding: Optional[str] = None,
+) -> str:
+    """Remove HTML Tags only.
 
     `which_ones` and `keep` are both tuples, there are four cases:
 
@@ -190,7 +210,9 @@ def remove_tag(m: Match) -> str:
     return retags.sub(remove_tag, to_unicode(text, encoding))
 
 
-def remove_tags_with_content(text: AnyStr, which_ones: Iterable[str] = (), encoding: Optional[str] = None) -> str:
+def remove_tags_with_content(
+    text: AnyStr, which_ones: Iterable[str] = (), encoding: Optional[str] = None
+) -> str:
     """Remove tags and their content.
 
     `which_ones` is a tuple of which tags to remove including their content.
@@ -210,12 +232,16 @@ def remove_tags_with_content(text: AnyStr, which_ones: Iterable[str] = (), encod
         [r"<%s\b.*?</%s>|<%s\s*/>" % (tag, tag, tag) for tag in which_ones]
     )
     retags = re.compile(tags, re.DOTALL | re.IGNORECASE)
-    utext = retags.sub('', utext)
+    utext = retags.sub("", utext)
     return utext
 
 
-def replace_escape_chars(text: AnyStr, which_ones: Iterable[str] = ('\n', '\t', '\r'), replace_by: StrOrBytes = '', \
-                         encoding: Optional[str] = None) -> str:
+def replace_escape_chars(
+    text: AnyStr,
+    which_ones: Iterable[str] = ("\n", "\t", "\r"),
+    replace_by: StrOrBytes = "",
+    encoding: Optional[str] = None,
+) -> str:
     """Remove escape characters.
 
     `which_ones` is a tuple of which escape characters we want to remove.
@@ -232,7 +258,12 @@ def replace_escape_chars(text: AnyStr, which_ones: Iterable[str] = ('\n', '\t',
     return utext
 
 
-def unquote_markup(text: AnyStr, keep: Iterable[str] = (), remove_illegal: bool = True, encoding: Optional[str] = None) -> str:
+def unquote_markup(
+    text: AnyStr,
+    keep: Iterable[str] = (),
+    remove_illegal: bool = True,
+    encoding: Optional[str] = None,
+) -> str:
     """
     This function receives markup as a text (always a unicode string or
     a UTF-8 encoded string) and does the following:
@@ -254,7 +285,7 @@ def _get_fragments(txt: str, pattern: Pattern) -> Iterable[Union[str, Match]]:
         yield txt[offset:]
 
     utext = to_unicode(text, encoding)
-    ret_text = ''
+    ret_text = ""
     for fragment in _get_fragments(utext, _cdata_re):
         if isinstance(fragment, str):
             # it's not a CDATA (so we try to remove its entities)
@@ -266,7 +297,10 @@ def _get_fragments(txt: str, pattern: Pattern) -> Iterable[Union[str, Match]]:
             ret_text += fragment.group("cdata_d")
     return ret_text
 
-def get_base_url(text: AnyStr, baseurl: StrOrBytes = '', encoding: str = 'utf-8') -> str:
+
+def get_base_url(
+    text: AnyStr, baseurl: StrOrBytes = "", encoding: str = "utf-8"
+) -> str:
     """Return the base url if declared in the given HTML `text`,
     relative to the given base url.
 
@@ -284,7 +318,12 @@ def get_base_url(text: AnyStr, baseurl: StrOrBytes = '', encoding: str = 'utf-8'
     return safe_url_string(baseurl)
 
 
-def get_meta_refresh(text: AnyStr, baseurl: str = '', encoding: str = 'utf-8', ignore_tags: Iterable[str] = ('script', 'noscript')) -> Tuple[Optional[float], Optional[str]]:
+def get_meta_refresh(
+    text: AnyStr,
+    baseurl: str = "",
+    encoding: str = "utf-8",
+    ignore_tags: Iterable[str] = ("script", "noscript"),
+) -> Tuple[Optional[float], Optional[str]]:
     """Return the http-equiv parameter of the HTML meta element from the given
     HTML text and return a tuple ``(interval, url)`` where interval is an integer
     containing the delay in seconds (or zero if not present) and url is a
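
Every hunk in this file is signature and regex rewrapping plus quote normalization; the docstring doctests above (e.g. for remove_comments) continue to pin behavior. Two doctest-style examples of functions whose signatures were rewrapped, using default arguments; the first mirrors the library's documented entity-replacement behavior, the second assumes default remove_tags semantics (strip all tags, keep text):

    >>> from w3lib.html import replace_entities, remove_tags
    >>> replace_entities(b"Price: &pound;100")
    'Price: £100'
    >>> remove_tags("<p>Hello</p>")
    'Hello'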

w3lib/http.py
Lines changed: 4 additions & 2 deletions

@@ -83,7 +83,9 @@ def headers_dict_to_raw(headers_dict: Optional[HeadersDictInput]) -> Optional[by
     return b"\r\n".join(raw_lines)
 
 
-def basic_auth_header(username: AnyStr, password: AnyStr, encoding: str = 'ISO-8859-1') -> bytes:
+def basic_auth_header(
+    username: AnyStr, password: AnyStr, encoding: str = "ISO-8859-1"
+) -> bytes:
     """
     Return an `Authorization` header field value for `HTTP Basic Access Authentication (RFC 2617)`_
 
@@ -99,4 +101,4 @@ def basic_auth_header(username: AnyStr, password: AnyStr, encoding: str = 'ISO-8
     # XXX: RFC 2617 doesn't define encoding, but ISO-8859-1
     # seems to be the most widely used encoding here. See also:
     # http://greenbytes.de/tech/webdav/draft-ietf-httpauth-basicauth-enc-latest.html
-    return b'Basic ' + urlsafe_b64encode(to_bytes(auth, encoding=encoding))
+    return b"Basic " + urlsafe_b64encode(to_bytes(auth, encoding=encoding))
