
Commit fe14566

add more type hints
1 parent 8cb0d1e commit fe14566

3 files changed: +45 −39 lines changed

w3lib/html.py

Lines changed: 5 additions & 3 deletions

@@ -4,6 +4,7 @@

 import re
 from html.entities import name2codepoint
+from typing import Match, Sequence, AnyStr
 from urllib.parse import urljoin

 from w3lib.util import to_unicode
@@ -18,7 +19,7 @@
 HTML5_WHITESPACE = ' \t\n\r\x0c'


-def replace_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
+def replace_entities(text: AnyStr, keep: Sequence[str] = (), remove_illegal: bool = True, encoding: str ='utf-8'):
     """Remove entities from the given `text` by converting them to their
     corresponding unicode character.

@@ -46,8 +47,9 @@ def replace_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):

     """

-    def convert_entity(m):
+    def convert_entity(m: Match):
         groups = m.groupdict()
+        number = None
         if groups.get('dec'):
             number = int(groups['dec'], 10)
         elif groups.get('hex'):
@@ -78,7 +80,7 @@ def convert_entity(m):

     return _ent_re.sub(convert_entity, to_unicode(text, encoding))

-def has_entities(text, encoding=None):
+def has_entities(text: AnyStr, encoding=None):
     return bool(_ent_re.search(to_unicode(text, encoding)))

 def replace_tags(text, token='', encoding=None):
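
Not part of the commit itself, just a doctest-style sketch of how the two annotated functions above are called; both accept str or bytes input (hence the AnyStr hint), and the expected outputs assume w3lib's documented behaviour:

>>> from w3lib.html import replace_entities, has_entities
>>> replace_entities('Price: &pound;100 &amp; up')      # entities decoded to unicode characters
'Price: £100 & up'
>>> replace_entities('&lt;b&gt;', keep=['lt', 'gt'])    # keep= leaves the listed entities untouched
'&lt;b&gt;'
>>> has_entities(b'no entities here')
False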

w3lib/url.py

Lines changed: 33 additions & 24 deletions

@@ -9,8 +9,8 @@
 import re
 import string
 from collections import namedtuple
+from typing import Callable, Optional, Sequence, Tuple, Union, cast, Dict
 from urllib.parse import (
-    _coerce_args,
     parse_qs,
     parse_qsl,
     ParseResult,
@@ -23,13 +23,16 @@
     urlunparse,
     urlunsplit,
 )
+from urllib.parse import _coerce_args  # type: ignore
 from urllib.request import pathname2url, url2pathname
-from w3lib.util import to_unicode
+from w3lib.util import to_bytes, to_native_str, to_unicode
+from w3lib._types import AnyUnicodeError, StrOrBytes


 # error handling function for bytes-to-Unicode decoding errors with URLs
-def _quote_byte(error):
-    return (quote(error.object[error.start:error.end]), error.end)
+def _quote_byte(error: UnicodeError) -> Tuple[str, int]:
+    error = cast(AnyUnicodeError, error)
+    return (to_unicode(quote(error.object[error.start:error.end])), error.end)

 codecs.register_error('percentencode', _quote_byte)

@@ -45,7 +48,7 @@ def _quote_byte(error):

 _ascii_tab_newline_re = re.compile(r'[\t\n\r]') # see https://infra.spec.whatwg.org/#ascii-tab-or-newline

-def safe_url_string(url, encoding='utf8', path_encoding='utf8', quote_path=True):
+def safe_url_string(url: StrOrBytes, encoding: str ='utf8', path_encoding: str ='utf8', quote_path: bool = True) -> str:
     """Convert the given URL into a legal URL by escaping unsafe characters
     according to RFC-3986. Also, ASCII tabs and newlines are removed
     as per https://url.spec.whatwg.org/#url-parsing.
@@ -79,7 +82,7 @@ def safe_url_string(url, encoding='utf8', path_encoding='utf8', quote_path=True)
     try:
         netloc = parts.netloc.encode('idna').decode()
     except UnicodeError:
-        netloc = parts.netloc
+        netloc = parts.netloc.encode('utf-8')

     # default encoding for path component SHOULD be UTF-8
     if quote_path:
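
As a hedged illustration (not from the diff): safe_url_string, whose signature now takes StrOrBytes and returns str, percent-escapes unsafe characters; outputs follow w3lib's documented examples:

>>> from w3lib.url import safe_url_string
>>> safe_url_string('http://www.example.com/£')
'http://www.example.com/%C2%A3'
>>> safe_url_string(b'http://www.example.com/\xc2\xa3')   # bytes input is first decoded using `encoding`
'http://www.example.com/%C2%A3'
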
@@ -98,7 +101,7 @@ def safe_url_string(url, encoding='utf8', path_encoding='utf8', quote_path=True)

 _parent_dirs = re.compile(r'/?(\.\./)+')

-def safe_download_url(url, encoding='utf8', path_encoding='utf8'):
+def safe_download_url(url: StrOrBytes, encoding: str ='utf8', path_encoding: str ='utf8') -> str:
     """ Make a url for download. This will call safe_url_string
     and then strip the fragment, if one exists. The path will
     be normalised.
@@ -117,11 +120,11 @@ def safe_download_url(url, encoding='utf8', path_encoding='utf8'):
     return urlunsplit((scheme, netloc, path, query, ''))


-def is_url(text):
+def is_url(text: str) -> bool:
     return text.partition("://")[0] in ('file', 'http', 'https')


-def url_query_parameter(url, parameter, default=None, keep_blank_values=0):
+def url_query_parameter(url: StrOrBytes, parameter: str, default: Optional[str] = None, keep_blank_values: Union[bool, int]=0) -> Optional[str]:
     """Return the value of a url parameter, given the url and parameter name

     General case:
@@ -152,12 +155,15 @@ def url_query_parameter(url, parameter, default=None, keep_blank_values=0):

     queryparams = parse_qs(
         urlsplit(str(url))[3],
-        keep_blank_values=keep_blank_values
+        keep_blank_values=bool(keep_blank_values)
     )
-    return queryparams.get(parameter, [default])[0]
+    if parameter in queryparams:
+        return queryparams[parameter][0]
+    else:
+        return default


-def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, unique=True, keep_fragments=False):
+def url_query_cleaner(url: StrOrBytes, parameterlist: Union[StrOrBytes, Sequence[StrOrBytes]] = (), sep: str = '&', kvsep: str = '=', remove: bool = False, unique: bool = True, keep_fragments: bool = False) -> str:
     """Clean URL arguments leaving only those passed in the parameterlist keeping order

     >>> import w3lib.url
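
For context (not part of the diff), the rewritten lookup above is behaviour-preserving; a doctest-style sketch based on the documented examples:

>>> from w3lib.url import url_query_parameter, url_query_cleaner
>>> url_query_parameter("product.html?id=200&foo=bar", "id")
'200'
>>> url_query_parameter("product.html?id=200&foo=bar", "notthere", "mydefault")
'mydefault'
>>> url_query_parameter("product.html?id=", "id", keep_blank_values=1)   # blank values kept only on request
''
>>> url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id', 'name'])
'product.html?id=200&name=wired'
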
@@ -192,6 +198,8 @@ def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, u
     if isinstance(parameterlist, (str, bytes)):
         parameterlist = [parameterlist]
     url, fragment = urldefrag(url)
+    url = cast(str, url)
+    fragment = cast(str, fragment)
     base, _, query = url.partition('?')
     seen = set()
     querylist = []
@@ -211,9 +219,9 @@ def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, u
     url = '?'.join([base, sep.join(querylist)]) if querylist else base
     if keep_fragments:
         url += '#' + fragment
-    return url
+    return cast(str, url)

-def _add_or_replace_parameters(url, params):
+def _add_or_replace_parameters(url: str, params: Dict[str, str]) -> str:
     parsed = urlsplit(url)
     current_args = parse_qsl(parsed.query, keep_blank_values=True)

@@ -233,7 +241,7 @@ def _add_or_replace_parameters(url, params):
     return urlunsplit(parsed._replace(query=query))


-def add_or_replace_parameter(url, name, new_value):
+def add_or_replace_parameter(url: str, name: str, new_value: str) -> str:
     """Add or remove a parameter to a given url

     >>> import w3lib.url
@@ -249,7 +257,7 @@ def add_or_replace_parameter(url, name, new_value):
     return _add_or_replace_parameters(url, {name: new_value})


-def add_or_replace_parameters(url, new_parameters):
+def add_or_replace_parameters(url: str, new_parameters: Dict[str, str]) -> str:
     """Add or remove a parameters to a given url

     >>> import w3lib.url
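
A short usage sketch (not from the commit) for the two newly annotated helpers; outputs follow the documented examples:

>>> from w3lib.url import add_or_replace_parameter, add_or_replace_parameters
>>> add_or_replace_parameter('http://www.example.com/index.php?id=1', 'arg', 'v')
'http://www.example.com/index.php?id=1&arg=v'
>>> add_or_replace_parameters('http://www.example.com/index.php?id=1', {'id': '3', 'arg': 'v'})
'http://www.example.com/index.php?id=3&arg=v'
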
@@ -264,7 +272,7 @@ def add_or_replace_parameters(url, new_parameters):
     return _add_or_replace_parameters(url, new_parameters)


-def path_to_file_uri(path):
+def path_to_file_uri(path: str) -> str:
     """Convert local filesystem path to legal File URIs as described in:
     http://en.wikipedia.org/wiki/File_URI_scheme
     """
@@ -274,15 +282,15 @@ def path_to_file_uri(path):
     return 'file:///%s' % x.lstrip('/')


-def file_uri_to_path(uri):
+def file_uri_to_path(uri: str) -> str:
     """Convert File URI to local filesystem path according to:
     http://en.wikipedia.org/wiki/File_URI_scheme
     """
     uri_path = urlparse(uri).path
     return url2pathname(uri_path)


-def any_to_uri(uri_or_path):
+def any_to_uri(uri_or_path: str) -> str:
     """If given a path name, return its File URI, otherwise return it
     unmodified
     """
@@ -322,11 +330,11 @@ def any_to_uri(uri_or_path):
     ).encode()
 )

-_ParseDataURIResult = namedtuple("ParseDataURIResult",
+_ParseDataURIResult = namedtuple("_ParseDataURIResult",
                                  "media_type media_type_parameters data")


-def parse_data_uri(uri):
+def parse_data_uri(uri: StrOrBytes) -> _ParseDataURIResult:
     """

     Parse a data: URI, returning a 3-tuple of media type, dictionary of media
@@ -368,7 +376,7 @@ def parse_data_uri(uri):
         if m:
             attribute, value, value_quoted = m.groups()
             if value_quoted:
-                value = re.sub(br'\\(.)', r'\1', value_quoted)
+                value = re.sub(br'\\(.)', rb'\1', value_quoted)
             media_type_params[attribute.decode()] = value.decode()
             uri = uri[m.end():]
         else:
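
And a hedged sketch of parse_data_uri, which now declares StrOrBytes input and the _ParseDataURIResult named tuple as its return type (field access shown instead of the repr, since the commit renames the tuple's typename):

>>> from w3lib.url import parse_data_uri
>>> result = parse_data_uri("data:,A%20brief%20note")
>>> result.media_type
'text/plain'
>>> result.media_type_parameters
{'charset': 'US-ASCII'}
>>> result.data
b'A brief note'
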
@@ -458,7 +466,7 @@ def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
     try:
         scheme, netloc, path, params, query, fragment = _safe_ParseResult(
             parse_url(url), encoding=encoding or 'utf8')
-    except UnicodeEncodeError as e:
+    except UnicodeEncodeError:
         scheme, netloc, path, params, query, fragment = _safe_ParseResult(
             parse_url(url), encoding='utf8')

@@ -551,7 +559,8 @@ def parse_qsl_to_bytes(qs, keep_blank_values=False):
     # (at https://hg.python.org/cpython/rev/c38ac7ab8d9a)
     # except for the unquote(s, encoding, errors) calls replaced
     # with unquote_to_bytes(s)
-    qs, _coerce_result = _coerce_args(qs)
+    coerce_args = cast(Callable[..., Tuple[str, Callable]], _coerce_args)
+    qs, _coerce_result = coerce_args(qs)
     pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
     r = []
     for name_value in pairs:
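
For reference (not part of the diff), canonicalize_url, whose UnicodeEncodeError fallback is touched above, behaves like this in the documented examples (duplicate query arguments are kept, sorted by key):

>>> from w3lib.url import canonicalize_url
>>> canonicalize_url('http://www.example.com/do?a=1&b=2&c=3&b=5')
'http://www.example.com/do?a=1&b=2&b=5&c=3'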

w3lib/util.py

Lines changed: 7 additions & 12 deletions

@@ -1,20 +1,15 @@
 from warnings import warn
+from typing import Optional
+from w3lib._types import StrOrBytes

-
-def str_to_unicode(text, encoding=None, errors='strict'):
-    warn(
-        "The w3lib.utils.str_to_unicode function is deprecated and "
-        "will be removed in a future release.",
-        DeprecationWarning,
-        stacklevel=2,
-    )
+def str_to_unicode(text: StrOrBytes, encoding: Optional[str] = None, errors: str ='strict') -> str:
     if encoding is None:
         encoding = 'utf-8'
     if isinstance(text, bytes):
         return text.decode(encoding, errors)
     return text

-def unicode_to_str(text, encoding=None, errors='strict'):
+def unicode_to_str(text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict') -> bytes:
     warn(
         "The w3lib.utils.unicode_to_str function is deprecated and "
         "will be removed in a future release.",
@@ -27,7 +22,7 @@ def unicode_to_str(text, encoding=None, errors='strict'):
         return text.encode(encoding, errors)
     return text

-def to_unicode(text, encoding=None, errors='strict'):
+def to_unicode(text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict') -> str:
     """Return the unicode representation of a bytes object `text`. If `text`
     is already an unicode object, return it as-is."""
     if isinstance(text, str):
@@ -40,7 +35,7 @@ def to_unicode(text, encoding=None, errors='strict'):
         encoding = 'utf-8'
     return text.decode(encoding, errors)

-def to_bytes(text, encoding=None, errors='strict'):
+def to_bytes(text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict') -> bytes:
     """Return the binary representation of `text`. If `text`
     is already a bytes object, return it as-is."""
     if isinstance(text, bytes):
@@ -53,7 +48,7 @@ def to_bytes(text, encoding=None, errors='strict'):
         encoding = 'utf-8'
     return text.encode(encoding, errors)

-def to_native_str(text, encoding=None, errors='strict'):
+def to_native_str(text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict') -> str:
     """ Return str representation of `text` """
     warn(
         "The w3lib.utils.to_native_str function is deprecated and "
