
Commit fe14566

add more type hints
1 parent 8cb0d1e commit fe14566

3 files changed: +45 −39 lines changed

w3lib/html.py

Lines changed: 5 additions & 3 deletions

@@ -4,6 +4,7 @@

 import re
 from html.entities import name2codepoint
+from typing import Match, Sequence, AnyStr
 from urllib.parse import urljoin

 from w3lib.util import to_unicode
@@ -18,7 +19,7 @@
 HTML5_WHITESPACE = ' \t\n\r\x0c'


-def replace_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
+def replace_entities(text: AnyStr, keep: Sequence[str] = (), remove_illegal: bool = True, encoding: str ='utf-8'):
     """Remove entities from the given `text` by converting them to their
     corresponding unicode character.

@@ -46,8 +47,9 @@ def replace_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):

     """

-    def convert_entity(m):
+    def convert_entity(m: Match):
         groups = m.groupdict()
+        number = None
         if groups.get('dec'):
             number = int(groups['dec'], 10)
         elif groups.get('hex'):
@@ -78,7 +80,7 @@ def convert_entity(m):

     return _ent_re.sub(convert_entity, to_unicode(text, encoding))

-def has_entities(text, encoding=None):
+def has_entities(text: AnyStr, encoding=None):
     return bool(_ent_re.search(to_unicode(text, encoding)))

 def replace_tags(text, token='', encoding=None):
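
Not part of the commit itself, just a doctest-style sketch of how the two annotated functions above are called; both accept str or bytes input (hence the AnyStr hint), and the expected outputs assume w3lib's documented behaviour:

>>> from w3lib.html import replace_entities, has_entities
>>> replace_entities('Price: &pound;100 &amp; up')      # entities decoded to unicode characters
'Price: £100 & up'
>>> replace_entities('&lt;b&gt;', keep=['lt', 'gt'])    # keep= leaves the listed entities untouched
'&lt;b&gt;'
>>> has_entities(b'no entities here')
False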

w3lib/url.py

Lines changed: 33 additions & 24 deletions

@@ -9,8 +9,8 @@
 import re
 import string
 from collections import namedtuple
+from typing import Callable, Optional, Sequence, Tuple, Union, cast, Dict
 from urllib.parse import (
-    _coerce_args,
     parse_qs,
     parse_qsl,
     ParseResult,
@@ -23,13 +23,16 @@
     urlunparse,
     urlunsplit,
 )
+from urllib.parse import _coerce_args  # type: ignore
 from urllib.request import pathname2url, url2pathname
-from w3lib.util import to_unicode
+from w3lib.util import to_bytes, to_native_str, to_unicode
+from w3lib._types import AnyUnicodeError, StrOrBytes


 # error handling function for bytes-to-Unicode decoding errors with URLs
-def _quote_byte(error):
-    return (quote(error.object[error.start:error.end]), error.end)
+def _quote_byte(error: UnicodeError) -> Tuple[str, int]:
+    error = cast(AnyUnicodeError, error)
+    return (to_unicode(quote(error.object[error.start:error.end])), error.end)

 codecs.register_error('percentencode', _quote_byte)

@@ -45,7 +48,7 @@ def _quote_byte(error):

 _ascii_tab_newline_re = re.compile(r'[\t\n\r]') # see https://infra.spec.whatwg.org/#ascii-tab-or-newline

-def safe_url_string(url, encoding='utf8', path_encoding='utf8', quote_path=True):
+def safe_url_string(url: StrOrBytes, encoding: str ='utf8', path_encoding: str ='utf8', quote_path: bool = True) -> str:
     """Convert the given URL into a legal URL by escaping unsafe characters
     according to RFC-3986. Also, ASCII tabs and newlines are removed
     as per https://url.spec.whatwg.org/#url-parsing.
@@ -79,7 +82,7 @@ def safe_url_string(url, encoding='utf8', path_encoding='utf8', quote_path=True)
     try:
         netloc = parts.netloc.encode('idna').decode()
     except UnicodeError:
-        netloc = parts.netloc
+        netloc = parts.netloc.encode('utf-8')

     # default encoding for path component SHOULD be UTF-8
     if quote_path:
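
As a hedged illustration (not from the diff): safe_url_string, whose signature now takes StrOrBytes and returns str, percent-escapes unsafe characters; outputs follow w3lib's documented examples:

>>> from w3lib.url import safe_url_string
>>> safe_url_string('http://www.example.com/£')
'http://www.example.com/%C2%A3'
>>> safe_url_string(b'http://www.example.com/\xc2\xa3')   # bytes input is first decoded using `encoding`
'http://www.example.com/%C2%A3'
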
@@ -98,7 +101,7 @@ def safe_url_string(url, encoding='utf8', path_encoding='utf8', quote_path=True)

 _parent_dirs = re.compile(r'/?(\.\./)+')

-def safe_download_url(url, encoding='utf8', path_encoding='utf8'):
+def safe_download_url(url: StrOrBytes, encoding: str ='utf8', path_encoding: str ='utf8') -> str:
     """ Make a url for download. This will call safe_url_string
     and then strip the fragment, if one exists. The path will
     be normalised.
@@ -117,11 +120,11 @@ def safe_download_url(url, encoding='utf8', path_encoding='utf8'):
     return urlunsplit((scheme, netloc, path, query, ''))


-def is_url(text):
+def is_url(text: str) -> bool:
     return text.partition("://")[0] in ('file', 'http', 'https')


-def url_query_parameter(url, parameter, default=None, keep_blank_values=0):
+def url_query_parameter(url: StrOrBytes, parameter: str, default: Optional[str] = None, keep_blank_values: Union[bool, int]=0) -> Optional[str]:
     """Return the value of a url parameter, given the url and parameter name

     General case:
@@ -152,12 +155,15 @@ def url_query_parameter(url, parameter, default=None, keep_blank_values=0):

     queryparams = parse_qs(
         urlsplit(str(url))[3],
-        keep_blank_values=keep_blank_values
+        keep_blank_values=bool(keep_blank_values)
     )
-    return queryparams.get(parameter, [default])[0]
+    if parameter in queryparams:
+        return queryparams[parameter][0]
+    else:
+        return default


-def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, unique=True, keep_fragments=False):
+def url_query_cleaner(url: StrOrBytes, parameterlist: Union[StrOrBytes, Sequence[StrOrBytes]] = (), sep: str = '&', kvsep: str = '=', remove: bool = False, unique: bool = True, keep_fragments: bool = False) -> str:
     """Clean URL arguments leaving only those passed in the parameterlist keeping order

     >>> import w3lib.url
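
For context (not part of the diff), the rewritten lookup above is behaviour-preserving; a doctest-style sketch based on the documented examples:

>>> from w3lib.url import url_query_parameter, url_query_cleaner
>>> url_query_parameter("product.html?id=200&foo=bar", "id")
'200'
>>> url_query_parameter("product.html?id=200&foo=bar", "notthere", "mydefault")
'mydefault'
>>> url_query_parameter("product.html?id=", "id", keep_blank_values=1)   # blank values kept only on request
''
>>> url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id', 'name'])
'product.html?id=200&name=wired'
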
@@ -192,6 +198,8 @@ def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, u
     if isinstance(parameterlist, (str, bytes)):
         parameterlist = [parameterlist]
     url, fragment = urldefrag(url)
+    url = cast(str, url)
+    fragment = cast(str, fragment)
     base, _, query = url.partition('?')
     seen = set()
     querylist = []
@@ -211,9 +219,9 @@ def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, u
     url = '?'.join([base, sep.join(querylist)]) if querylist else base
     if keep_fragments:
         url += '#' + fragment
-    return url
+    return cast(str, url)

-def _add_or_replace_parameters(url, params):
+def _add_or_replace_parameters(url: str, params: Dict[str, str]) -> str:
     parsed = urlsplit(url)
     current_args = parse_qsl(parsed.query, keep_blank_values=True)

@@ -233,7 +241,7 @@ def _add_or_replace_parameters(url, params):
     return urlunsplit(parsed._replace(query=query))


-def add_or_replace_parameter(url, name, new_value):
+def add_or_replace_parameter(url: str, name: str, new_value: str) -> str:
     """Add or remove a parameter to a given url

     >>> import w3lib.url
@@ -249,7 +257,7 @@ def add_or_replace_parameter(url, name, new_value):
     return _add_or_replace_parameters(url, {name: new_value})


-def add_or_replace_parameters(url, new_parameters):
+def add_or_replace_parameters(url: str, new_parameters: Dict[str, str]) -> str:
     """Add or remove a parameters to a given url

     >>> import w3lib.url
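
A short usage sketch (not from the commit) for the two newly annotated helpers; outputs follow the documented examples:

>>> from w3lib.url import add_or_replace_parameter, add_or_replace_parameters
>>> add_or_replace_parameter('http://www.example.com/index.php?id=1', 'arg', 'v')
'http://www.example.com/index.php?id=1&arg=v'
>>> add_or_replace_parameters('http://www.example.com/index.php?id=1', {'id': '3', 'arg': 'v'})
'http://www.example.com/index.php?id=3&arg=v'
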
@@ -264,7 +272,7 @@ def add_or_replace_parameters(url, new_parameters):
     return _add_or_replace_parameters(url, new_parameters)


-def path_to_file_uri(path):
+def path_to_file_uri(path: str) -> str:
     """Convert local filesystem path to legal File URIs as described in:
     http://en.wikipedia.org/wiki/File_URI_scheme
     """
@@ -274,15 +282,15 @@ def path_to_file_uri(path):
     return 'file:///%s' % x.lstrip('/')


-def file_uri_to_path(uri):
+def file_uri_to_path(uri: str) -> str:
     """Convert File URI to local filesystem path according to:
     http://en.wikipedia.org/wiki/File_URI_scheme
     """
     uri_path = urlparse(uri).path
     return url2pathname(uri_path)


-def any_to_uri(uri_or_path):
+def any_to_uri(uri_or_path: str) -> str:
     """If given a path name, return its File URI, otherwise return it
     unmodified
     """
@@ -322,11 +330,11 @@ def any_to_uri(uri_or_path):
     ).encode()
 )

-_ParseDataURIResult = namedtuple("ParseDataURIResult",
+_ParseDataURIResult = namedtuple("_ParseDataURIResult",
                                  "media_type media_type_parameters data")


-def parse_data_uri(uri):
+def parse_data_uri(uri: StrOrBytes) -> _ParseDataURIResult:
     """

     Parse a data: URI, returning a 3-tuple of media type, dictionary of media
@@ -368,7 +376,7 @@ def parse_data_uri(uri):
         if m:
             attribute, value, value_quoted = m.groups()
             if value_quoted:
-                value = re.sub(br'\\(.)', r'\1', value_quoted)
+                value = re.sub(br'\\(.)', rb'\1', value_quoted)
             media_type_params[attribute.decode()] = value.decode()
             uri = uri[m.end():]
         else:
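
And a hedged sketch of parse_data_uri, which now declares StrOrBytes input and the _ParseDataURIResult named tuple as its return type (field access shown instead of the repr, since the commit renames the tuple's typename):

>>> from w3lib.url import parse_data_uri
>>> result = parse_data_uri("data:,A%20brief%20note")
>>> result.media_type
'text/plain'
>>> result.media_type_parameters
{'charset': 'US-ASCII'}
>>> result.data
b'A brief note'
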
@@ -458,7 +466,7 @@ def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
     try:
         scheme, netloc, path, params, query, fragment = _safe_ParseResult(
             parse_url(url), encoding=encoding or 'utf8')
-    except UnicodeEncodeError as e:
+    except UnicodeEncodeError:
         scheme, netloc, path, params, query, fragment = _safe_ParseResult(
             parse_url(url), encoding='utf8')

@@ -551,7 +559,8 @@ def parse_qsl_to_bytes(qs, keep_blank_values=False):
     # (at https://hg.python.org/cpython/rev/c38ac7ab8d9a)
     # except for the unquote(s, encoding, errors) calls replaced
     # with unquote_to_bytes(s)
-    qs, _coerce_result = _coerce_args(qs)
+    coerce_args = cast(Callable[..., Tuple[str, Callable]], _coerce_args)
+    qs, _coerce_result = coerce_args(qs)
     pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
     r = []
     for name_value in pairs:
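
For reference (not part of the diff), canonicalize_url, whose UnicodeEncodeError fallback is touched above, behaves like this in the documented examples (duplicate query arguments are kept, sorted by key):

>>> from w3lib.url import canonicalize_url
>>> canonicalize_url('http://www.example.com/do?a=1&b=2&c=3&b=5')
'http://www.example.com/do?a=1&b=2&b=5&c=3'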

w3lib/util.py

Lines changed: 7 additions & 12 deletions

@@ -1,20 +1,15 @@
 from warnings import warn
+from typing import Optional
+from w3lib._types import StrOrBytes

-
-def str_to_unicode(text, encoding=None, errors='strict'):
-    warn(
-        "The w3lib.utils.str_to_unicode function is deprecated and "
-        "will be removed in a future release.",
-        DeprecationWarning,
-        stacklevel=2,
-    )
+def str_to_unicode(text: StrOrBytes, encoding: Optional[str] = None, errors: str ='strict') -> str:
     if encoding is None:
         encoding = 'utf-8'
     if isinstance(text, bytes):
         return text.decode(encoding, errors)
     return text

-def unicode_to_str(text, encoding=None, errors='strict'):
+def unicode_to_str(text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict') -> bytes:
     warn(
         "The w3lib.utils.unicode_to_str function is deprecated and "
         "will be removed in a future release.",
@@ -27,7 +22,7 @@ def unicode_to_str(text, encoding=None, errors='strict'):
         return text.encode(encoding, errors)
     return text

-def to_unicode(text, encoding=None, errors='strict'):
+def to_unicode(text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict') -> str:
     """Return the unicode representation of a bytes object `text`. If `text`
     is already an unicode object, return it as-is."""
     if isinstance(text, str):
@@ -40,7 +35,7 @@ def to_unicode(text, encoding=None, errors='strict'):
         encoding = 'utf-8'
     return text.decode(encoding, errors)

-def to_bytes(text, encoding=None, errors='strict'):
+def to_bytes(text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict') -> bytes:
     """Return the binary representation of `text`. If `text`
     is already a bytes object, return it as-is."""
     if isinstance(text, bytes):
@@ -53,7 +48,7 @@ def to_bytes(text, encoding=None, errors='strict'):
         encoding = 'utf-8'
     return text.encode(encoding, errors)

-def to_native_str(text, encoding=None, errors='strict'):
+def to_native_str(text: StrOrBytes, encoding: Optional[str] = None, errors: str = 'strict') -> str:
     """ Return str representation of `text` """
     warn(
         "The w3lib.utils.to_native_str function is deprecated and "
