99import re
1010import string
1111from collections import namedtuple
12+ from typing import Callable , Optional , Sequence , Tuple , Union , cast , Dict
1213from urllib .parse import (
13- _coerce_args ,
1414 parse_qs ,
1515 parse_qsl ,
1616 ParseResult ,
2323 urlunparse ,
2424 urlunsplit ,
2525)
26+ from urllib .parse import _coerce_args # type: ignore
2627from urllib .request import pathname2url , url2pathname
27- from w3lib .util import to_unicode
28+ from w3lib .util import to_bytes , to_native_str , to_unicode
29+ from w3lib ._types import AnyUnicodeError , StrOrBytes
2830
2931
3032# error handling function for bytes-to-Unicode decoding errors with URLs
31- def _quote_byte (error ):
32- return (quote (error .object [error .start :error .end ]), error .end )
33+ def _quote_byte (error : UnicodeError ) -> Tuple [str , int ]:
34+ error = cast (AnyUnicodeError , error )
35+ return (to_unicode (quote (error .object [error .start :error .end ])), error .end )
3336
3437codecs .register_error ('percentencode' , _quote_byte )
3538
@@ -45,7 +48,7 @@ def _quote_byte(error):
4548
4649_ascii_tab_newline_re = re .compile (r'[\t\n\r]' ) # see https://infra.spec.whatwg.org/#ascii-tab-or-newline
4750
48- def safe_url_string (url , encoding = 'utf8' , path_encoding = 'utf8' , quote_path = True ):
51+ def safe_url_string (url : StrOrBytes , encoding : str = 'utf8' , path_encoding : str = 'utf8' , quote_path : bool = True ) -> str :
4952 """Convert the given URL into a legal URL by escaping unsafe characters
5053 according to RFC-3986. Also, ASCII tabs and newlines are removed
5154 as per https://url.spec.whatwg.org/#url-parsing.
@@ -79,7 +82,7 @@ def safe_url_string(url, encoding='utf8', path_encoding='utf8', quote_path=True)
7982 try :
8083 netloc = parts .netloc .encode ('idna' ).decode ()
8184 except UnicodeError :
82- netloc = parts .netloc
85+ netloc = parts .netloc . encode ( 'utf-8' )
8386
8487 # default encoding for path component SHOULD be UTF-8
8588 if quote_path :
@@ -98,7 +101,7 @@ def safe_url_string(url, encoding='utf8', path_encoding='utf8', quote_path=True)
98101
99102_parent_dirs = re .compile (r'/?(\.\./)+' )
100103
101- def safe_download_url (url , encoding = 'utf8' , path_encoding = 'utf8' ):
104+ def safe_download_url (url : StrOrBytes , encoding : str = 'utf8' , path_encoding : str = 'utf8' ) -> str :
102105 """ Make a url for download. This will call safe_url_string
103106 and then strip the fragment, if one exists. The path will
104107 be normalised.
@@ -117,11 +120,11 @@ def safe_download_url(url, encoding='utf8', path_encoding='utf8'):
117120 return urlunsplit ((scheme , netloc , path , query , '' ))
118121
119122
120- def is_url (text ) :
123+ def is_url (text : str ) -> bool :
121124 return text .partition ("://" )[0 ] in ('file' , 'http' , 'https' )
122125
123126
124- def url_query_parameter (url , parameter , default = None , keep_blank_values = 0 ):
127+ def url_query_parameter (url : StrOrBytes , parameter : str , default : Optional [ str ] = None , keep_blank_values : Union [ bool , int ] = 0 ) -> Optional [ str ] :
125128 """Return the value of a url parameter, given the url and parameter name
126129
127130 General case:
@@ -152,12 +155,15 @@ def url_query_parameter(url, parameter, default=None, keep_blank_values=0):
152155
153156 queryparams = parse_qs (
154157 urlsplit (str (url ))[3 ],
155- keep_blank_values = keep_blank_values
158+ keep_blank_values = bool ( keep_blank_values )
156159 )
157- return queryparams .get (parameter , [default ])[0 ]
160+ if parameter in queryparams :
161+ return queryparams [parameter ][0 ]
162+ else :
163+ return default
158164
159165
160- def url_query_cleaner (url , parameterlist = (), sep = '&' , kvsep = '=' , remove = False , unique = True , keep_fragments = False ):
166+ def url_query_cleaner (url : StrOrBytes , parameterlist : Union [ StrOrBytes , Sequence [ StrOrBytes ]] = (), sep : str = '&' , kvsep : str = '=' , remove : bool = False , unique : bool = True , keep_fragments : bool = False ) -> str :
161167 """Clean URL arguments leaving only those passed in the parameterlist keeping order
162168
163169 >>> import w3lib.url
@@ -192,6 +198,8 @@ def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, u
192198 if isinstance (parameterlist , (str , bytes )):
193199 parameterlist = [parameterlist ]
194200 url , fragment = urldefrag (url )
201+ url = cast (str , url )
202+ fragment = cast (str , fragment )
195203 base , _ , query = url .partition ('?' )
196204 seen = set ()
197205 querylist = []
@@ -211,9 +219,9 @@ def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, u
211219 url = '?' .join ([base , sep .join (querylist )]) if querylist else base
212220 if keep_fragments :
213221 url += '#' + fragment
214- return url
222+ return cast ( str , url )
215223
216- def _add_or_replace_parameters (url , params ) :
224+ def _add_or_replace_parameters (url : str , params : Dict [ str , str ]) -> str :
217225 parsed = urlsplit (url )
218226 current_args = parse_qsl (parsed .query , keep_blank_values = True )
219227
@@ -233,7 +241,7 @@ def _add_or_replace_parameters(url, params):
233241 return urlunsplit (parsed ._replace (query = query ))
234242
235243
236- def add_or_replace_parameter (url , name , new_value ) :
244+ def add_or_replace_parameter (url : str , name : str , new_value : str ) -> str :
237245 """Add or remove a parameter to a given url
238246
239247 >>> import w3lib.url
@@ -249,7 +257,7 @@ def add_or_replace_parameter(url, name, new_value):
249257 return _add_or_replace_parameters (url , {name : new_value })
250258
251259
252- def add_or_replace_parameters (url , new_parameters ) :
260+ def add_or_replace_parameters (url : str , new_parameters : Dict [ str , str ]) -> str :
253261 """Add or remove a parameters to a given url
254262
255263 >>> import w3lib.url
@@ -264,7 +272,7 @@ def add_or_replace_parameters(url, new_parameters):
264272 return _add_or_replace_parameters (url , new_parameters )
265273
266274
267- def path_to_file_uri (path ) :
275+ def path_to_file_uri (path : str ) -> str :
268276 """Convert local filesystem path to legal File URIs as described in:
269277 http://en.wikipedia.org/wiki/File_URI_scheme
270278 """
@@ -274,15 +282,15 @@ def path_to_file_uri(path):
274282 return 'file:///%s' % x .lstrip ('/' )
275283
276284
277- def file_uri_to_path (uri ) :
285+ def file_uri_to_path (uri : str ) -> str :
278286 """Convert File URI to local filesystem path according to:
279287 http://en.wikipedia.org/wiki/File_URI_scheme
280288 """
281289 uri_path = urlparse (uri ).path
282290 return url2pathname (uri_path )
283291
284292
285- def any_to_uri (uri_or_path ) :
293+ def any_to_uri (uri_or_path : str ) -> str :
286294 """If given a path name, return its File URI, otherwise return it
287295 unmodified
288296 """
@@ -322,11 +330,11 @@ def any_to_uri(uri_or_path):
322330 ).encode ()
323331)
324332
325- _ParseDataURIResult = namedtuple ("ParseDataURIResult " ,
333+ _ParseDataURIResult = namedtuple ("_ParseDataURIResult " ,
326334 "media_type media_type_parameters data" )
327335
328336
329- def parse_data_uri (uri ) :
337+ def parse_data_uri (uri : StrOrBytes ) -> _ParseDataURIResult :
330338 """
331339
332340 Parse a data: URI, returning a 3-tuple of media type, dictionary of media
@@ -368,7 +376,7 @@ def parse_data_uri(uri):
368376 if m :
369377 attribute , value , value_quoted = m .groups ()
370378 if value_quoted :
371- value = re .sub (br'\\(.)' , r '\1' , value_quoted )
379+ value = re .sub (br'\\(.)' , rb '\1' , value_quoted )
372380 media_type_params [attribute .decode ()] = value .decode ()
373381 uri = uri [m .end ():]
374382 else :
@@ -458,7 +466,7 @@ def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
458466 try :
459467 scheme , netloc , path , params , query , fragment = _safe_ParseResult (
460468 parse_url (url ), encoding = encoding or 'utf8' )
461- except UnicodeEncodeError as e :
469+ except UnicodeEncodeError :
462470 scheme , netloc , path , params , query , fragment = _safe_ParseResult (
463471 parse_url (url ), encoding = 'utf8' )
464472
@@ -551,7 +559,8 @@ def parse_qsl_to_bytes(qs, keep_blank_values=False):
551559 # (at https://hg.python.org/cpython/rev/c38ac7ab8d9a)
552560 # except for the unquote(s, encoding, errors) calls replaced
553561 # with unquote_to_bytes(s)
554- qs , _coerce_result = _coerce_args (qs )
562+ coerce_args = cast (Callable [..., Tuple [str , Callable ]], _coerce_args )
563+ qs , _coerce_result = coerce_args (qs )
555564 pairs = [s2 for s1 in qs .split ('&' ) for s2 in s1 .split (';' )]
556565 r = []
557566 for name_value in pairs :
0 commit comments