88import re
99import posixpath
1010import warnings
11- import six
11+ import string
1212from collections import namedtuple
13+ import six
1314from six .moves .urllib .parse import (urljoin , urlsplit , urlunsplit ,
1415 urldefrag , urlencode , urlparse ,
1516 quote , parse_qs , parse_qsl ,
@@ -24,51 +25,14 @@ def _quote_byte(error):
2425
2526codecs .register_error ('percentencode' , _quote_byte )
2627
# Character classes taken from RFC 3986: Section 2.2 (reserved characters)
# and Section 2.3 (unreserved characters).
RFC3986_GEN_DELIMS = b':/?#[]@'
RFC3986_SUB_DELIMS = b"!$&'()*+,;="
RFC3986_RESERVED = b''.join((RFC3986_GEN_DELIMS, RFC3986_SUB_DELIMS))
RFC3986_UNRESERVED = ''.join(
    (string.ascii_letters, string.digits, "-._~")
).encode('ascii')
# '|' is treated as safe in addition to the RFC sets;
# see https://github.com/scrapy/w3lib/pull/25
EXTRA_SAFE_CHARS = b'|'
2734
28- # Python 2.x urllib.always_safe become private in Python 3.x;
29- # its content is copied here
30- _ALWAYS_SAFE_BYTES = (b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
31- b'abcdefghijklmnopqrstuvwxyz'
32- b'0123456789' b'_.-' )
33-
34-
35- def urljoin_rfc (base , ref , encoding = 'utf-8' ):
36- r"""
37- .. warning::
38-
39- This function is deprecated and will be removed in future.
40- It is not supported with Python 3.
41- Please use ``urlparse.urljoin`` instead.
42-
43- Same as urlparse.urljoin but supports unicode values in base and ref
44- parameters (in which case they will be converted to str using the given
45- encoding).
46-
47- Always returns a str.
48-
49- >>> import w3lib.url
50- >>> w3lib.url.urljoin_rfc('http://www.example.com/path/index.html', u'/otherpath/index2.html')
51- 'http://www.example.com/otherpath/index2.html'
52- >>>
53-
54- >>> # Note: the following does not work in Python 3
55- >>> w3lib.url.urljoin_rfc(b'http://www.example.com/path/index.html', u'fran\u00e7ais/d\u00e9part.htm') # doctest: +SKIP
56- 'http://www.example.com/path/fran\xc3\xa7ais/d\xc3\xa9part.htm'
57- >>>
58-
59-
60- """
61-
62- warnings .warn ("w3lib.url.urljoin_rfc is deprecated, use urlparse.urljoin instead" ,
63- DeprecationWarning )
64-
65- str_base = to_bytes (base , encoding )
66- str_ref = to_bytes (ref , encoding )
67- return urljoin (str_base , str_ref )
68-
69- _reserved = b';/?:@&=+$|,#' # RFC 3986 (Generic Syntax)
70- _unreserved_marks = b"-_.!~*'()" # RFC 3986 sec 2.3
71- _safe_chars = _ALWAYS_SAFE_BYTES + b'%' + _reserved + _unreserved_marks
35+ _safe_chars = RFC3986_RESERVED + RFC3986_UNRESERVED + EXTRA_SAFE_CHARS + b'%'
7236
7337def safe_url_string (url , encoding = 'utf8' , path_encoding = 'utf8' ):
7438 """Convert the given URL into a legal URL by escaping unsafe characters
@@ -117,6 +81,7 @@ def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
11781 quote (to_bytes (parts .fragment , encoding ), _safe_chars ),
11882 ))
11983
84+
12085_parent_dirs = re .compile (r'/?(\.\./)+' )
12186
12287def safe_download_url (url ):
@@ -137,9 +102,11 @@ def safe_download_url(url):
137102 path = '/'
138103 return urlunsplit ((scheme , netloc , path , query , '' ))
139104
105+
def is_url(text):
    """Return True if *text* begins with a ``file``, ``http`` or ``https``
    scheme followed by ``://``.

    The scheme comparison is case-sensitive. The ``://`` separator must
    actually be present: ``str.partition`` returns the whole input as its
    first element when the separator is missing, so without this check a
    bare word such as ``"http"`` would be reported as a URL.
    """
    scheme, separator, _ = text.partition("://")
    return bool(separator) and scheme in ('file', 'http', 'https')
142108
109+
143110def url_query_parameter (url , parameter , default = None , keep_blank_values = 0 ):
144111 """Return the value of a url parameter, given the url and parameter name
145112
@@ -175,6 +142,7 @@ def url_query_parameter(url, parameter, default=None, keep_blank_values=0):
175142 )
176143 return queryparams .get (parameter , [default ])[0 ]
177144
145+
178146def url_query_cleaner (url , parameterlist = (), sep = '&' , kvsep = '=' , remove = False , unique = True , keep_fragments = False ):
179147 """Clean URL arguments leaving only those passed in the parameterlist keeping order
180148
@@ -229,6 +197,7 @@ def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, u
229197 url += '#' + fragment
230198 return url
231199
200+
232201def add_or_replace_parameter (url , name , new_value ):
233202 """Add or replace a parameter in a given url
234203
@@ -270,13 +239,15 @@ def path_to_file_uri(path):
270239 x = x .replace ('|' , ':' ) # http://bugs.python.org/issue5861
271240 return 'file:///%s' % x .lstrip ('/' )
272241
242+
def file_uri_to_path(uri):
    """Translate a File URI into a local filesystem path.

    Only the path component of the parsed URI is used; it is handed to
    ``url2pathname`` for the conversion. Background on the scheme:
    http://en.wikipedia.org/wiki/File_URI_scheme
    """
    return url2pathname(urlparse(uri).path)
279249
250+
280251def any_to_uri (uri_or_path ):
281252 """If given a path name, return its File URI, otherwise return it
282253 unmodified
@@ -584,3 +555,38 @@ def parse_qsl_to_bytes(qs, keep_blank_values=False):
584555 value = _coerce_result (value )
585556 r .append ((name , value ))
586557 return r
558+
559+
def urljoin_rfc(base, ref, encoding='utf-8'):
    r"""Join *base* and *ref* like ``urlparse.urljoin``, accepting unicode input.

    .. warning::

        This function is deprecated and will be removed in a future release.
        It is not supported with Python 3.
        Please use ``urlparse.urljoin`` instead.

    Same as urlparse.urljoin but supports unicode values in base and ref
    parameters (in which case they will be converted to str using the given
    encoding).

    Both arguments are passed through ``to_bytes`` with *encoding* and the
    results are handed to ``urljoin``.

    Always returns a str.

    >>> import w3lib.url
    >>> w3lib.url.urljoin_rfc('http://www.example.com/path/index.html', u'/otherpath/index2.html')
    'http://www.example.com/otherpath/index2.html'
    >>>

    >>> # Note: the following does not work in Python 3
    >>> w3lib.url.urljoin_rfc(b'http://www.example.com/path/index.html', u'fran\u00e7ais/d\u00e9part.htm') # doctest: +SKIP
    'http://www.example.com/path/fran\xc3\xa7ais/d\xc3\xa9part.htm'
    >>>

    """
    # stacklevel=2 attributes the warning to the caller's line rather than
    # to this module, so users can locate the deprecated call site.
    warnings.warn("w3lib.url.urljoin_rfc is deprecated, use urlparse.urljoin instead",
                  DeprecationWarning, stacklevel=2)

    str_base = to_bytes(base, encoding)
    str_ref = to_bytes(ref, encoding)
    return urljoin(str_base, str_ref)
0 commit comments