Merge pull request #25 from kmike/rfc3986

dangra · web-flow · commit 63f391b9bec8 · 2017-06-16T13:02:14.000-03:00
[MRG] use constants from RFC3986
diff --git a/tests/test_url.py b/tests/test_url.py
@@ -59,6 +59,10 @@ def test_safe_url_string(self):
 
         self.assertTrue(isinstance(safe_url_string(b'http://example.com/'), str))
 
+    def test_safe_url_string_unsafe_chars(self):
+        safeurl = safe_url_string(r"http://localhost:8001/unwise{,},|,\,^,[,],`?|=[]&[]=|")
+        self.assertEqual(safeurl, r"http://localhost:8001/unwise%7B,%7D,|,%5C,%5E,[,],%60?|=[]&[]=|")
+
     def test_safe_url_string_with_query(self):
         safeurl = safe_url_string(u"http://www.example.com/£?unit=µ")
         self.assertTrue(isinstance(safeurl, str))
diff --git a/tox.ini b/tox.ini
@@ -8,7 +8,7 @@ envlist = py27, pypy, py33, py34, py35, py36
 
 [testenv]
 deps =
-    pytest
+    pytest !=3.1.1, !=3.1.2
     pytest-cov
 commands =
     py.test \
diff --git a/w3lib/url.py b/w3lib/url.py
@@ -8,8 +8,9 @@
 import re
 import posixpath
 import warnings
-import six
+import string
 from collections import namedtuple
+import six
 from six.moves.urllib.parse import (urljoin, urlsplit, urlunsplit,
                                     urldefrag, urlencode, urlparse,
                                     quote, parse_qs, parse_qsl,
@@ -24,51 +25,14 @@ def _quote_byte(error):
 
 codecs.register_error('percentencode', _quote_byte)
 
+# constants from RFC 3986, Sections 2.2 and 2.3
+RFC3986_GEN_DELIMS = b':/?#[]@'
+RFC3986_SUB_DELIMS = b"!$&'()*+,;="
+RFC3986_RESERVED = RFC3986_GEN_DELIMS + RFC3986_SUB_DELIMS
+RFC3986_UNRESERVED = (string.ascii_letters + string.digits + "-._~").encode('ascii')
+EXTRA_SAFE_CHARS = b'|'  # see https://github.com/scrapy/w3lib/pull/25
 
-# Python 2.x urllib.always_safe become private in Python 3.x;
-# its content is copied here
-_ALWAYS_SAFE_BYTES = (b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
-                      b'abcdefghijklmnopqrstuvwxyz'
-                      b'0123456789' b'_.-')
-
-
-def urljoin_rfc(base, ref, encoding='utf-8'):
-    r"""
-    .. warning::
-
-        This function is deprecated and will be removed in future.
-        It is not supported with Python 3.
-        Please use ``urlparse.urljoin`` instead.
-
-    Same as urlparse.urljoin but supports unicode values in base and ref
-    parameters (in which case they will be converted to str using the given
-    encoding).
-
-    Always returns a str.
-
-    >>> import w3lib.url
-    >>> w3lib.url.urljoin_rfc('http://www.example.com/path/index.html', u'/otherpath/index2.html')
-    'http://www.example.com/otherpath/index2.html'
-    >>>
-
-    >>> # Note: the following does not work in Python 3
-    >>> w3lib.url.urljoin_rfc(b'http://www.example.com/path/index.html', u'fran\u00e7ais/d\u00e9part.htm') # doctest: +SKIP
-    'http://www.example.com/path/fran\xc3\xa7ais/d\xc3\xa9part.htm'
-    >>>
-
-
-    """
-
-    warnings.warn("w3lib.url.urljoin_rfc is deprecated, use urlparse.urljoin instead",
-        DeprecationWarning)
-
-    str_base = to_bytes(base, encoding)
-    str_ref = to_bytes(ref, encoding)
-    return urljoin(str_base, str_ref)
-
-_reserved = b';/?:@&=+$|,#' # RFC 3986 (Generic Syntax)
-_unreserved_marks = b"-_.!~*'()" # RFC 3986 sec 2.3
-_safe_chars = _ALWAYS_SAFE_BYTES + b'%' + _reserved + _unreserved_marks
+_safe_chars = RFC3986_RESERVED + RFC3986_UNRESERVED + EXTRA_SAFE_CHARS + b'%'
 
 def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
     """Convert the given URL into a legal URL by escaping unsafe characters
@@ -117,6 +81,7 @@ def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
         quote(to_bytes(parts.fragment, encoding), _safe_chars),
     ))
 
+
 _parent_dirs = re.compile(r'/?(\.\./)+')
 
 def safe_download_url(url):
@@ -137,9 +102,11 @@ def safe_download_url(url):
         path = '/'
     return urlunsplit((scheme, netloc, path, query, ''))
 
+
 def is_url(text):
     return text.partition("://")[0] in ('file', 'http', 'https')
 
+
 def url_query_parameter(url, parameter, default=None, keep_blank_values=0):
     """Return the value of a url parameter, given the url and parameter name
 
@@ -175,6 +142,7 @@ def url_query_parameter(url, parameter, default=None, keep_blank_values=0):
     )
     return queryparams.get(parameter, [default])[0]
 
+
 def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, unique=True, keep_fragments=False):
     """Clean URL arguments leaving only those passed in the parameterlist keeping order
 
@@ -229,6 +197,7 @@ def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, u
         url += '#' + fragment
     return url
 
+
 def add_or_replace_parameter(url, name, new_value):
     """Add or remove a parameter to a given url
 
@@ -270,13 +239,15 @@ def path_to_file_uri(path):
         x = x.replace('|', ':') # http://bugs.python.org/issue5861
     return 'file:///%s' % x.lstrip('/')
 
+
 def file_uri_to_path(uri):
     """Convert File URI to local filesystem path according to:
     http://en.wikipedia.org/wiki/File_URI_scheme
     """
     uri_path = urlparse(uri).path
     return url2pathname(uri_path)
 
+
 def any_to_uri(uri_or_path):
     """If given a path name, return its File URI, otherwise return it
     unmodified
@@ -584,3 +555,38 @@ def parse_qsl_to_bytes(qs, keep_blank_values=False):
                 value = _coerce_result(value)
                 r.append((name, value))
         return r
+
+
+def urljoin_rfc(base, ref, encoding='utf-8'):
+    r"""
+    .. warning::
+
+        This function is deprecated and will be removed in the future.
+        It is not supported with Python 3.
+        Please use ``urlparse.urljoin`` instead.
+
+    Same as urlparse.urljoin but supports unicode values in base and ref
+    parameters (in which case they will be converted to str using the given
+    encoding).
+
+    Always returns a str.
+
+    >>> import w3lib.url
+    >>> w3lib.url.urljoin_rfc('http://www.example.com/path/index.html', u'/otherpath/index2.html')
+    'http://www.example.com/otherpath/index2.html'
+    >>>
+
+    >>> # Note: the following does not work in Python 3
+    >>> w3lib.url.urljoin_rfc(b'http://www.example.com/path/index.html', u'fran\u00e7ais/d\u00e9part.htm') # doctest: +SKIP
+    'http://www.example.com/path/fran\xc3\xa7ais/d\xc3\xa9part.htm'
+    >>>
+
+
+    """
+
+    warnings.warn("w3lib.url.urljoin_rfc is deprecated, use urlparse.urljoin instead",
+        DeprecationWarning)
+
+    str_base = to_bytes(base, encoding)
+    str_ref = to_bytes(ref, encoding)
+    return urljoin(str_base, str_ref)