Skip to content

Commit 63f391b

Browse files
authored
Merge pull request #25 from kmike/rfc3986
[MRG] use constants from RFC3986
2 parents 1095a42 + b92a222 commit 63f391b

File tree

3 files changed

+56
-46
lines changed

3 files changed

+56
-46
lines changed

tests/test_url.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,10 @@ def test_safe_url_string(self):
5959

6060
self.assertTrue(isinstance(safe_url_string(b'http://example.com/'), str))
6161

62+
def test_safe_url_string_unsafe_chars(self):
63+
safeurl = safe_url_string(r"http://localhost:8001/unwise{,},|,\,^,[,],`?|=[]&[]=|")
64+
self.assertEqual(safeurl, r"http://localhost:8001/unwise%7B,%7D,|,%5C,%5E,[,],%60?|=[]&[]=|")
65+
6266
def test_safe_url_string_with_query(self):
6367
safeurl = safe_url_string(u"http://www.example.com/£?unit=µ")
6468
self.assertTrue(isinstance(safeurl, str))

tox.ini

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ envlist = py27, pypy, py33, py34, py35, py36
88

99
[testenv]
1010
deps =
11-
pytest
11+
pytest !=3.1.1, !=3.1.2
1212
pytest-cov
1313
commands =
1414
py.test \

w3lib/url.py

Lines changed: 51 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,9 @@
88
import re
99
import posixpath
1010
import warnings
11-
import six
11+
import string
1212
from collections import namedtuple
13+
import six
1314
from six.moves.urllib.parse import (urljoin, urlsplit, urlunsplit,
1415
urldefrag, urlencode, urlparse,
1516
quote, parse_qs, parse_qsl,
@@ -24,51 +25,14 @@ def _quote_byte(error):
2425

2526
codecs.register_error('percentencode', _quote_byte)
2627

28+
# constants from RFC 3986, Sections 2.2 and 2.3
29+
RFC3986_GEN_DELIMS = b':/?#[]@'
30+
RFC3986_SUB_DELIMS = b"!$&'()*+,;="
31+
RFC3986_RESERVED = RFC3986_GEN_DELIMS + RFC3986_SUB_DELIMS
32+
RFC3986_UNRESERVED = (string.ascii_letters + string.digits + "-._~").encode('ascii')
33+
EXTRA_SAFE_CHARS = b'|' # see https://github.com/scrapy/w3lib/pull/25
2734

28-
# Python 2.x urllib.always_safe became private in Python 3.x;
29-
# its content is copied here
30-
_ALWAYS_SAFE_BYTES = (b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
31-
b'abcdefghijklmnopqrstuvwxyz'
32-
b'0123456789' b'_.-')
33-
34-
35-
def urljoin_rfc(base, ref, encoding='utf-8'):
36-
r"""
37-
.. warning::
38-
39-
This function is deprecated and will be removed in a future release.
40-
It is not supported with Python 3.
41-
Please use ``urlparse.urljoin`` instead.
42-
43-
Same as urlparse.urljoin but supports unicode values in base and ref
44-
parameters (in which case they will be converted to str using the given
45-
encoding).
46-
47-
Always returns a str.
48-
49-
>>> import w3lib.url
50-
>>> w3lib.url.urljoin_rfc('http://www.example.com/path/index.html', u'/otherpath/index2.html')
51-
'http://www.example.com/otherpath/index2.html'
52-
>>>
53-
54-
>>> # Note: the following does not work in Python 3
55-
>>> w3lib.url.urljoin_rfc(b'http://www.example.com/path/index.html', u'fran\u00e7ais/d\u00e9part.htm') # doctest: +SKIP
56-
'http://www.example.com/path/fran\xc3\xa7ais/d\xc3\xa9part.htm'
57-
>>>
58-
59-
60-
"""
61-
62-
warnings.warn("w3lib.url.urljoin_rfc is deprecated, use urlparse.urljoin instead",
63-
DeprecationWarning)
64-
65-
str_base = to_bytes(base, encoding)
66-
str_ref = to_bytes(ref, encoding)
67-
return urljoin(str_base, str_ref)
68-
69-
_reserved = b';/?:@&=+$|,#' # RFC 3986 (Generic Syntax)
70-
_unreserved_marks = b"-_.!~*'()" # RFC 3986 sec 2.3
71-
_safe_chars = _ALWAYS_SAFE_BYTES + b'%' + _reserved + _unreserved_marks
35+
_safe_chars = RFC3986_RESERVED + RFC3986_UNRESERVED + EXTRA_SAFE_CHARS + b'%'
7236

7337
def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
7438
"""Convert the given URL into a legal URL by escaping unsafe characters
@@ -117,6 +81,7 @@ def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
11781
quote(to_bytes(parts.fragment, encoding), _safe_chars),
11882
))
11983

84+
12085
_parent_dirs = re.compile(r'/?(\.\./)+')
12186

12287
def safe_download_url(url):
@@ -137,9 +102,11 @@ def safe_download_url(url):
137102
path = '/'
138103
return urlunsplit((scheme, netloc, path, query, ''))
139104

105+
140106
def is_url(text):
141107
return text.partition("://")[0] in ('file', 'http', 'https')
142108

109+
143110
def url_query_parameter(url, parameter, default=None, keep_blank_values=0):
144111
"""Return the value of a url parameter, given the url and parameter name
145112
@@ -175,6 +142,7 @@ def url_query_parameter(url, parameter, default=None, keep_blank_values=0):
175142
)
176143
return queryparams.get(parameter, [default])[0]
177144

145+
178146
def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, unique=True, keep_fragments=False):
179147
"""Clean URL arguments leaving only those passed in the parameterlist keeping order
180148
@@ -229,6 +197,7 @@ def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, u
229197
url += '#' + fragment
230198
return url
231199

200+
232201
def add_or_replace_parameter(url, name, new_value):
233202
"""Add or replace a parameter in a given url
234203
@@ -270,13 +239,15 @@ def path_to_file_uri(path):
270239
x = x.replace('|', ':') # http://bugs.python.org/issue5861
271240
return 'file:///%s' % x.lstrip('/')
272241

242+
273243
def file_uri_to_path(uri):
274244
"""Convert File URI to local filesystem path according to:
275245
http://en.wikipedia.org/wiki/File_URI_scheme
276246
"""
277247
uri_path = urlparse(uri).path
278248
return url2pathname(uri_path)
279249

250+
280251
def any_to_uri(uri_or_path):
281252
"""If given a path name, return its File URI, otherwise return it
282253
unmodified
@@ -584,3 +555,38 @@ def parse_qsl_to_bytes(qs, keep_blank_values=False):
584555
value = _coerce_result(value)
585556
r.append((name, value))
586557
return r
558+
559+
560+
def urljoin_rfc(base, ref, encoding='utf-8'):
561+
r"""
562+
.. warning::
563+
564+
This function is deprecated and will be removed in a future release.
565+
It is not supported with Python 3.
566+
Please use ``urlparse.urljoin`` instead.
567+
568+
Same as urlparse.urljoin but supports unicode values in base and ref
569+
parameters (in which case they will be converted to str using the given
570+
encoding).
571+
572+
Always returns a str.
573+
574+
>>> import w3lib.url
575+
>>> w3lib.url.urljoin_rfc('http://www.example.com/path/index.html', u'/otherpath/index2.html')
576+
'http://www.example.com/otherpath/index2.html'
577+
>>>
578+
579+
>>> # Note: the following does not work in Python 3
580+
>>> w3lib.url.urljoin_rfc(b'http://www.example.com/path/index.html', u'fran\u00e7ais/d\u00e9part.htm') # doctest: +SKIP
581+
'http://www.example.com/path/fran\xc3\xa7ais/d\xc3\xa9part.htm'
582+
>>>
583+
584+
585+
"""
586+
587+
warnings.warn("w3lib.url.urljoin_rfc is deprecated, use urlparse.urljoin instead",
588+
DeprecationWarning)
589+
590+
str_base = to_bytes(base, encoding)
591+
str_ref = to_bytes(ref, encoding)
592+
return urljoin(str_base, str_ref)

0 commit comments

Comments
 (0)