Skip to content

Commit 0b891dc

Browse files
maramsumanth authored and Gallaecio committed
safe_url_string: make path quoting optional (#119)
1 parent 468a569 commit 0b891dc

File tree

2 files changed

+25
-8
lines changed

2 files changed

+25
-8
lines changed

tests/test_url.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,17 @@ def test_safe_url_string_remove_ascii_tab_and_newlines(self):
7676
def test_safe_url_string_unsafe_chars(self):
7777
safeurl = safe_url_string(r"http://localhost:8001/unwise{,},|,\,^,[,],`?|=[]&[]=|")
7878
self.assertEqual(safeurl, r"http://localhost:8001/unwise%7B,%7D,|,%5C,%5E,[,],%60?|=[]&[]=|")
79+
80+
def test_safe_url_string_quote_path(self):
81+
safeurl = safe_url_string(u'http://google.com/"hello"', quote_path=True)
82+
self.assertEqual(safeurl, u'http://google.com/%22hello%22')
83+
84+
safeurl = safe_url_string(u'http://google.com/"hello"', quote_path=False)
85+
self.assertEqual(safeurl, u'http://google.com/"hello"')
86+
87+
safeurl = safe_url_string(u'http://google.com/"hello"')
88+
self.assertEqual(safeurl, u'http://google.com/%22hello%22')
89+
7990

8091
def test_safe_url_string_with_query(self):
8192
safeurl = safe_url_string(u"http://www.example.com/£?unit=µ")

w3lib/url.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -36,15 +36,18 @@ def _quote_byte(error):
3636

3737
_ascii_tab_newline_re = re.compile(r'[\t\n\r]') # see https://infra.spec.whatwg.org/#ascii-tab-or-newline
3838

39-
def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
39+
def safe_url_string(url, encoding='utf8', path_encoding='utf8', quote_path=True):
4040
"""Convert the given URL into a legal URL by escaping unsafe characters
4141
according to RFC-3986. Also, ASCII tabs and newlines are removed
4242
as per https://url.spec.whatwg.org/#url-parsing.
4343
4444
If a bytes URL is given, it is first converted to `str` using the given
45-
encoding (which defaults to 'utf-8'). 'utf-8' encoding is used for
46-
URL path component (unless overriden by path_encoding), and given
47-
encoding is used for query string or form data.
45+
encoding (which defaults to 'utf-8'). If quote_path is True (default),
46+
path_encoding ('utf-8' by default) is used to encode URL path component
47+
which is then quoted. Otherwise, if quote_path is False, path component
48+
is not encoded or quoted. Given encoding is used for query string
49+
or form data.
50+
4851
When passing an encoding, you should use the encoding of the
4952
original page (the page from which the URL was extracted).
5053
@@ -69,15 +72,18 @@ def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
6972
except UnicodeError:
7073
netloc = parts.netloc
7174

75+
# default encoding for path component SHOULD be UTF-8
76+
if quote_path:
77+
path = quote(to_bytes(parts.path, path_encoding), _safe_chars)
78+
else:
79+
path = to_native_str(parts.path)
80+
7281
# quote() in Python2 return type follows input type;
7382
# quote() in Python3 always returns Unicode (native str)
7483
return urlunsplit((
7584
to_native_str(parts.scheme),
7685
to_native_str(netloc).rstrip(':'),
77-
78-
# default encoding for path component SHOULD be UTF-8
79-
quote(to_bytes(parts.path, path_encoding), _safe_chars),
80-
86+
path,
8187
# encoding of query and fragment follows page encoding
8288
# or form-charset (if known and passed)
8389
quote(to_bytes(parts.query, encoding), _safe_chars),

0 commit comments

Comments
 (0)