Skip to content

Commit 821dfe5

Browse files
Laerte and Gallaecio authored
Encode hostname directly instead of netloc (#174)
* Encode hostname directly instead of netloc * simplify logic * Attend PR comments * we don't need else * Add more tests, encode username and password properly * run black * empty password test * safe_url_string: encode | and % in userinfo Co-authored-by: Adrián Chaves <[email protected]>
1 parent d9763db commit 821dfe5

File tree

2 files changed

+71
-9
lines changed

2 files changed

+71
-9
lines changed

tests/test_url.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,47 @@ def test_safe_url_string_preserve_nonfragment_hash(self):
303303
"http://www.example.com/path/to/%23/foo/bar?url=http%3A%2F%2Fwww.example.com%2F%2Fpath%2Fto%2F%23%2Fbar%2Ffoo#frag",
304304
)
305305

306+
def test_safe_url_string_encode_idna_domain_with_port(self):
307+
self.assertEqual(
308+
safe_url_string("http://新华网.中国:80"), "http://xn--xkrr14bows.xn--fiqs8s:80"
309+
)
310+
311+
def test_safe_url_string_encode_idna_domain_with_username_password_and_port_number(
312+
self,
313+
):
314+
self.assertEqual(
315+
safe_url_string("ftp://admin:admin@新华网.中国:21"),
316+
"ftp://admin:admin@xn--xkrr14bows.xn--fiqs8s:21",
317+
)
318+
self.assertEqual(
319+
safe_url_string("http://Åsa:abc123@➡.ws:81/admin"),
320+
"http://%C3%85sa:abc123@xn--hgi.ws:81/admin",
321+
)
322+
self.assertEqual(
323+
safe_url_string("http://japão:não@️i❤️.ws:8000/"),
324+
"http://jap%C3%A3o:n%C3%A3o@xn--i-7iq.ws:8000/",
325+
)
326+
327+
def test_safe_url_string_encode_idna_domain_with_username_and_empty_password_and_port_number(
328+
self,
329+
):
330+
self.assertEqual(
331+
safe_url_string("ftp://admin:@新华网.中国:21"),
332+
"ftp://admin:@xn--xkrr14bows.xn--fiqs8s:21",
333+
)
334+
self.assertEqual(
335+
safe_url_string("ftp://admin@新华网.中国:21"),
336+
"ftp://admin@xn--xkrr14bows.xn--fiqs8s:21",
337+
)
338+
339+
def test_safe_url_string_userinfo_unsafe_chars(
340+
self,
341+
):
342+
self.assertEqual(
343+
safe_url_string("ftp://admin:|%@example.com"),
344+
"ftp://admin:%7C%25@example.com",
345+
)
346+
306347
def test_safe_download_url(self):
307348
self.assertEqual(
308349
safe_download_url("http://www.example.org"), "http://www.example.org/"

w3lib/url.py

Lines changed: 30 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ def _quote_byte(error: UnicodeError) -> Tuple[str, int]:
5555

5656
_safe_chars = RFC3986_RESERVED + RFC3986_UNRESERVED + EXTRA_SAFE_CHARS + b"%"
5757
_path_safe_chars = _safe_chars.replace(b"#", b"")
58+
RFC3986_USERINFO_SAFE_CHARS = RFC3986_UNRESERVED + RFC3986_SUB_DELIMS + b":"
5859

5960
_ascii_tab_newline_re = re.compile(
6061
r"[\t\n\r]"
@@ -95,14 +96,34 @@ def safe_url_string(
9596
decoded = to_unicode(url, encoding=encoding, errors="percentencode")
9697
parts = urlsplit(_ascii_tab_newline_re.sub("", decoded))
9798

98-
# IDNA encoding can fail for too long labels (>63 characters)
99-
# or missing labels (e.g. http://.example.com)
100-
try:
101-
netloc_bytes = parts.netloc.encode("idna")
102-
except UnicodeError:
103-
netloc = parts.netloc
104-
else:
105-
netloc = netloc_bytes.decode()
99+
username, password, hostname, port = (
100+
parts.username,
101+
parts.password,
102+
parts.hostname,
103+
parts.port,
104+
)
105+
netloc_bytes = b""
106+
if username is not None or password is not None:
107+
if username is not None:
108+
safe_username = quote(username, RFC3986_USERINFO_SAFE_CHARS)
109+
netloc_bytes += safe_username.encode(encoding)
110+
if password is not None:
111+
netloc_bytes += b":"
112+
safe_password = quote(password, RFC3986_USERINFO_SAFE_CHARS)
113+
netloc_bytes += safe_password.encode(encoding)
114+
netloc_bytes += b"@"
115+
if hostname is not None:
116+
try:
117+
netloc_bytes += hostname.encode("idna")
118+
except UnicodeError:
119+
# IDNA encoding can fail for too long labels (>63 characters) or
120+
# missing labels (e.g. http://.example.com)
121+
netloc_bytes += hostname.encode(encoding)
122+
if port is not None:
123+
netloc_bytes += b":"
124+
netloc_bytes += str(port).encode(encoding)
125+
126+
netloc = netloc_bytes.decode()
106127

107128
# default encoding for path component SHOULD be UTF-8
108129
if quote_path:
@@ -113,7 +134,7 @@ def safe_url_string(
113134
return urlunsplit(
114135
(
115136
parts.scheme,
116-
netloc.rstrip(":"),
137+
netloc,
117138
path,
118139
quote(parts.query.encode(encoding), _safe_chars),
119140
quote(parts.fragment.encode(encoding), _safe_chars),

0 commit comments

Comments
 (0)