Skip to content

Commit 4eb7db7

Browse files
committed
Handle bytes URL with wrong encoding by percent-encoding offending bytes
1 parent c16c682 commit 4eb7db7

File tree

2 files changed

+31
-11
lines changed

2 files changed

+31
-11
lines changed

tests/test_url.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,19 @@ def test_safe_url_string_bytes_input(self):
106106
self.assertTrue(isinstance(safeurl, str))
107107
self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%B5")
108108

109+
def test_safe_url_string_bytes_input_nonutf8(self):
    # Bytes URLs that are not valid in the default encoding: each case pairs
    # the raw input with the percent-encoded native string expected back.
    cases = (
        # latin1
        (b"http://www.example.com/\xa3?unit=\xb5",
         "http://www.example.com/%A3?unit=%B5"),
        # cp1251
        # >>> u'Россия'.encode('cp1251')
        # '\xd0\xee\xf1\xf1\xe8\xff'
        (b"http://www.example.com/country/\xd0\xee\xf1\xf1\xe8\xff",
         "http://www.example.com/country/%D0%EE%F1%F1%E8%FF"),
    )
    for raw_url, expected in cases:
        result = safe_url_string(raw_url)
        self.assertTrue(isinstance(result, str))
        self.assertEqual(result, expected)
121+
109122
def test_safe_url_idna(self):
110123
# adapted from:
111124
# https://ssl.icu-project.org/icu-bin/idnbrowser

w3lib/url.py

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
This module contains general purpose URL functions not found in the standard
33
library.
44
"""
5+
import codecs
56
import os
67
import re
78
import posixpath
@@ -13,6 +14,14 @@
1314
from six.moves.urllib.request import pathname2url, url2pathname
1415
from w3lib.util import to_bytes, to_native_str, to_unicode
1516

17+
18+
# Codec error handler used when decoding URL bytes to Unicode fails:
# the bytes that could not be decoded are replaced by their percent-encoded
# form instead of raising.
def _quote_byte(error):
    # `error` carries the input being decoded (`object`) and the span of
    # the offending bytes (`start`..`end`).
    offending = error.object[error.start:error.end]
    replacement = to_unicode(quote(offending))
    # Error handlers return (replacement text, position to resume from).
    return (replacement, error.end)


# Make the handler available as errors='percentencode'.
codecs.register_error('percentencode', _quote_byte)
23+
24+
1625
# Python 2.x urllib.always_safe become private in Python 3.x;
1726
# its content is copied here
1827
_ALWAYS_SAFE_BYTES = (b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
@@ -64,24 +73,22 @@ def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
6473
encoding (which defaults to 'utf-8'). 'utf-8' encoding is used for
6574
URL path component (unless overridden by path_encoding), and given
6675
encoding is used for query string or form data.
67-
When passing a encoding, you should use the encoding of the
68-
original page (the page from which the url was extracted from).
76+
When passing an encoding, you should use the encoding of the
77+
original page (the page from which the URL was extracted).
6978
7079
Calling this function on an already "safe" URL will return the URL
7180
unmodified.
7281
7382
Always returns a native `str` (bytes in Python2, unicode in Python3).
7483
"""
7584
# Python3's urlsplit() chokes on bytes input with non-ASCII chars,
76-
# so let's decode (to Unicode) using page encoding.
77-
#
78-
# it is assumed that a raw bytes input comes from the page
79-
# corresponding to the encoding
80-
#
81-
# Note: if this assumption is wrong, this will fail;
82-
# in the general case, users are required to use Unicode
83-
# or safe ASCII bytes input
84-
parts = urlsplit(to_unicode(url, encoding=encoding))
85+
# so let's decode (to Unicode) using page encoding:
86+
# - it is assumed that a raw bytes input comes from a document
87+
# encoded with the supplied encoding (or UTF8 by default)
88+
# - if the supplied (or default) encoding chokes,
89+
# percent-encode offending bytes
90+
parts = urlsplit(to_unicode(url, encoding=encoding,
91+
errors='percentencode'))
8592

8693
# quote() in Python2 return type follows input type;
8794
# quote() in Python3 always returns Unicode (native str)

0 commit comments

Comments
 (0)