Handle bytes URL with wrong encoding by percent-encoding offending bytes

redapple · redapple · commit 4eb7db7effc4 · 2016-04-07T02:00:25.000+02:00
diff --git a/tests/test_url.py b/tests/test_url.py
@@ -106,6 +106,19 @@ def test_safe_url_string_bytes_input(self):
         self.assertTrue(isinstance(safeurl, str))
         self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%B5")
 
+    def test_safe_url_string_bytes_input_nonutf8(self):
+        # latin1 bytes, invalid as UTF-8: each one is percent-encoded as-is
+        safeurl = safe_url_string(b"http://www.example.com/\xa3?unit=\xb5")
+        self.assertTrue(isinstance(safeurl, str))
+        self.assertEqual(safeurl, "http://www.example.com/%A3?unit=%B5")
+
+        # cp1251 bytes, invalid as UTF-8: each one is percent-encoded as-is
+        # >>> u'Россия'.encode('cp1251')
+        # '\xd0\xee\xf1\xf1\xe8\xff'
+        safeurl = safe_url_string(b"http://www.example.com/country/\xd0\xee\xf1\xf1\xe8\xff")
+        self.assertTrue(isinstance(safeurl, str))
+        self.assertEqual(safeurl, "http://www.example.com/country/%D0%EE%F1%F1%E8%FF")
+
     def test_safe_url_idna(self):
         # adapted from:
         # https://ssl.icu-project.org/icu-bin/idnbrowser
diff --git a/w3lib/url.py b/w3lib/url.py
@@ -2,6 +2,7 @@
 This module contains general purpose URL functions not found in the standard
 library.
 """
+import codecs
 import os
 import re
 import posixpath
@@ -13,6 +14,14 @@
 from six.moves.urllib.request import pathname2url, url2pathname
 from w3lib.util import to_bytes, to_native_str, to_unicode
 
+
+# decoding error handler for URLs: percent-encode the undecodable bytes
+def _quote_byte(error):
+    return (to_unicode(quote(error.object[error.start:error.end])), error.end)
+
+codecs.register_error('percentencode', _quote_byte)
+
+
 # Python 2.x urllib.always_safe become private in Python 3.x;
 # its content is copied here
 _ALWAYS_SAFE_BYTES = (b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
@@ -64,24 +73,22 @@ def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
     encoding (which defaults to 'utf-8'). 'utf-8' encoding is used for
     URL path component (unless overriden by path_encoding), and given
     encoding is used for query string or form data.
-    When passing a encoding, you should use the encoding of the
-    original page (the page from which the url was extracted from).
+    When passing an encoding, you should use the encoding of the
+    original page (the page from which the URL was extracted).
 
     Calling this function on an already "safe" URL will return the URL
     unmodified.
 
     Always returns a native `str` (bytes in Python2, unicode in Python3).
     """
     # Python3's urlsplit() chokes on bytes input with non-ASCII chars,
-    # so let's decode (to Unicode) using page encoding.
-    #
-    # it is assumed that a raw bytes input comes from the page
-    # corresponding to the encoding
-    #
-    # Note: if this assumption is wrong, this will fail;
-    #       in the general case, users are required to use Unicode
-    #       or safe ASCII bytes input
-    parts = urlsplit(to_unicode(url, encoding=encoding))
+    # so let's decode (to Unicode) using page encoding:
+    #   - it is assumed that a raw bytes input comes from a document
+    #     encoded with the supplied encoding (or UTF-8 by default)
+    #   - if decoding with the supplied (or default) encoding fails,
+    #     percent-encode the offending bytes
+    parts = urlsplit(to_unicode(url, encoding=encoding,
+                                errors='percentencode'))
 
     # quote() in Python2 return type follows input type;
     # quote() in Python3 always returns Unicode (native str)