|
2 | 2 | This module contains general purpose URL functions not found in the standard |
3 | 3 | library. |
4 | 4 | """ |
| 5 | +import codecs |
5 | 6 | import os |
6 | 7 | import re |
7 | 8 | import posixpath |
|
13 | 14 | from six.moves.urllib.request import pathname2url, url2pathname |
14 | 15 | from w3lib.util import to_bytes, to_native_str, to_unicode |
15 | 16 |
|
| 17 | + |
| 18 | +# error handling function for bytes-to-Unicode decoding errors with URLs |
| 19 | +def _quote_byte(error): |
| 20 | + return (to_unicode(quote(error.object[error.start:error.end])), error.end) |
| 21 | + |
| 22 | +codecs.register_error('percentencode', _quote_byte) |
| 23 | + |
| 24 | + |
16 | 25 | # Python 2.x urllib.always_safe become private in Python 3.x; |
17 | 26 | # its content is copied here |
18 | 27 | _ALWAYS_SAFE_BYTES = (b'ABCDEFGHIJKLMNOPQRSTUVWXYZ' |
@@ -64,24 +73,22 @@ def safe_url_string(url, encoding='utf8', path_encoding='utf8'): |
64 | 73 | encoding (which defaults to 'utf-8'). 'utf-8' encoding is used for |
65 | 74 | URL path component (unless overridden by path_encoding), and given |
66 | 75 | encoding is used for query string or form data. |
67 | | - When passing a encoding, you should use the encoding of the |
68 | | - original page (the page from which the url was extracted from). |
| 76 | + When passing an encoding, you should use the encoding of the |
| 77 | + original page (the page from which the URL was extracted). |
69 | 78 |
|
70 | 79 | Calling this function on an already "safe" URL will return the URL |
71 | 80 | unmodified. |
72 | 81 |
|
73 | 82 | Always returns a native `str` (bytes in Python2, unicode in Python3). |
74 | 83 | """ |
75 | 84 | # Python3's urlsplit() chokes on bytes input with non-ASCII chars, |
76 | | - # so let's decode (to Unicode) using page encoding. |
77 | | - # |
78 | | - # it is assumed that a raw bytes input comes from the page |
79 | | - # corresponding to the encoding |
80 | | - # |
81 | | - # Note: if this assumption is wrong, this will fail; |
82 | | - # in the general case, users are required to use Unicode |
83 | | - # or safe ASCII bytes input |
84 | | - parts = urlsplit(to_unicode(url, encoding=encoding)) |
| 85 | + # so let's decode (to Unicode) using page encoding: |
| 86 | + # - it is assumed that a raw bytes input comes from a document |
| 87 | + # encoded with the supplied encoding (or UTF-8 by default) |
| 88 | + # - if the supplied (or default) encoding chokes, |
| 89 | + # percent-encode offending bytes |
| 90 | + parts = urlsplit(to_unicode(url, encoding=encoding, |
| 91 | + errors='percentencode')) |
85 | 92 |
|
86 | 93 | # quote() in Python2 return type follows input type; |
87 | 94 | # quote() in Python3 always returns Unicode (native str) |
|
0 commit comments