|
24 | 24 | urlunsplit, |
25 | 25 | ) |
26 | 26 | from urllib.request import pathname2url, url2pathname |
27 | | -from w3lib.util import to_bytes, to_native_str, to_unicode |
| 27 | +from w3lib.util import to_unicode |
28 | 28 |
|
29 | 29 |
|
30 | 30 | # error handling function for bytes-to-Unicode decoding errors with URLs |
31 | 31 | def _quote_byte(error): |
32 | | - return (to_unicode(quote(error.object[error.start:error.end])), error.end) |
| 32 | + return (quote(error.object[error.start:error.end]), error.end) |
33 | 33 |
|
34 | 34 | codecs.register_error('percentencode', _quote_byte) |
35 | 35 |
|
@@ -77,26 +77,22 @@ def safe_url_string(url, encoding='utf8', path_encoding='utf8', quote_path=True) |
77 | 77 | # IDNA encoding can fail for too long labels (>63 characters) |
78 | 78 | # or missing labels (e.g. http://.example.com) |
79 | 79 | try: |
80 | | - netloc = parts.netloc.encode('idna') |
| 80 | + netloc = parts.netloc.encode('idna').decode() |
81 | 81 | except UnicodeError: |
82 | 82 | netloc = parts.netloc |
83 | 83 |
|
84 | 84 | # default encoding for path component SHOULD be UTF-8 |
85 | 85 | if quote_path: |
86 | | - path = quote(to_bytes(parts.path, path_encoding), _path_safe_chars) |
| 86 | + path = quote(parts.path.encode(path_encoding), _path_safe_chars) |
87 | 87 | else: |
88 | | - path = to_native_str(parts.path) |
| 88 | + path = parts.path |
89 | 89 |
|
90 | | - # quote() in Python2 return type follows input type; |
91 | | - # quote() in Python3 always returns Unicode (native str) |
92 | 90 | return urlunsplit(( |
93 | | - to_native_str(parts.scheme), |
94 | | - to_native_str(netloc).rstrip(':'), |
| 91 | + parts.scheme, |
| 92 | + netloc.rstrip(':'), |
95 | 93 | path, |
96 | | - # encoding of query and fragment follows page encoding |
97 | | - # or form-charset (if known and passed) |
98 | | - quote(to_bytes(parts.query, encoding), _safe_chars), |
99 | | - quote(to_bytes(parts.fragment, encoding), _safe_chars), |
| 94 | + quote(parts.query.encode(encoding), _safe_chars), |
| 95 | + quote(parts.fragment.encode(encoding), _safe_chars), |
100 | 96 | )) |
101 | 97 |
|
102 | 98 |
|
@@ -410,22 +406,17 @@ def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'): |
410 | 406 | # IDNA encoding can fail for too long labels (>63 characters) |
411 | 407 | # or missing labels (e.g. http://.example.com) |
412 | 408 | try: |
413 | | - netloc = parts.netloc.encode('idna') |
| 409 | + netloc = parts.netloc.encode('idna').decode() |
414 | 410 | except UnicodeError: |
415 | 411 | netloc = parts.netloc |
416 | 412 |
|
417 | 413 | return ( |
418 | | - to_native_str(parts.scheme), |
419 | | - to_native_str(netloc), |
420 | | - |
421 | | - # default encoding for path component SHOULD be UTF-8 |
422 | | - quote(to_bytes(parts.path, path_encoding), _path_safe_chars), |
423 | | - quote(to_bytes(parts.params, path_encoding), _safe_chars), |
424 | | - |
425 | | - # encoding of query and fragment follows page encoding |
426 | | - # or form-charset (if known and passed) |
427 | | - quote(to_bytes(parts.query, encoding), _safe_chars), |
428 | | - quote(to_bytes(parts.fragment, encoding), _safe_chars) |
| 414 | + parts.scheme, |
| 415 | + netloc, |
| 416 | + quote(parts.path.encode(path_encoding), _path_safe_chars), |
| 417 | + quote(parts.params.encode(path_encoding), _safe_chars), |
| 418 | + quote(parts.query.encode(encoding), _safe_chars), |
| 419 | + quote(parts.fragment.encode(encoding), _safe_chars) |
429 | 420 | ) |
430 | 421 |
|
431 | 422 |
|
@@ -466,7 +457,7 @@ def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, |
466 | 457 | # if not for proper URL expected by remote website. |
467 | 458 | try: |
468 | 459 | scheme, netloc, path, params, query, fragment = _safe_ParseResult( |
469 | | - parse_url(url), encoding=encoding) |
| 460 | + parse_url(url), encoding=encoding or 'utf8') |
470 | 461 | except UnicodeEncodeError as e: |
471 | 462 | scheme, netloc, path, params, query, fragment = _safe_ParseResult( |
472 | 463 | parse_url(url), encoding='utf8') |
|
0 commit comments