Merge pull request #170 from Gallaecio/deprecate-python2-api

wRAR · web-flow · commit 0fd159095535 · 2021-07-26T16:56:51.000+05:00
diff --git a/tests/test_util.py b/tests/test_util.py
@@ -0,0 +1,46 @@
+from unittest import TestCase
+
+from pytest import deprecated_call, raises
+
+from w3lib.util import (
+    str_to_unicode,
+    to_bytes,
+    to_native_str,
+    to_unicode,
+    unicode_to_str,
+)
+
+
+class StrToUnicodeTestCase(TestCase):
+
+    def test_deprecation(self):
+        with deprecated_call():
+            str_to_unicode('')
+
+
+class ToBytesTestCase(TestCase):
+
+    def test_type_error(self):
+        with raises(TypeError):
+            to_bytes(True)
+
+
+class ToNativeStrTestCase(TestCase):
+
+    def test_deprecation(self):
+        with deprecated_call():
+            to_native_str('')
+
+
+class ToUnicodeTestCase(TestCase):
+
+    def test_type_error(self):
+        with raises(TypeError):
+            to_unicode(True)
+
+
+class UnicodeToStrTestCase(TestCase):
+
+    def test_deprecation(self):
+        with deprecated_call():
+            unicode_to_str('')
diff --git a/w3lib/url.py b/w3lib/url.py
@@ -24,12 +24,12 @@
     urlunsplit,
 )
 from urllib.request import pathname2url, url2pathname
-from w3lib.util import to_bytes, to_native_str, to_unicode
+from w3lib.util import to_unicode
 
 
 # error handling function for bytes-to-Unicode decoding errors with URLs
 def _quote_byte(error):
-    return (to_unicode(quote(error.object[error.start:error.end])), error.end)
+    return (quote(error.object[error.start:error.end]), error.end)
 
 codecs.register_error('percentencode', _quote_byte)
 
@@ -77,26 +77,22 @@ def safe_url_string(url, encoding='utf8', path_encoding='utf8', quote_path=True)
     # IDNA encoding can fail for too long labels (>63 characters)
     # or missing labels (e.g. http://.example.com)
     try:
-        netloc = parts.netloc.encode('idna')
+        netloc = parts.netloc.encode('idna').decode()
     except UnicodeError:
         netloc = parts.netloc
 
     # default encoding for path component SHOULD be UTF-8
     if quote_path:
-        path = quote(to_bytes(parts.path, path_encoding), _path_safe_chars)
+        path = quote(parts.path.encode(path_encoding), _path_safe_chars)
     else:
-        path = to_native_str(parts.path)
+        path = parts.path
     
-    # quote() in Python2 return type follows input type;
-    # quote() in Python3 always returns Unicode (native str)
     return urlunsplit((
-        to_native_str(parts.scheme),
-        to_native_str(netloc).rstrip(':'),
+        parts.scheme,
+        netloc.rstrip(':'),
         path,
-        # encoding of query and fragment follows page encoding
-        # or form-charset (if known and passed)
-        quote(to_bytes(parts.query, encoding), _safe_chars),
-        quote(to_bytes(parts.fragment, encoding), _safe_chars),
+        quote(parts.query.encode(encoding), _safe_chars),
+        quote(parts.fragment.encode(encoding), _safe_chars),
     ))
 
 
@@ -410,22 +406,17 @@ def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
     # IDNA encoding can fail for too long labels (>63 characters)
     # or missing labels (e.g. http://.example.com)
     try:
-        netloc = parts.netloc.encode('idna')
+        netloc = parts.netloc.encode('idna').decode()
     except UnicodeError:
         netloc = parts.netloc
 
     return (
-        to_native_str(parts.scheme),
-        to_native_str(netloc),
-
-        # default encoding for path component SHOULD be UTF-8
-        quote(to_bytes(parts.path, path_encoding), _path_safe_chars),
-        quote(to_bytes(parts.params, path_encoding), _safe_chars),
-
-        # encoding of query and fragment follows page encoding
-        # or form-charset (if known and passed)
-        quote(to_bytes(parts.query, encoding), _safe_chars),
-        quote(to_bytes(parts.fragment, encoding), _safe_chars)
+        parts.scheme,
+        netloc,
+        quote(parts.path.encode(path_encoding), _path_safe_chars),
+        quote(parts.params.encode(path_encoding), _safe_chars),
+        quote(parts.query.encode(encoding), _safe_chars),
+        quote(parts.fragment.encode(encoding), _safe_chars)
     )
 
 
@@ -466,7 +457,7 @@ def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
     # if not for proper URL expected by remote website.
     try:
         scheme, netloc, path, params, query, fragment = _safe_ParseResult(
-            parse_url(url), encoding=encoding)
+            parse_url(url), encoding=encoding or 'utf8')
     except UnicodeEncodeError as e:
         scheme, netloc, path, params, query, fragment = _safe_ParseResult(
             parse_url(url), encoding='utf8')
diff --git a/w3lib/util.py b/w3lib/util.py
@@ -1,11 +1,26 @@
+from warnings import warn
+
+
 def str_to_unicode(text, encoding=None, errors='strict'):
+    warn(
+        "The w3lib.util.str_to_unicode function is deprecated and "
+        "will be removed in a future release.",
+        DeprecationWarning,
+        stacklevel=2,
+    )
     if encoding is None:
         encoding = 'utf-8'
     if isinstance(text, bytes):
         return text.decode(encoding, errors)
     return text
 
 def unicode_to_str(text, encoding=None, errors='strict'):
+    warn(
+        "The w3lib.util.unicode_to_str function is deprecated and "
+        "will be removed in a future release.",
+        DeprecationWarning,
+        stacklevel=2,
+    )
     if encoding is None:
         encoding = 'utf-8'
     if isinstance(text, str):
@@ -18,8 +33,9 @@ def to_unicode(text, encoding=None, errors='strict'):
     if isinstance(text, str):
         return text
     if not isinstance(text, (bytes, str)):
-        raise TypeError('to_unicode must receive a bytes, str or unicode '
-                        'object, got %s' % type(text).__name__)
+        raise TypeError(
+            f'to_unicode must receive bytes or str, got {type(text).__name__}'
+        )
     if encoding is None:
         encoding = 'utf-8'
     return text.decode(encoding, errors)
@@ -30,12 +46,20 @@ def to_bytes(text, encoding=None, errors='strict'):
     if isinstance(text, bytes):
         return text
     if not isinstance(text, str):
-        raise TypeError('to_bytes must receive a unicode, str or bytes '
-                        'object, got %s' % type(text).__name__)
+        raise TypeError(
+            f'to_bytes must receive str or bytes, got {type(text).__name__}'
+        )
     if encoding is None:
         encoding = 'utf-8'
     return text.encode(encoding, errors)
 
 def to_native_str(text, encoding=None, errors='strict'):
     """ Return str representation of `text` """
+    warn(
+        "The w3lib.util.to_native_str function is deprecated and "
+        "will be removed in a future release. Please use "
+        "w3lib.util.to_unicode instead.",
+        DeprecationWarning,
+        stacklevel=2,
+    )
     return to_unicode(text, encoding, errors)