[compat] make to_unicode more robust

brettlangdon · brettlangdon · commit 9c687b0fc5a4 · 2016-10-31T11:08:53.000-04:00
diff --git a/ddtrace/compat.py b/ddtrace/compat.py
@@ -37,10 +37,21 @@ def iteritems(obj, **kwargs):
 
 def to_unicode(s):
     """ Return a unicode string for the given bytes or string instance. """
-    if hasattr(s, "decode"):
-        return s.decode("utf-8")
-    else:
-        return stringify(s)
+    # No reason to decode if we already have the unicode-compatible object we expect
+    # DEV: `stringify` will be a `str` for python 3 and `unicode` for python 2
+    # DEV: In python 2, decoding a `unicode` a second time can cause a `UnicodeEncodeError`
+    #   e.g. `'\xc3\xbf'.decode('utf-8').decode('utf-8')`
+    if isinstance(s, stringify):
+        return s
+
+    # If the object has a `decode` method, then decode it as `utf-8`
+    #   e.g. Python 2 `str`, Python 3 `bytes`, Python 2/3 `bytearray`, etc
+    if hasattr(s, 'decode'):
+        return s.decode('utf-8')
+
+    # Otherwise, coerce the object into the `stringify` type we expect
+    #   e.g. `to_unicode(1)`, `to_unicode(dict(key='value'))`
+    return stringify(s)
 
 if PY2:
     numeric_types = (int, long, float)
diff --git a/tests/test_compat.py b/tests/test_compat.py
@@ -0,0 +1,93 @@
+# -*- coding: utf-8 -*-
+# Define source file encoding to support raw unicode characters in Python 2
+
+# Third party
+from nose.tools import eq_
+
+# Project
+from ddtrace.compat import to_unicode, PY2
+
+
+# Use a different test suite for each Python version; this allows us to test the expected
+#   results for each Python version rather than writing a generic "works for both" test suite
+if PY2:
+    class TestCompatPY2(object):
+        def test_to_unicode_string(self):
+            """ Calling `compat.to_unicode` on a non-unicode string """
+            res = to_unicode('test')
+            eq_(type(res), unicode)
+            eq_(res, 'test')
+
+        def test_to_unicode_unicode_encoded(self):
+            """ Calling `compat.to_unicode` on a UTF-8 encoded byte string """
+            res = to_unicode('\xc3\xbf')
+            eq_(type(res), unicode)
+            eq_(res, u'ÿ')
+
+        def test_to_unicode_unicode_double_decode(self):
+            """ Calling `compat.to_unicode` on an already decoded `unicode` string """
+            # This represents the double-decode issue, which can cause a `UnicodeEncodeError`
+            #   `'\xc3\xbf'.decode('utf-8').decode('utf-8')`
+            res = to_unicode('\xc3\xbf'.decode('utf-8'))
+            eq_(type(res), unicode)
+            eq_(res, u'ÿ')
+
+        def test_to_unicode_unicode_string(self):
+            """ Calling `compat.to_unicode` on a unicode string """
+            res = to_unicode(u'ÿ')
+            eq_(type(res), unicode)
+            eq_(res, u'ÿ')
+
+        def test_to_unicode_bytearray(self):
+            """ Calling `compat.to_unicode` with a `bytearray` containing UTF-8 encoded text """
+            res = to_unicode(bytearray('\xc3\xbf'))
+            eq_(type(res), unicode)
+            eq_(res, u'ÿ')
+
+        def test_to_unicode_bytearray_double_decode(self):
+            """ Calling `compat.to_unicode` with the already decoded contents of a `bytearray` """
+            # This represents the double-decode issue, which can cause a `UnicodeEncodeError`
+            #   `bytearray('\xc3\xbf').decode('utf-8').decode('utf-8')`
+            res = to_unicode(bytearray('\xc3\xbf').decode('utf-8'))
+            eq_(type(res), unicode)
+            eq_(res, u'ÿ')
+
+        def test_to_unicode_non_string(self):
+            """ Calling `compat.to_unicode` on non-string types """
+            eq_(to_unicode(1), u'1')
+            eq_(to_unicode(True), u'True')
+            eq_(to_unicode(None), u'None')
+            eq_(to_unicode(dict(key='value')), u'{\'key\': \'value\'}')
+
+else:
+    class TestCompatPY3(object):
+        def test_to_unicode_string(self):
+            """ Calling `compat.to_unicode` on an ASCII-only string """
+            res = to_unicode('test')
+            eq_(type(res), str)
+            eq_(res, 'test')
+
+        def test_to_unicode_unicode_encoded(self):
+            """ Calling `compat.to_unicode` on a string with an escaped non-ASCII character """
+            res = to_unicode('\xff')
+            eq_(type(res), str)
+            eq_(res, 'ÿ')
+
+        def test_to_unicode_unicode_string(self):
+            """ Calling `compat.to_unicode` on a unicode string """
+            res = to_unicode('ÿ')
+            eq_(type(res), str)
+            eq_(res, 'ÿ')
+
+        def test_to_unicode_bytearray(self):
+            """ Calling `compat.to_unicode` with a `bytearray` containing UTF-8 encoded text """
+            res = to_unicode(bytearray('\xff', 'utf-8'))
+            eq_(type(res), str)
+            eq_(res, 'ÿ')
+
+        def test_to_unicode_non_string(self):
+            """ Calling `compat.to_unicode` on non-string types """
+            eq_(to_unicode(1), '1')
+            eq_(to_unicode(True), 'True')
+            eq_(to_unicode(None), 'None')
+            eq_(to_unicode(dict(key='value')), '{\'key\': \'value\'}')