Skip to content

Commit 9c687b0

Browse files
committed
[compat] make to_unicode more robust
1 parent 8a9a7a5 commit 9c687b0

File tree

2 files changed

+108
-4
lines changed

2 files changed

+108
-4
lines changed

ddtrace/compat.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,21 @@ def iteritems(obj, **kwargs):
3737

3838
def to_unicode(s):
3939
""" Return a unicode string for the given bytes or string instance. """
40-
if hasattr(s, "decode"):
41-
return s.decode("utf-8")
42-
else:
43-
return stringify(s)
40+
# No reason to decode if we already have the unicode compatible object we expect
41+
# DEV: `stringify` will be a `str` for python 3 and `unicode` for python 2
42+
# DEV: Double decoding a `unicode` can cause a `UnicodeEncodeError`
43+
# e.g. `'\xc3\xbf'.decode('utf-8').decode('utf-8')`
44+
if isinstance(s, stringify):
45+
return s
46+
47+
# If the object has a `decode` method, then decode it from `utf-8` into unicode
48+
# e.g. Python 2 `str`, Python 3 `bytes`, Python 2/3 `bytearray`, etc
49+
if hasattr(s, 'decode'):
50+
return s.decode('utf-8')
51+
52+
# Always try to coerce the object into the `stringify` object we expect
53+
# e.g. `to_unicode(1)`, `to_unicode(dict(key='value'))`
54+
return stringify(s)
4455

4556
if PY2:
4657
numeric_types = (int, long, float)

tests/test_compat.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
# -*- coding: utf-8 -*-
2+
# Define source file encoding to support raw unicode characters in Python 2
3+
4+
# Third party
5+
from nose.tools import eq_
6+
7+
# Project
8+
from ddtrace.compat import to_unicode, PY2
9+
10+
11+
# Use different test suites for each Python version; this allows us to test the expected
12+
# results for each Python version rather than writing a generic "works for both" test suite
13+
if PY2:
14+
class TestCompatPY2(object):
15+
def test_to_unicode_string(self):
16+
""" Calling `compat.to_unicode` on a non-unicode string """
17+
res = to_unicode('test')
18+
eq_(type(res), unicode)
19+
eq_(res, 'test')
20+
21+
def test_to_unicode_unicode_encoded(self):
22+
""" Calling `compat.to_unicode` on a UTF-8 encoded byte string """
23+
res = to_unicode('\xc3\xbf')
24+
eq_(type(res), unicode)
25+
eq_(res, u'ÿ')
26+
27+
def test_to_unicode_unicode_double_decode(self):
28+
""" Calling `compat.to_unicode` on a unicode decoded string """
29+
# This represents the double-decode issue, which can cause a `UnicodeEncodeError`
30+
# `'\xc3\xbf'.decode('utf-8').decode('utf-8')`
31+
res = to_unicode('\xc3\xbf'.decode('utf-8'))
32+
eq_(type(res), unicode)
33+
eq_(res, u'ÿ')
34+
35+
def test_to_unicode_unicode_string(self):
36+
""" Calling `compat.to_unicode` on a unicode string """
37+
res = to_unicode(u'ÿ')
38+
eq_(type(res), unicode)
39+
eq_(res, u'ÿ')
40+
41+
def test_to_unicode_bytearray(self):
42+
""" Calling `compat.to_unicode` with a `bytearray` containing unicode """
43+
res = to_unicode(bytearray('\xc3\xbf'))
44+
eq_(type(res), unicode)
45+
eq_(res, u'ÿ')
46+
47+
def test_to_unicode_bytearray_double_decode(self):
48+
""" Calling `compat.to_unicode` with an already decoded `bytearray` """
49+
# This represents the double-decode issue, which can cause a `UnicodeEncodeError`
50+
# `bytearray('\xc3\xbf').decode('utf-8').decode('utf-8')`
51+
res = to_unicode(bytearray('\xc3\xbf').decode('utf-8'))
52+
eq_(type(res), unicode)
53+
eq_(res, u'ÿ')
54+
55+
def test_to_unicode_non_string(self):
56+
""" Calling `compat.to_unicode` on non-string types """
57+
eq_(to_unicode(1), u'1')
58+
eq_(to_unicode(True), u'True')
59+
eq_(to_unicode(None), u'None')
60+
eq_(to_unicode(dict(key='value')), u'{\'key\': \'value\'}')
61+
62+
else:
63+
class TestCompatPY3(object):
64+
def test_to_unicode_string(self):
65+
""" Calling `compat.to_unicode` on a plain ASCII `str` """
66+
res = to_unicode('test')
67+
eq_(type(res), str)
68+
eq_(res, 'test')
69+
70+
def test_to_unicode_unicode_encoded(self):
71+
""" Calling `compat.to_unicode` on a `str` built from an escape sequence """
72+
res = to_unicode('\xff')
73+
eq_(type(res), str)
74+
eq_(res, 'ÿ')
75+
76+
def test_to_unicode_unicode_string(self):
77+
""" Calling `compat.to_unicode` on a unicode string """
78+
res = to_unicode('ÿ')
79+
eq_(type(res), str)
80+
eq_(res, 'ÿ')
81+
82+
def test_to_unicode_bytearray(self):
83+
""" Calling `compat.to_unicode` with a `bytearray` containing unicode """
84+
res = to_unicode(bytearray('\xff', 'utf-8'))
85+
eq_(type(res), str)
86+
eq_(res, 'ÿ')
87+
88+
def test_to_unicode_non_string(self):
89+
""" Calling `compat.to_unicode` on non-string types """
90+
eq_(to_unicode(1), '1')
91+
eq_(to_unicode(True), 'True')
92+
eq_(to_unicode(None), 'None')
93+
eq_(to_unicode(dict(key='value')), '{\'key\': \'value\'}')

0 commit comments

Comments
 (0)