Skip to content

Commit 3e29700

Browse files
authored
Merge pull request #59 from jcrobak/codecs-fix
Fix the utf-8 converted type issue
2 parents d1c2eed + e138edb commit 3e29700

File tree

2 files changed

+14
-1
lines changed

2 files changed

+14
-1
lines changed

parquet/converted_types.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -73,7 +73,7 @@ def convert_column(data, schemae):
7373
elif ctype == parquet_thrift.ConvertedType.TIMESTAMP_MILLIS:
7474
return [datetime.datetime.utcfromtimestamp(d / 1000.0) for d in data]
7575
elif ctype == parquet_thrift.ConvertedType.UTF8:
76-
return list(codecs.iterdecode(data, "utf-8"))
76+
return [codecs.decode(item, "utf-8") for item in data]
7777
elif ctype == parquet_thrift.ConvertedType.UINT_8:
7878
return _convert_unsigned(data, 'b')
7979
elif ctype == parquet_thrift.ConvertedType.UINT_16:

test/test_converted_types.py

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -134,6 +134,19 @@ def test_utf8(self):
134134
'foo👾'
135135
)
136136

137+
def test_utf8_empty_string(self):
138+
"""Test bytes representing utf-8 string with empty strings."""
139+
schema = pt.SchemaElement(
140+
type=pt.Type.BYTE_ARRAY,
141+
name="test",
142+
converted_type=pt.ConvertedType.UTF8
143+
)
144+
data = [b'', b'foo\xf0\x9f\x91\xbe', b'']
145+
self.assertEqual(
146+
convert_column(data, schema),
147+
['', 'foo👾', '']
148+
)
149+
137150
def test_json(self):
138151
"""Test bytes representing json."""
139152
schema = pt.SchemaElement(

0 commit comments

Comments
 (0)