Skip to content

Commit 398d222

Browse files
authored
ARROW-158 Fix crash when performing auto schema detection on embedded lists and subdocuments (#149)
1 parent f1702b3 commit 398d222

File tree

2 files changed

+80
-8
lines changed

2 files changed

+80
-8
lines changed

bindings/python/pymongoarrow/lib.pyx

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ _field_type_map = {
8383
}
8484

8585
cdef extract_field_dtype(bson_iter_t * doc_iter, bson_iter_t * child_iter, bson_type_t value_t, context):
86-
"""Get the appropropriate data type for a specific field"""
86+
"""Get the appropriate data type for a specific field"""
8787
cdef const uint8_t *val_buf = NULL
8888
cdef uint32_t val_buf_len = 0
8989
cdef bson_subtype_t subtype
@@ -102,13 +102,15 @@ cdef extract_field_dtype(bson_iter_t * doc_iter, bson_iter_t * child_iter, bson_
102102
elif value_t == BSON_TYPE_BINARY:
103103
bson_iter_binary (doc_iter, &subtype, &val_buf_len, &val_buf)
104104
field_type = BinaryType(subtype)
105+
elif value_t == BSON_TYPE_NULL:
106+
field_type = None
105107
else:
106108
raise PyMongoArrowError('unknown value type {}'.format(value_t))
107109
return field_type
108110

109111

110112
cdef extract_document_dtype(bson_iter_t * doc_iter, context):
111-
"""Get the appropropriate data type for a sub document"""
113+
"""Get the appropriate data type for a sub document"""
112114
cdef const char* key
113115
cdef bson_type_t value_t
114116
cdef bson_iter_t child_iter
@@ -117,18 +119,24 @@ cdef extract_document_dtype(bson_iter_t * doc_iter, context):
117119
key = bson_iter_key(doc_iter)
118120
value_t = bson_iter_type(doc_iter)
119121
field_type = extract_field_dtype(doc_iter, &child_iter, value_t, context)
120-
fields.append(field(key.decode('utf-8'), field_type))
121-
return struct(fields)
122+
if field_type is not None:
123+
fields.append(field(key.decode('utf-8'), field_type))
124+
if fields:
125+
return struct(fields)
126+
return None
122127

123128
cdef extract_array_dtype(bson_iter_t * doc_iter, context):
124-
"""Get the appropropriate data type for a sub array"""
129+
"""Get the appropriate data type for a sub array"""
125130
cdef const char* key
126131
cdef bson_type_t value_t
127132
cdef bson_iter_t child_iter
128133
fields = []
129-
first_item = bson_iter_next(doc_iter)
130-
value_t = bson_iter_type(doc_iter)
131-
return extract_field_dtype(doc_iter, &child_iter, value_t, context)
134+
while bson_iter_next(doc_iter):
135+
value_t = bson_iter_type(doc_iter)
136+
field_type = extract_field_dtype(doc_iter, &child_iter, value_t, context)
137+
if field_type is not None:
138+
return field_type
139+
return None
132140

133141
def process_bson_stream(bson_stream, context, arr_value_builder=None):
134142
"""Process a bson byte stream using a PyMongoArrowContext"""
@@ -198,10 +206,14 @@ def process_bson_stream(bson_stream, context, arr_value_builder=None):
198206
elif builder_type == DocumentBuilder:
199207
bson_iter_recurse(&doc_iter, &child_iter)
200208
struct_dtype = extract_document_dtype(&child_iter, context)
209+
if struct_dtype is None:
210+
continue
201211
builder = DocumentBuilder(struct_dtype, context.tzinfo)
202212
elif builder_type == ListBuilder:
203213
bson_iter_recurse(&doc_iter, &child_iter)
204214
list_dtype = extract_array_dtype(&child_iter, context)
215+
if list_dtype is None:
216+
continue
205217
list_dtype = list_(list_dtype)
206218
builder = ListBuilder(list_dtype, context.tzinfo, value_builder=arr_value_builder)
207219
elif builder_type == BinaryBuilder:

bindings/python/test/test_arrow.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -414,6 +414,66 @@ def test_auto_schema(self):
414414
for name in out.column_names:
415415
self.assertEqual(data[name], out[name].cast(data[name].type))
416416

417+
def _test_auto_schema_list(self, docs, expected):
418+
self.coll.delete_many({})
419+
self.coll.insert_many(docs)
420+
actual = find_arrow_all(self.coll, {}, projection={"_id": 0})
421+
self.assertEqual(actual.schema, expected.schema)
422+
self.assertEqual(actual, expected)
423+
424+
def test_auto_schema_first_list_null(self):
425+
docs = [
426+
{"a": None},
427+
{"a": ["str"]},
428+
{"a": []},
429+
]
430+
expected = pyarrow.Table.from_pylist(docs)
431+
self._test_auto_schema_list(docs, expected)
432+
433+
def test_auto_schema_first_list_empty(self):
434+
docs = [
435+
{"a": []},
436+
{"a": ["str"]},
437+
{"a": []},
438+
]
439+
expected = pyarrow.Table.from_pylist(
440+
[
441+
{"a": None}, # TODO: We incorrectly set the first empty list to null.
442+
{"a": ["str"]},
443+
{"a": []},
444+
]
445+
)
446+
self._test_auto_schema_list(docs, expected)
447+
448+
def test_auto_schema_first_list_element_null(self):
449+
docs = [
450+
{"a": None},
451+
{"a": [None, None, "str"]}, # Inferred schema should use the first non-null element.
452+
{"a": []},
453+
]
454+
expected = pyarrow.Table.from_pylist(docs)
455+
self._test_auto_schema_list(docs, expected)
456+
457+
@unittest.expectedFailure # TODO: Our inferred value for the first a.b field differs from pyarrow's.
458+
def test_auto_schema_first_embedded_list_null(self):
459+
docs = [
460+
{"a": {"b": None}},
461+
{"a": {"b": ["str"]}},
462+
{"a": {"b": []}},
463+
]
464+
expected = pyarrow.Table.from_pylist(docs)
465+
self._test_auto_schema_list(docs, expected)
466+
467+
@unittest.expectedFailure # TODO: Our inferred value for the first a.b field differs from pyarrow's.
468+
def test_auto_schema_first_embedded_doc_null(self):
469+
docs = [
470+
{"a": {"b": None}},
471+
{"a": {"b": "str"}},
472+
{"a": {"b": None}},
473+
]
474+
expected = pyarrow.Table.from_pylist(docs)
475+
self._test_auto_schema_list(docs, expected)
476+
417477
def test_auto_schema_heterogeneous(self):
418478
vals = [1, "2", True, 4]
419479
data = [{"a": v} for v in vals]

0 commit comments

Comments
 (0)