diff --git a/bindings/python/pymongoarrow/context.py b/bindings/python/pymongoarrow/context.py index e6da7abf..4672d637 100644 --- a/bindings/python/pymongoarrow/context.py +++ b/bindings/python/pymongoarrow/context.py @@ -41,6 +41,7 @@ def __init__(self, schema, codec_options=None): from pymongoarrow.lib import BuilderManager self.manager = BuilderManager(schema_map, self.schema is not None, self.tzinfo) + self.schema_map = schema_map def process_bson_stream(self, stream): self.manager.process_bson_stream(stream, len(stream)) @@ -59,8 +60,15 @@ def _parse_builder_map(builder_map): # Traverse the builder map right to left. for key, value in reversed(builder_map.items()): if value.type_marker == _BsonArrowTypes.document.value: - names = value.finish() - full_names = [f"{key}.{name}" for name in names] + names = [] + full_names = [] + for candidate in list(builder_map): + if candidate.startswith(key + "."): + name = candidate[len(key) + 1 :] + if "." in name or "[" in name: + continue + names.append(name) + full_names.append(candidate) arrs = [builder_map[c] for c in full_names] builder_map[key] = StructArray.from_arrays(arrs, names=names) to_remove.extend(full_names) diff --git a/bindings/python/pymongoarrow/lib.pyx b/bindings/python/pymongoarrow/lib.pyx index 802ee829..58d45de1 100644 --- a/bindings/python/pymongoarrow/lib.pyx +++ b/bindings/python/pymongoarrow/lib.pyx @@ -84,7 +84,7 @@ cdef class BuilderManager: # We only use the doc_iter for binary arrays, which are handled already. self.get_builder(name, ftype, nullptr) - cdef _ArrayBuilderBase get_builder(self, cstring key, bson_type_t value_t, bson_iter_t * doc_iter) except *: + cdef _ArrayBuilderBase get_builder(self, cstring key, bson_type_t value_t, bson_iter_t * doc_iter): cdef _ArrayBuilderBase builder = None cdef bson_subtype_t subtype cdef const uint8_t *val_buf = NULL diff --git a/bindings/python/test/test_arrow.py b/bindings/python/test/test_arrow.py index cc831518..53bef0e2 100644 --- a/bindings/python/test/test_arrow.py +++ b/bindings/python/test/test_arrow.py @@ -466,6 +466,32 @@ def test_string_bool(self): ), ) + def test_schema_missing_field(self): + self.coll.drop() + self.coll.insert_one( + { + "_id": ObjectId("000000000000000000000013"), + "list_field": [{"name": "Test1", "test": "Test2"}], + } + ) + + schema = Schema( + { + "_id": ObjectId, + "list_field": [ + { + "name": pa.string(), + "test": pa.string(), + "test_test": pa.string(), # does not exist in the database collection + } + ], + } + ) + expected = [[{"name": "Test1", "test": "Test2", "test_test": None}]] + for func in [find_arrow_all, aggregate_arrow_all]: + out = func(self.coll, {} if func == find_arrow_all else [], schema=schema).drop(["_id"]) + self.assertEqual(out["list_field"].to_pylist(), expected) + def test_auto_schema_nested(self): # Create table with random data of various types. _, data = self._create_nested_data()