Skip to content

Commit 84dd93e

Browse files
blink1073 authored and bruno-firnkes committed
INTPYTHON-418 Fix handling of missing document fields (mongodb-labs#254)
1 parent 9bfefea commit 84dd93e

File tree

3 files changed

+37
-3
lines changed

3 files changed

+37
-3
lines changed

bindings/python/pymongoarrow/context.py

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -51,6 +51,7 @@ def from_schema(cls, schema, codec_options=DEFAULT_CODEC_OPTIONS):
5151
>>>>>>> 5406fc3 (INTPYTHON-165 Refactor nested data handling (#245))
5252

5353
self.manager = BuilderManager(schema_map, self.schema is not None, self.tzinfo)
54+
self.schema_map = schema_map
5455

5556
def process_bson_stream(self, stream):
5657
self.manager.process_bson_stream(stream, len(stream))
@@ -69,8 +70,15 @@ def _parse_builder_map(builder_map):
6970
# Traverse the builder map right to left.
7071
for key, value in reversed(builder_map.items()):
7172
if value.type_marker == _BsonArrowTypes.document.value:
72-
names = value.finish()
73-
full_names = [f"{key}.{name}" for name in names]
73+
names = []
74+
full_names = []
75+
for candidate in list(builder_map):
76+
if candidate.startswith(key + "."):
77+
name = candidate[len(key) + 1 :]
78+
if "." in name or "[" in name:
79+
continue
80+
names.append(name)
81+
full_names.append(candidate)
7482
arrs = [builder_map[c] for c in full_names]
7583
builder_map[key] = StructArray.from_arrays(arrs, names=names)
7684
to_remove.extend(full_names)

bindings/python/pymongoarrow/lib.pyx

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -84,7 +84,7 @@ cdef class BuilderManager:
8484
# We only use the doc_iter for binary arrays, which are handled already.
8585
self.get_builder(name, ftype, <bson_iter_t *>nullptr)
8686

87-
cdef _ArrayBuilderBase get_builder(self, cstring key, bson_type_t value_t, bson_iter_t * doc_iter) except *:
87+
cdef _ArrayBuilderBase get_builder(self, cstring key, bson_type_t value_t, bson_iter_t * doc_iter):
8888
cdef _ArrayBuilderBase builder = None
8989
cdef bson_subtype_t subtype
9090
cdef const uint8_t *val_buf = NULL

bindings/python/test/test_arrow.py

Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -466,6 +466,32 @@ def test_string_bool(self):
466466
),
467467
)
468468

469+
def test_schema_missing_field(self):
470+
self.coll.drop()
471+
self.coll.insert_one(
472+
{
473+
"_id": ObjectId("000000000000000000000013"),
474+
"list_field": [{"name": "Test1", "test": "Test2"}],
475+
}
476+
)
477+
478+
schema = Schema(
479+
{
480+
"_id": ObjectId,
481+
"list_field": [
482+
{
483+
"name": pa.string(),
484+
"test": pa.string(),
485+
"test_test": pa.string(), # does not exist in the database collection
486+
}
487+
],
488+
}
489+
)
490+
expected = [[{"name": "Test1", "test": "Test2", "test_test": None}]]
491+
for func in [find_arrow_all, aggregate_arrow_all]:
492+
out = func(self.coll, {} if func == find_arrow_all else [], schema=schema).drop(["_id"])
493+
self.assertEqual(out["list_field"].to_pylist(), expected)
494+
469495
def test_auto_schema_nested(self):
470496
# Create table with random data of various types.
471497
_, data = self._create_nested_data()

0 commit comments

Comments
 (0)