Skip to content

Commit 37ac8ee

Browse files
authored
INTPYTHON-418 Fix handling of missing document fields (#254)
1 parent 7c4cf76 commit 37ac8ee

File tree

3 files changed: 37 additions and 3 deletions.

bindings/python/pymongoarrow/context.py

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,7 @@ def __init__(self, schema, codec_options=None):
4141
from pymongoarrow.lib import BuilderManager
4242

4343
self.manager = BuilderManager(schema_map, self.schema is not None, self.tzinfo)
44+
self.schema_map = schema_map
4445

4546
def process_bson_stream(self, stream):
4647
self.manager.process_bson_stream(stream, len(stream))
@@ -59,8 +60,15 @@ def _parse_builder_map(builder_map):
5960
# Traverse the builder map right to left.
6061
for key, value in reversed(builder_map.items()):
6162
if value.type_marker == _BsonArrowTypes.document.value:
62-
names = value.finish()
63-
full_names = [f"{key}.{name}" for name in names]
63+
names = []
64+
full_names = []
65+
for candidate in list(builder_map):
66+
if candidate.startswith(key + "."):
67+
name = candidate[len(key) + 1 :]
68+
if "." in name or "[" in name:
69+
continue
70+
names.append(name)
71+
full_names.append(candidate)
6472
arrs = [builder_map[c] for c in full_names]
6573
builder_map[key] = StructArray.from_arrays(arrs, names=names)
6674
to_remove.extend(full_names)

bindings/python/pymongoarrow/lib.pyx

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -84,7 +84,7 @@ cdef class BuilderManager:
8484
# We only use the doc_iter for binary arrays, which are handled already.
8585
self.get_builder(name, ftype, <bson_iter_t *>nullptr)
8686

87-
cdef _ArrayBuilderBase get_builder(self, cstring key, bson_type_t value_t, bson_iter_t * doc_iter) except *:
87+
cdef _ArrayBuilderBase get_builder(self, cstring key, bson_type_t value_t, bson_iter_t * doc_iter):
8888
cdef _ArrayBuilderBase builder = None
8989
cdef bson_subtype_t subtype
9090
cdef const uint8_t *val_buf = NULL

bindings/python/test/test_arrow.py

Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -466,6 +466,32 @@ def test_string_bool(self):
466466
),
467467
)
468468

469+
def test_schema_missing_field(self):
470+
self.coll.drop()
471+
self.coll.insert_one(
472+
{
473+
"_id": ObjectId("000000000000000000000013"),
474+
"list_field": [{"name": "Test1", "test": "Test2"}],
475+
}
476+
)
477+
478+
schema = Schema(
479+
{
480+
"_id": ObjectId,
481+
"list_field": [
482+
{
483+
"name": pa.string(),
484+
"test": pa.string(),
485+
"test_test": pa.string(), # does not exist in the database collection
486+
}
487+
],
488+
}
489+
)
490+
expected = [[{"name": "Test1", "test": "Test2", "test_test": None}]]
491+
for func in [find_arrow_all, aggregate_arrow_all]:
492+
out = func(self.coll, {} if func == find_arrow_all else [], schema=schema).drop(["_id"])
493+
self.assertEqual(out["list_field"].to_pylist(), expected)
494+
469495
def test_auto_schema_nested(self):
470496
# Create table with random data of various types.
471497
_, data = self._create_nested_data()

0 commit comments

Comments
 (0)