Skip to content

Commit 1e25090

Browse files
authored
ARROW-181 Revert nested document behavior and document limitation (#173)
* Revert "ARROW-176 Nested extension objects are not handled in auto schema (#166)" This reverts commit 1cde144. * ARROW-181 Revert nested document behavior and document limitation
1 parent 05a2199 commit 1e25090

File tree

3 files changed

+38
-34
lines changed

3 files changed

+38
-34
lines changed

bindings/python/docs/source/data_types.rst

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,3 +162,13 @@ with the code below (taken from `here <https://arrow.apache.org/docs/python/pand
162162
... del arrow_table
163163
164164
Additionally defining a conversion for `pa.string()` converts Arrow strings to NumPy strings rather than objects.
165+
166+
Nested Extension Types
167+
----------------------
168+
169+
Until ``ARROW-179`` is resolved, extension types such as ``ObjectId`` that appear in nested documents will not
170+
be converted to the corresponding PyMongoArrow extension type, but will
171+
instead have the raw Arrow type (``FixedSizeBinaryType(fixed_size_binary[12])``).
172+
173+
These values can either be consumed as-is or converted individually to the
174+
desired extension type, e.g. ``_id = out['nested'][0]['_id'].cast(ObjectIdType())``.

bindings/python/pymongoarrow/lib.pyx

Lines changed: 4 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -490,6 +490,7 @@ cdef class ObjectIdBuilder(_ArrayBuilderBase):
490490
cdef shared_ptr[CFixedSizeBinaryBuilder] unwrap(self):
491491
return self.builder
492492

493+
493494
cdef class Int32Builder(_ArrayBuilderBase):
494495
cdef:
495496
shared_ptr[CInt32Builder] builder
@@ -721,8 +722,6 @@ cdef object get_field_builder(object field, object tzinfo):
721722
field_builder = Decimal128Builder()
722723
elif getattr(field_type, '_type_marker') == _BsonArrowTypes.binary:
723724
field_builder = BinaryBuilder(field_type.subtype)
724-
elif getattr(field_type, '_type_marker') == _BsonArrowTypes.code:
725-
field_builder = CodeBuilder()
726725
else:
727726
field_builder = StringBuilder()
728727
return field_builder
@@ -733,7 +732,6 @@ cdef class DocumentBuilder(_ArrayBuilderBase):
733732
shared_ptr[CStructBuilder] builder
734733
object dtype
735734
object context
736-
object builder_map
737735

738736
def __cinit__(self, StructType dtype, tzinfo=None, MemoryPool memory_pool=None):
739737
cdef StringBuilder field_builder
@@ -746,11 +744,11 @@ cdef class DocumentBuilder(_ArrayBuilderBase):
746744

747745
self.context = context = PyMongoArrowContext(None, {})
748746
context.tzinfo = tzinfo
749-
self.builder_map = context.builder_map
747+
builder_map = context.builder_map
750748

751749
for field in dtype:
752750
field_builder = <StringBuilder>get_field_builder(field, tzinfo)
753-
self.builder_map[field.name.encode('utf-8')] = field_builder
751+
builder_map[field.name.encode('utf-8')] = field_builder
754752
c_field_builders.push_back(<shared_ptr[CArrayBuilder]>field_builder.builder)
755753

756754
self.builder.reset(new CStructBuilder(pyarrow_unwrap_data_type(dtype), pool, c_field_builders))
@@ -783,30 +781,7 @@ cdef class DocumentBuilder(_ArrayBuilderBase):
783781
cdef shared_ptr[CArray] out
784782
with nogil:
785783
self.builder.get().Finish(&out)
786-
787-
struct_array = pyarrow_wrap_array(out)
788-
for struct_def in struct_array:
789-
new_types = []
790-
new_names = list(struct_def.keys())
791-
for fname, ftype in struct_def.items():
792-
builder_instance = self.builder_map[fname.encode('utf-8')]
793-
if isinstance(builder_instance, ObjectIdBuilder): # ObjectIdType
794-
new_ftype = ObjectIdType()
795-
new_types.append(new_ftype)
796-
elif isinstance(builder_instance, Decimal128Builder): # Decimal128Type
797-
new_ftype = Decimal128Type_()
798-
new_types.append(new_ftype)
799-
elif isinstance(builder_instance, BinaryBuilder): # BinaryType
800-
new_ftype = BinaryType(self.dtype.field(fname).type.subtype)
801-
new_types.append(new_ftype)
802-
elif isinstance(builder_instance, CodeBuilder): # CodeType
803-
new_ftype = CodeType()
804-
new_types.append(new_ftype)
805-
else:
806-
new_types.append(ftype.type)
807-
808-
new_dtype = struct(zip(new_names, new_types))
809-
return struct_array.cast(new_dtype)
784+
return pyarrow_wrap_array(out)
810785

811786
cdef shared_ptr[CStructBuilder] unwrap(self):
812787
return self.builder

bindings/python/test/test_arrow.py

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,18 @@
2222
import pyarrow
2323
import pymongo
2424
from bson import Binary, Code, CodecOptions, Decimal128, ObjectId
25-
from pyarrow import Table, bool_, csv, decimal256, field, int32, int64, list_
25+
from pyarrow import (
26+
DataType,
27+
FixedSizeBinaryType,
28+
Table,
29+
bool_,
30+
csv,
31+
decimal256,
32+
field,
33+
int32,
34+
int64,
35+
list_,
36+
)
2637
from pyarrow import schema as ArrowSchema
2738
from pyarrow import string, struct, timestamp
2839
from pyarrow.parquet import read_table, write_table
@@ -665,10 +676,18 @@ def test_nested_bson_extension_types(self):
665676
out = find_arrow_all(self.coll, {})
666677
obj_schema_type = out.field("obj").type
667678

668-
self.assertIsInstance(obj_schema_type.field("obj_id").type, ObjectIdType)
669-
self.assertIsInstance(obj_schema_type.field("dec_128").type, Decimal128Type)
670-
self.assertIsInstance(obj_schema_type.field("binary").type, BinaryType)
671-
self.assertIsInstance(obj_schema_type.field("code").type, CodeType)
679+
self.assertIsInstance(obj_schema_type.field("obj_id").type, FixedSizeBinaryType)
680+
self.assertIsInstance(obj_schema_type.field("dec_128").type, FixedSizeBinaryType)
681+
self.assertIsInstance(obj_schema_type.field("binary").type, DataType)
682+
self.assertIsInstance(obj_schema_type.field("code").type, DataType)
683+
684+
new_types = [ObjectIdType(), Decimal128Type(), BinaryType(0), CodeType()]
685+
new_names = [f.name for f in out["obj"].type]
686+
new_obj = out["obj"].cast(struct(zip(new_names, new_types)))
687+
self.assertIsInstance(new_obj.type[0].type, ObjectIdType)
688+
self.assertIsInstance(new_obj.type[1].type, Decimal128Type)
689+
self.assertIsInstance(new_obj.type[2].type, BinaryType)
690+
self.assertIsInstance(new_obj.type[3].type, CodeType)
672691

673692

674693
class TestArrowExplicitApi(ArrowApiTestMixin, unittest.TestCase):

0 commit comments

Comments
 (0)