Skip to content

Commit 210dd6c

Browse files
committed
Further progress on BuilderManager
1 parent 58bdfc9 commit 210dd6c

File tree

5 files changed

+117
-335
lines changed

5 files changed

+117
-335
lines changed

arrow-165.py

Lines changed: 0 additions & 219 deletions
This file was deleted.

bindings/python/pymongoarrow/context.py

Lines changed: 29 additions & 79 deletions
Original file line number | Diff line number | Diff line change
@@ -11,51 +11,11 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14-
from pyarrow import ListArray, StructArray, Table, timestamp
14+
from pyarrow import ListArray, StructArray, Table
1515
from pyarrow.types import is_struct
1616

1717
from pymongoarrow.types import _BsonArrowTypes, _get_internal_typemap
1818

19-
try:
20-
from pymongoarrow.lib import (
21-
BinaryBuilder,
22-
BoolBuilder,
23-
BuilderManager,
24-
CodeBuilder,
25-
Date32Builder,
26-
Date64Builder,
27-
DatetimeBuilder,
28-
Decimal128Builder,
29-
DocumentBuilder,
30-
DoubleBuilder,
31-
Int32Builder,
32-
Int64Builder,
33-
ListBuilder,
34-
NullBuilder,
35-
ObjectIdBuilder,
36-
StringBuilder,
37-
)
38-
39-
_TYPE_TO_BUILDER_CLS = {
40-
_BsonArrowTypes.int32: Int32Builder,
41-
_BsonArrowTypes.int64: Int64Builder,
42-
_BsonArrowTypes.double: DoubleBuilder,
43-
_BsonArrowTypes.datetime: DatetimeBuilder,
44-
_BsonArrowTypes.objectid: ObjectIdBuilder,
45-
_BsonArrowTypes.decimal128: Decimal128Builder,
46-
_BsonArrowTypes.string: StringBuilder,
47-
_BsonArrowTypes.bool: BoolBuilder,
48-
_BsonArrowTypes.document: DocumentBuilder,
49-
_BsonArrowTypes.array: ListBuilder,
50-
_BsonArrowTypes.binary: BinaryBuilder,
51-
_BsonArrowTypes.code: CodeBuilder,
52-
_BsonArrowTypes.date32: Date32Builder,
53-
_BsonArrowTypes.date64: Date64Builder,
54-
_BsonArrowTypes.null: NullBuilder,
55-
}
56-
except ImportError:
57-
pass
58-
5919

6020
class PyMongoArrowContext:
6121
"""A context for converting BSON-formatted data to an Arrow Table."""
@@ -73,67 +33,68 @@ def __init__(self, schema, codec_options=None):
7333
self.tzinfo = codec_options.tzinfo
7434
else:
7535
self.tzinfo = None
76-
builder_map = {}
36+
schema_map = {}
7737
if self.schema is not None:
7838
str_type_map = _get_internal_typemap(schema.typemap)
79-
_parse_types(str_type_map, builder_map, self.tzinfo)
39+
_parse_types(str_type_map, schema_map, self.tzinfo)
40+
41+
# Delayed import to prevent import errors for unbuilt library.
42+
from pymongoarrow.lib import BuilderManager
8043

81-
self.manager = BuilderManager(builder_map, self.schema is not None, self.tzinfo)
44+
self.manager = BuilderManager(schema_map, self.schema is not None, self.tzinfo)
8245

8346
def process_bson_stream(self, stream):
8447
self.manager.process_bson_stream(stream, len(stream))
8548

8649
def finish(self):
87-
builder_map = _parse_builder_map(self.manager.finish())
88-
arrays = list(builder_map.values())
50+
array_map = _parse_array_map(self.manager.finish())
51+
arrays = list(array_map.values())
8952
if self.schema is not None:
9053
return Table.from_arrays(arrays=arrays, schema=self.schema.to_arrow())
91-
return Table.from_arrays(arrays=arrays, names=list(builder_map.keys()))
54+
return Table.from_arrays(arrays=arrays, names=list(array_map.keys()))
9255

9356

94-
def _parse_builder_map(builder_map):
57+
def _parse_array_map(array_map):
9558
# Handle nested builders.
9659
to_remove = []
9760
# Traverse the builder map right to left.
98-
for key, value in reversed(builder_map.items()):
61+
for key, value in reversed(array_map.items()):
9962
field = key.decode("utf-8")
100-
if isinstance(value, DocumentBuilder):
101-
arr = value.finish()
102-
full_names = [f"{field}.{name.decode('utf-8')}" for name in arr]
103-
arrs = [builder_map[c.encode("utf-8")] for c in full_names]
104-
builder_map[field] = StructArray.from_arrays(arrs, names=arr)
63+
if value.type_marker == _BsonArrowTypes.document:
64+
full_names = [f"{field}.{name.decode('utf-8')}" for name in value]
65+
arrs = [array_map[c.encode("utf-8")] for c in full_names]
66+
array_map[field] = StructArray.from_arrays(arrs, names=value)
10567
to_remove.extend(full_names)
106-
elif isinstance(value, ListBuilder):
107-
arr = value.finish()
68+
elif value.type_marker == _BsonArrowTypes.array:
10869
child_name = field + "[]"
10970
to_remove.append(child_name)
110-
child = builder_map[child_name.encode("utf-8")]
111-
builder_map[key] = ListArray.from_arrays(arr, child)
112-
else:
113-
builder_map[key] = value.finish()
71+
child = array_map[child_name.encode("utf-8")]
72+
array_map[key] = ListArray.from_arrays(value, child)
11473

11574
for field in to_remove:
11675
key = field.encode("utf-8")
117-
if key in builder_map:
118-
del builder_map[key]
76+
if key in array_map:
77+
del array_map[key]
11978

79+
return array_map
12080

121-
def _parse_types(str_type_map, builder_map, tzinfo):
81+
82+
def _parse_types(str_type_map, schema_map, tzinfo):
12283
for fname, (ftype, arrow_type) in str_type_map.items():
123-
builder_cls = _TYPE_TO_BUILDER_CLS[ftype]
12484
encoded_fname = fname.encode("utf-8")
85+
schema_map[encoded_fname] = ftype, arrow_type
12586

12687
# special-case nested builders
127-
if builder_cls == DocumentBuilder:
88+
if ftype == _BsonArrowTypes.document:
12889
# construct a sub type map here
12990
sub_type_map = {}
13091
for i in range(arrow_type.num_fields):
13192
field = arrow_type[i]
13293
sub_name = f"{fname}.{field.name}"
13394
sub_type_map[sub_name] = field.type
13495
sub_type_map = _get_internal_typemap(sub_type_map)
135-
_parse_types(sub_type_map, builder_map, tzinfo)
136-
elif builder_cls == ListBuilder:
96+
_parse_types(sub_type_map, schema_map, tzinfo)
97+
elif ftype == _BsonArrowTypes.array:
13798
if is_struct(arrow_type.value_type):
13899
# construct a sub type map here
139100
sub_type_map = {}
@@ -142,15 +103,4 @@ def _parse_types(str_type_map, builder_map, tzinfo):
142103
sub_name = f"{fname}[].{field.name}"
143104
sub_type_map[sub_name] = field.type
144105
sub_type_map = _get_internal_typemap(sub_type_map)
145-
_parse_types(sub_type_map, sub_type_map, tzinfo)
146-
147-
# special-case initializing builders for parameterized types
148-
if builder_cls == DatetimeBuilder:
149-
if tzinfo is not None and arrow_type.tz is None:
150-
arrow_type = timestamp(arrow_type.unit, tz=tzinfo) # noqa: PLW2901
151-
builder_map[encoded_fname] = DatetimeBuilder(dtype=arrow_type)
152-
elif builder_cls == BinaryBuilder:
153-
subtype = arrow_type.subtype
154-
builder_map[fname] = BinaryBuilder(subtype)
155-
else:
156-
builder_map[fname] = builder_cls()
106+
_parse_types(sub_type_map, schema_map, tzinfo)

0 commit comments

Comments
 (0)