Skip to content

Commit 58bdfc9

Browse files
committed
continue refactoring
1 parent 246825e commit 58bdfc9

File tree

5 files changed

+315
-113
lines changed

5 files changed

+315
-113
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ repos:
9898
rev: v0.16.2
9999
hooks:
100100
- id: cython-lint
101+
args: ["--no-pycodestyle"]
101102

102103
- repo: https://github.com/codespell-project/codespell
103104
rev: "v2.2.6"

arrow-165.py

Lines changed: 219 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,219 @@
1+
from pymongoarrow.lib import StringBuilder, Int32Builder
2+
from pyarrow import ListArray, StructArray
3+
import pyarrow as pa
4+
5+
6+
# Single list.
7+
a = StringBuilder()
8+
a.append("a")
9+
a.append("b")
10+
a.append("c")
11+
a.append("d")
12+
arr = ListArray.from_arrays(pa.array([0, 2, 4]), a.finish())
13+
# print(arr)
14+
# [
15+
# [
16+
# "a",
17+
# "b"
18+
# ],
19+
# [
20+
# "c",
21+
# "d"
22+
# ]
23+
# ]
24+
25+
# Document with strings.
26+
a = StringBuilder()
27+
b = StringBuilder()
28+
a.append("foo")
29+
a.append("bar")
30+
b.append("fizz")
31+
b.append("buzz")
32+
arr = StructArray.from_arrays((a.finish(), b.finish()), names=("a", "b"))
33+
# print(arr)
34+
# -- is_valid: all not null
35+
# -- child 0 type: string
36+
# [
37+
# "foo",
38+
# "bar"
39+
# ]
40+
# -- child 1 type: string
41+
# [
42+
# "fizz",
43+
# "buzz"
44+
# ]
45+
46+
# Nested document
47+
a = StringBuilder()
48+
b = StringBuilder()
49+
a.append("foo")
50+
a.append("bar")
51+
b.append("fizz")
52+
b.append("buzz")
53+
b_struct = StructArray.from_arrays((b.finish(),), names=("b",))
54+
arr = StructArray.from_arrays((a.finish(), b_struct), names=("a", "c"))
55+
# print(arr)
56+
# -- is_valid: all not null
57+
# -- child 0 type: string
58+
# [
59+
# "foo",
60+
# "bar"
61+
# ]
62+
# -- child 1 type: struct<b: string>
63+
# -- is_valid: all not null
64+
# -- child 0 type: string
65+
# [
66+
# "fizz",
67+
# "buzz"
68+
# ]
69+
70+
# Nested list
71+
a = StringBuilder()
72+
b = StringBuilder()
73+
a.append("foo")
74+
a.append("bar")
75+
b.append("fizz")
76+
b.append("buzz")
77+
inner_struct = StructArray.from_arrays((a.finish(), b.finish()), names=("a", "b"))
78+
arr = ListArray.from_arrays(pa.array([0, 2]), inner_struct)
79+
# print(arr)
80+
# [
81+
# -- is_valid: all not null
82+
# -- child 0 type: string
83+
# [
84+
# "foo",
85+
# "bar"
86+
# ]
87+
# -- child 1 type: string
88+
# [
89+
# "fizz",
90+
# "buzz"
91+
# ]
92+
# ]
93+
# print(arr[0])
94+
# <pyarrow.ListScalar: [{'a': 'foo', 'b': 'fizz'}, {'a': 'bar', 'b': 'buzz'}]>
95+
96+
"""
97+
# General algorithm:
98+
We start with a base key, which is empty to start
99+
If we get an atomic type, get or create that builder
100+
If we get a document, we add ".{key}" to the base key and read in that document
101+
If we get a list, we have a single builder for that list, but we also need to store the offsets
102+
So, we have an offsets map which is a list of offsets by key
103+
"""
104+
105+
106+
class ListBuilder(Int32Builder):
107+
def __init__(self):
108+
self._count = 0
109+
super().__init__()
110+
111+
def append_offset(self):
112+
super().append(self._count)
113+
114+
def append(self):
115+
self._count += 1
116+
117+
def finish(self):
118+
self.append_offset()
119+
return super().finish()
120+
121+
122+
class DocumentBuilder:
123+
def __init__(self):
124+
self._names = set()
125+
126+
def add_child(self, name):
127+
self._names.add(name)
128+
129+
def finish(self):
130+
return self._names
131+
132+
133+
def parse_doc(doc, builder_map, base_key=""):
134+
# Container logic.
135+
parent_builder = None
136+
if base_key and base_key in builder_map:
137+
parent_builder = builder_map[base_key]
138+
if isinstance(parent_builder, ListBuilder):
139+
parent_builder.append_offset()
140+
141+
for key, value in doc.items():
142+
# Container item logic.
143+
if isinstance(parent_builder, ListBuilder):
144+
full_key = base_key + "[]"
145+
parent_builder.append()
146+
147+
elif isinstance(parent_builder, DocumentBuilder):
148+
full_key = f"{base_key}.{key}"
149+
parent_builder.add_child(key)
150+
151+
else:
152+
full_key = key
153+
154+
# Builder detection logic.
155+
if full_key not in builder_map:
156+
if isinstance(value, dict):
157+
builder_map[full_key] = DocumentBuilder()
158+
elif isinstance(value, list):
159+
builder_map[full_key] = ListBuilder()
160+
else:
161+
builder_map[full_key] = StringBuilder()
162+
163+
builder = builder_map[full_key]
164+
165+
# Builder append logic.
166+
if isinstance(value, dict):
167+
parse_doc(value, builder_map, full_key)
168+
169+
elif isinstance(value, list):
170+
keys = (f"{i}" for i in range(len(value)))
171+
parse_doc(dict(zip(keys, value)), builder_map, full_key)
172+
173+
else:
174+
builder.append(value)
175+
176+
177+
def parse_builder_map(builder_map):
178+
# Traverse the builder map right to left.
179+
to_remove = []
180+
for key, value in reversed(builder_map.items()):
181+
arr = value.finish()
182+
if isinstance(value, DocumentBuilder):
183+
full_names = [f"{key}.{name}" for name in arr]
184+
arrs = list(builder_map[c] for c in full_names)
185+
builder_map[key] = StructArray.from_arrays(arrs, names=arr)
186+
to_remove.extend(full_names)
187+
elif isinstance(value, ListBuilder):
188+
child = key + "[]"
189+
to_remove.append(child)
190+
builder_map[key] = ListArray.from_arrays(arr, builder_map.get(child, []))
191+
else:
192+
builder_map[key] = arr
193+
194+
for key in to_remove:
195+
if key in builder_map:
196+
del builder_map[key]
197+
198+
return pa.Table.from_arrays(
199+
arrays=list(builder_map.values()), names=list(builder_map.keys())
200+
)
201+
202+
203+
def main():
204+
doc = dict(
205+
a="a",
206+
b="a",
207+
c=dict(c="c", d=["1", "2", "3"]),
208+
d=[dict(a="1"), dict(a="2")],
209+
e=[["1", "2"]],
210+
f=[],
211+
)
212+
builder_map = dict()
213+
parse_doc(doc, builder_map)
214+
parse_doc(doc, builder_map)
215+
print(parse_builder_map(builder_map))
216+
217+
218+
if __name__ == "__main__":
219+
main()

bindings/python/pymongoarrow/context.py

Lines changed: 47 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14-
from pyarrow import ListArray, StructArray, Table
14+
from pyarrow import ListArray, StructArray, Table, timestamp
1515
from pyarrow.types import is_struct
1616

1717
from pymongoarrow.types import _BsonArrowTypes, _get_internal_typemap
@@ -73,55 +73,55 @@ def __init__(self, schema, codec_options=None):
7373
self.tzinfo = codec_options.tzinfo
7474
else:
7575
self.tzinfo = None
76-
self.manager = BuilderManager(self.schema is not None, self.tzinfo)
76+
builder_map = {}
7777
if self.schema is not None:
78-
schema_map = {}
7978
str_type_map = _get_internal_typemap(schema.typemap)
80-
_parse_types(str_type_map, schema_map, self.tzinfo)
81-
self.manager.parse_types(schema_map)
79+
_parse_types(str_type_map, builder_map, self.tzinfo)
80+
81+
self.manager = BuilderManager(builder_map, self.schema is not None, self.tzinfo)
8282

8383
def process_bson_stream(self, stream):
8484
self.manager.process_bson_stream(stream, len(stream))
8585

8686
def finish(self):
87-
builder_map = self.manager.finish().copy()
88-
89-
# Handle nested builders.
90-
to_remove = []
91-
# Traverse the builder map right to left.
92-
for key, value in reversed(builder_map.items()):
93-
field = key.decode("utf-8")
94-
if isinstance(value, DocumentBuilder):
95-
arr = value.finish()
96-
full_names = [f"{field}.{name.decode('utf-8')}" for name in arr]
97-
arrs = [builder_map[c.encode("utf-8")] for c in full_names]
98-
builder_map[field] = StructArray.from_arrays(arrs, names=arr)
99-
to_remove.extend(full_names)
100-
elif isinstance(value, ListBuilder):
101-
arr = value.finish()
102-
child_name = field + "[]"
103-
to_remove.append(child_name)
104-
child = builder_map[child_name.encode("utf-8")]
105-
builder_map[key] = ListArray.from_arrays(arr, child)
106-
else:
107-
builder_map[key] = value.finish()
108-
109-
for field in to_remove:
110-
key = field.encode("utf-8")
111-
if key in builder_map:
112-
del builder_map[key]
113-
87+
builder_map = _parse_builder_map(self.manager.finish())
11488
arrays = list(builder_map.values())
11589
if self.schema is not None:
11690
return Table.from_arrays(arrays=arrays, schema=self.schema.to_arrow())
11791
return Table.from_arrays(arrays=arrays, names=list(builder_map.keys()))
11892

11993

120-
def _parse_types(str_type_map, schema_map, tzinfo):
94+
def _parse_builder_map(builder_map):
95+
# Handle nested builders.
96+
to_remove = []
97+
# Traverse the builder map right to left.
98+
for key, value in reversed(builder_map.items()):
99+
field = key.decode("utf-8")
100+
if isinstance(value, DocumentBuilder):
101+
arr = value.finish()
102+
full_names = [f"{field}.{name.decode('utf-8')}" for name in arr]
103+
arrs = [builder_map[c.encode("utf-8")] for c in full_names]
104+
builder_map[field] = StructArray.from_arrays(arrs, names=arr)
105+
to_remove.extend(full_names)
106+
elif isinstance(value, ListBuilder):
107+
arr = value.finish()
108+
child_name = field + "[]"
109+
to_remove.append(child_name)
110+
child = builder_map[child_name.encode("utf-8")]
111+
builder_map[key] = ListArray.from_arrays(arr, child)
112+
else:
113+
builder_map[key] = value.finish()
114+
115+
for field in to_remove:
116+
key = field.encode("utf-8")
117+
if key in builder_map:
118+
del builder_map[key]
119+
120+
121+
def _parse_types(str_type_map, builder_map, tzinfo):
121122
for fname, (ftype, arrow_type) in str_type_map.items():
122123
builder_cls = _TYPE_TO_BUILDER_CLS[ftype]
123124
encoded_fname = fname.encode("utf-8")
124-
schema_map[encoded_fname] = (arrow_type, builder_cls)
125125

126126
# special-case nested builders
127127
if builder_cls == DocumentBuilder:
@@ -132,6 +132,7 @@ def _parse_types(str_type_map, schema_map, tzinfo):
132132
sub_name = f"{fname}.{field.name}"
133133
sub_type_map[sub_name] = field.type
134134
sub_type_map = _get_internal_typemap(sub_type_map)
135+
_parse_types(sub_type_map, builder_map, tzinfo)
135136
elif builder_cls == ListBuilder:
136137
if is_struct(arrow_type.value_type):
137138
# construct a sub type map here
@@ -141,4 +142,15 @@ def _parse_types(str_type_map, schema_map, tzinfo):
141142
sub_name = f"{fname}[].{field.name}"
142143
sub_type_map[sub_name] = field.type
143144
sub_type_map = _get_internal_typemap(sub_type_map)
144-
_parse_types(sub_type_map, schema_map, tzinfo)
145+
_parse_types(sub_type_map, sub_type_map, tzinfo)
146+
147+
# special-case initializing builders for parameterized types
148+
if builder_cls == DatetimeBuilder:
149+
if tzinfo is not None and arrow_type.tz is None:
150+
arrow_type = timestamp(arrow_type.unit, tz=tzinfo) # noqa: PLW2901
151+
builder_map[encoded_fname] = DatetimeBuilder(dtype=arrow_type)
152+
elif builder_cls == BinaryBuilder:
153+
subtype = arrow_type.subtype
154+
builder_map[fname] = BinaryBuilder(subtype)
155+
else:
156+
builder_map[fname] = builder_cls()

0 commit comments

Comments
 (0)