Skip to content

Commit 490b980

Browse files
authored
feat(python): Add visitor pattern + builders for column sequences (#454)
Assembling columns from chunked things is rather difficult to do and is a valid thing that somebody might want to assemble from Arrow data. This PR adds a "visitor" pattern that can be extended to build "column"s, which are currently just `list()`s.

Before trimming down this PR to a manageable set of changes, I also implemented the visitor that concatenates data buffers for single data buffer types ( https://gist.github.com/paleolimbot/17263e38b5d97c770e44d33b11181eaf ), which will be needed for `to_columns()` to be used in any kind of serious way.

To support the "visitor" pattern, I moved some of the `PyIterator`-specific pieces into the `PyIterator` so that the visitor can re-use the relevant pieces of `ArrayViewBaseIterator`. This pattern also solves one of the problems I had when attempting a "repr" iterator, which is that I was trying to build something rather than iterate over it.

```python
import nanoarrow as na
import pandas as pd
from nanoarrow import visitor

url = "https://github.com/apache/arrow-experiments/raw/main/data/arrow-commits/arrow-commits.arrows"
array = na.ArrayStream.from_url(url).read_all()

# to_columns() doesn't (and won't) produce anything numpy or pandas-related
names, columns = visitor.to_columns(array)

# ..but lets data frames be built rather compactly
pd.DataFrame({k: v for k, v in zip(names, columns)})
```
1 parent 197f117 commit 490b980

File tree

7 files changed

+414
-43
lines changed

7 files changed

+414
-43
lines changed

python/src/nanoarrow/array.py

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
import itertools
1919
from functools import cached_property
20-
from typing import Iterable, Tuple
20+
from typing import Iterable, List, Sequence, Tuple
2121

2222
from nanoarrow._lib import (
2323
DEVICE_CPU,
@@ -32,6 +32,7 @@
3232
from nanoarrow.c_schema import c_schema
3333
from nanoarrow.iterator import iter_array_views, iter_py, iter_tuples
3434
from nanoarrow.schema import Schema, _schema_repr
35+
from nanoarrow.visitor import to_columns, to_pylist
3536

3637
from nanoarrow import _repr_utils
3738

@@ -344,6 +345,42 @@ def iter_chunk_views(self) -> Iterable[CArrayView]:
344345
"""
345346
return iter_array_views(self)
346347

348+
def to_pylist(self) -> List:
349+
"""Convert this Array to a ``list()`` of Python objects
350+
351+
Computes an identical value to list(:meth:`iter_py`) but can be several
352+
times faster.
353+
354+
Examples
355+
--------
356+
357+
>>> import nanoarrow as na
358+
>>> array = na.Array([1, 2, 3], na.int32())
359+
>>> array.to_pylist()
360+
[1, 2, 3]
361+
"""
362+
return to_pylist(self)
363+
364+
def to_columns(self) -> Tuple[str, Sequence]:
365+
"""Convert this Array to a ``list()`` of sequences
366+
367+
Converts a stream of struct arrays into its column-wise representation
368+
such that each column is either a contiguous buffer or a ``list()``.
369+
370+
Examples
371+
--------
372+
373+
>>> import nanoarrow as na
374+
>>> import pyarrow as pa
375+
>>> array = na.Array(pa.record_batch([pa.array([1, 2, 3])], names=["col1"]))
376+
>>> names, columns = array.to_columns()
377+
>>> names
378+
['col1']
379+
>>> columns
380+
[[1, 2, 3]]
381+
"""
382+
return to_columns(self)
383+
347384
@property
348385
def n_children(self) -> int:
349386
"""Get the number of children for an Array of this type.

python/src/nanoarrow/array_stream.py

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,15 @@
1616
# under the License.
1717

1818
from functools import cached_property
19-
from typing import Iterable, Tuple
19+
from typing import Iterable, List, Sequence, Tuple
2020

2121
from nanoarrow._lib import CMaterializedArrayStream
2222
from nanoarrow._repr_utils import make_class_label
2323
from nanoarrow.array import Array
2424
from nanoarrow.c_array_stream import c_array_stream
2525
from nanoarrow.iterator import iter_py, iter_tuples
2626
from nanoarrow.schema import Schema, _schema_repr
27+
from nanoarrow.visitor import to_columns, to_pylist
2728

2829

2930
class ArrayStream:
@@ -198,6 +199,43 @@ def iter_tuples(self) -> Iterable[Tuple]:
198199
"""
199200
return iter_tuples(self)
200201

202+
def to_pylist(self) -> List:
203+
"""Convert this ArrayStream to a ``list()`` of Python objects
204+
205+
Computes an identical value to list(:meth:`iter_py`) but can be several
206+
times faster.
207+
208+
Examples
209+
--------
210+
211+
>>> import nanoarrow as na
212+
>>> stream = na.ArrayStream([1, 2, 3], na.int32())
213+
>>> stream.to_pylist()
214+
[1, 2, 3]
215+
"""
216+
return to_pylist(self)
217+
218+
def to_columns(self) -> Tuple[str, Sequence]:
219+
"""Convert this ArrayStream to a ``list()`` of sequences
220+
221+
Converts a stream of struct arrays into its column-wise representation
222+
such that each column is either a contiguous buffer or a ``list()``.
223+
224+
Examples
225+
--------
226+
227+
>>> import nanoarrow as na
228+
>>> import pyarrow as pa
229+
>>> batch = pa.record_batch([pa.array([1, 2, 3])], names=["col1"])
230+
>>> stream = na.ArrayStream(batch)
231+
>>> names, columns = stream.to_columns()
232+
>>> names
233+
['col1']
234+
>>> columns
235+
[[1, 2, 3]]
236+
"""
237+
return to_columns(self)
238+
201239
def __repr__(self) -> str:
202240
cls = make_class_label(self, "nanoarrow")
203241
schema_repr = _schema_repr(self.schema, prefix="", include_metadata=False)

python/src/nanoarrow/iterator.py

Lines changed: 44 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from nanoarrow._lib import CArrayView, CArrowType
2424
from nanoarrow.c_array_stream import c_array_stream
2525
from nanoarrow.c_schema import c_schema, c_schema_view
26+
from nanoarrow.schema import Schema
2627

2728

2829
def iter_py(obj, schema=None) -> Iterable:
@@ -130,47 +131,22 @@ class UnregisteredExtensionWarning(UserWarning):
130131

131132

132133
class ArrayViewBaseIterator:
133-
"""Base class for iterators that use an internal ArrowArrayView
134+
"""Base class for iterators and visitors that use an internal ArrowArrayView
134135
as the basis for conversion to Python objects. Intended for internal use.
135136
"""
136137

137-
@classmethod
138-
def get_iterator(cls, obj, schema=None):
139-
with c_array_stream(obj, schema=schema) as stream:
140-
iterator = cls(stream._get_cached_schema())
141-
for array in stream:
142-
iterator._set_array(array)
143-
yield from iterator._iter_chunk(0, len(array))
144-
145-
def __init__(self, schema, *, _array_view=None):
138+
def __init__(self, schema, *, array_view=None):
146139
self._schema = c_schema(schema)
147140
self._schema_view = c_schema_view(schema)
148141

149-
if _array_view is None:
142+
if array_view is None:
150143
self._array_view = CArrayView.from_schema(self._schema)
151144
else:
152-
self._array_view = _array_view
153-
154-
self._children = list(
155-
map(self._make_child, self._schema.children, self._array_view.children)
156-
)
157-
158-
if self._schema.dictionary is None:
159-
self._dictionary = None
160-
else:
161-
self._dictionary = self._make_child(
162-
self._schema.dictionary, self._array_view.dictionary
163-
)
164-
165-
def _make_child(self, schema, array_view):
166-
return type(self)(schema, _array_view=array_view)
167-
168-
def _iter_chunk(self, offset, length) -> Iterable:
169-
yield self._array_view
145+
self._array_view = array_view
170146

171147
@cached_property
172-
def _child_names(self):
173-
return [child.name for child in self._schema.children]
148+
def schema(self) -> Schema:
149+
return Schema(self._schema)
174150

175151
@cached_property
176152
def _object_label(self):
@@ -199,7 +175,41 @@ class PyIterator(ArrayViewBaseIterator):
199175
Intended for internal use.
200176
"""
201177

178+
@classmethod
179+
def get_iterator(cls, obj, schema=None):
180+
with c_array_stream(obj, schema=schema) as stream:
181+
iterator = cls(stream._get_cached_schema())
182+
for array in stream:
183+
iterator._set_array(array)
184+
yield from iterator
185+
186+
def __init__(self, schema, *, array_view=None):
187+
super().__init__(schema, array_view=array_view)
188+
189+
self._children = list(
190+
map(self._make_child, self._schema.children, self._array_view.children)
191+
)
192+
193+
if self._schema.dictionary is None:
194+
self._dictionary = None
195+
else:
196+
self._dictionary = self._make_child(
197+
self._schema.dictionary, self._array_view.dictionary
198+
)
199+
200+
def _make_child(self, schema, array_view):
201+
return type(self)(schema, array_view=array_view)
202+
203+
@cached_property
204+
def _child_names(self):
205+
return [child.name for child in self._schema.children]
206+
207+
def __iter__(self):
208+
"""Iterate over all elements in the current chunk"""
209+
return self._iter_chunk(0, len(self._array_view))
210+
202211
def _iter_chunk(self, offset, length):
212+
"""Iterate over all elements in a slice of the current chunk"""
203213
# Check for an extension type first since this isn't reflected by
204214
# self._schema_view.type_id. Currently we just return the storage
205215
# iterator with a warning for extension types.
@@ -480,16 +490,16 @@ class RowTupleIterator(PyIterator):
480490
Intended for internal use.
481491
"""
482492

483-
def __init__(self, schema, *, _array_view=None):
484-
super().__init__(schema, _array_view=_array_view)
493+
def __init__(self, schema, *, array_view=None):
494+
super().__init__(schema, array_view=array_view)
485495
if self._schema_view.type != "struct":
486496
raise TypeError(
487497
"RowTupleIterator can only iterate over struct arrays "
488498
f"(got '{self._schema_view.type}')"
489499
)
490500

491501
def _make_child(self, schema, array_view):
492-
return PyIterator(schema, _array_view=array_view)
502+
return PyIterator(schema, array_view=array_view)
493503

494504
def _iter_chunk(self, offset, length):
495505
return self._struct_tuple_iter(offset, length)

0 commit comments

Comments
 (0)