Skip to content

Commit 825f6dc

Browse files
Add lists to vortex (#1524)
This adds a simple implementation of lists, based on VarBin. I hope to unify parts of the two implementations later.
1 parent 4855ff2 commit 825f6dc

File tree

11 files changed

+483
-21
lines changed

11 files changed

+483
-21
lines changed

vortex-array/src/array/arbitrary.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ fn random_array(u: &mut Unstructured, dtype: &DType, len: Option<usize>) -> Resu
8181
.vortex_unwrap()
8282
.into_array())
8383
}
84+
// TODO(joe): add arbitrary list
8485
DType::List(..) => {
8586
todo!("List arrays are not implemented")
8687
}

vortex-array/src/array/chunked/canonical.rs

Lines changed: 95 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,18 @@
11
use arrow_buffer::{BooleanBufferBuilder, Buffer, MutableBuffer, ScalarBuffer};
2-
use vortex_dtype::{DType, PType, StructDType};
2+
use vortex_dtype::{DType, Nullability, PType, StructDType};
33
use vortex_error::{vortex_bail, vortex_err, ErrString, VortexExpect, VortexResult};
44

55
use crate::array::chunked::ChunkedArray;
66
use crate::array::extension::ExtensionArray;
77
use crate::array::null::NullArray;
88
use crate::array::primitive::PrimitiveArray;
99
use crate::array::struct_::StructArray;
10-
use crate::array::{BinaryView, BoolArray, VarBinViewArray};
10+
use crate::array::{BinaryView, BoolArray, ListArray, VarBinViewArray};
11+
use crate::compute::{scalar_at, slice, try_cast};
1112
use crate::validity::Validity;
1213
use crate::{
13-
ArrayDType, ArrayData, ArrayValidity, Canonical, IntoArrayData, IntoArrayVariant, IntoCanonical,
14+
ArrayDType, ArrayData, ArrayLen, ArrayValidity, Canonical, IntoArrayData, IntoArrayVariant,
15+
IntoCanonical,
1416
};
1517

1618
impl IntoCanonical for ChunkedArray {
@@ -88,9 +90,11 @@ pub(crate) fn try_canonicalize_chunks(
8890
)))
8991
}
9092

91-
// TODO(aduffy): better list support
9293
DType::List(..) => {
93-
todo!()
94+
// TODO(joe): improve performance, use a listview, once it exists
95+
96+
let list = pack_lists(chunks.as_slice(), validity, dtype)?;
97+
Ok(Canonical::List(list))
9498
}
9599

96100
DType::Bool(_) => {
@@ -117,6 +121,50 @@ pub(crate) fn try_canonicalize_chunks(
117121
}
118122
}
119123

124+
fn pack_lists(chunks: &[ArrayData], validity: Validity, dtype: &DType) -> VortexResult<ListArray> {
125+
let len: usize = chunks.iter().map(|c| c.len()).sum();
126+
let mut offsets = Vec::with_capacity(len + 1);
127+
offsets.push(0);
128+
let mut elements = Vec::new();
129+
let elem_dtype = dtype
130+
.as_list_element()
131+
.vortex_expect("ListArray must have List dtype");
132+
133+
for chunk in chunks {
134+
let chunk = chunk.clone().into_list()?;
135+
// TODO: handle i32 offsets if they fit.
136+
let offsets_arr = try_cast(
137+
chunk.offsets(),
138+
&DType::Primitive(PType::I64, Nullability::NonNullable),
139+
)?
140+
.into_primitive()?;
141+
142+
let first_offset_value: usize = usize::try_from(&scalar_at(offsets_arr.as_ref(), 0)?)?;
143+
let last_offset_value: usize =
144+
usize::try_from(&scalar_at(offsets_arr.as_ref(), offsets_arr.len() - 1)?)?;
145+
elements.push(slice(
146+
chunk.elements(),
147+
first_offset_value,
148+
last_offset_value,
149+
)?);
150+
151+
let adjustment_from_previous = *offsets
152+
.last()
153+
.ok_or_else(|| vortex_err!("List offsets must have at least one element"))?;
154+
offsets.extend(
155+
offsets_arr
156+
.maybe_null_slice::<i64>()
157+
.iter()
158+
.skip(1)
159+
.map(|off| off + adjustment_from_previous - first_offset_value as i64),
160+
);
161+
}
162+
let chunked_elements = ChunkedArray::try_new(elements, elem_dtype.clone())?.into_array();
163+
let offsets = PrimitiveArray::from_vec(offsets, Validity::NonNullable);
164+
165+
ListArray::try_new(chunked_elements, offsets.into_array(), validity)
166+
}
167+
120168
/// Swizzle the pointers within a ChunkedArray of StructArrays to instead be a single
121169
/// StructArray, where the Array for each Field is a ChunkedArray.
122170
///
@@ -238,12 +286,17 @@ fn pack_views(
238286

239287
#[cfg(test)]
240288
mod tests {
241-
use vortex_dtype::{DType, Nullability};
289+
use std::sync::Arc;
290+
291+
use vortex_dtype::DType;
292+
use vortex_dtype::DType::{List, Primitive};
293+
use vortex_dtype::Nullability::NonNullable;
294+
use vortex_dtype::PType::I32;
242295

243296
use crate::accessor::ArrayAccessor;
244297
use crate::array::chunked::canonical::pack_views;
245-
use crate::array::{ChunkedArray, StructArray, VarBinViewArray};
246-
use crate::compute::slice;
298+
use crate::array::{ChunkedArray, ListArray, StructArray, VarBinViewArray};
299+
use crate::compute::{scalar_at, slice};
247300
use crate::validity::Validity;
248301
use crate::variants::StructArrayTrait;
249302
use crate::{ArrayDType, ArrayLen, IntoArrayData, IntoArrayVariant, ToArrayData};
@@ -258,7 +311,7 @@ mod tests {
258311
let array2 = slice(stringview_array().as_ref(), 2, 4).unwrap();
259312
let packed = pack_views(
260313
&[array1, array2],
261-
&DType::Utf8(Nullability::NonNullable),
314+
&DType::Utf8(NonNullable),
262315
Validity::NonNullable,
263316
)
264317
.unwrap();
@@ -308,4 +361,37 @@ mod tests {
308361
.unwrap();
309362
assert_eq!(orig_values, canon_values);
310363
}
364+
365+
/// Canonicalizing a chunked array of two single-list chunks must yield a list
/// array whose entries equal the first entry of each original chunk.
#[test]
pub fn pack_nested_lists() {
    // The offsets [0, 3] cover only the first three elements, so the trailing
    // `4` is not part of any list in this chunk.
    let first = ListArray::try_new(
        vec![1, 2, 3, 4].into_array(),
        vec![0, 3].into_array(),
        Validity::NonNullable,
    )
    .unwrap();

    let second = ListArray::try_new(
        vec![5, 6].into_array(),
        vec![0, 2].into_array(),
        Validity::NonNullable,
    )
    .unwrap();

    let list_dtype = List(Arc::new(Primitive(I32, NonNullable)), NonNullable);
    let chunked = ChunkedArray::try_new(
        vec![first.clone().into_array(), second.clone().into_array()],
        list_dtype,
    );

    let canonical = chunked.unwrap().into_list().unwrap();

    // Entry 0 of the packed array comes from the first chunk ...
    assert_eq!(
        scalar_at(first, 0).unwrap(),
        scalar_at(canonical.clone(), 0).unwrap()
    );
    // ... and entry 1 comes from the second chunk.
    assert_eq!(
        scalar_at(second, 0).unwrap(),
        scalar_at(canonical, 1).unwrap()
    );
}
311397
}
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
use std::sync::Arc;
2+
3+
use itertools::Itertools;
4+
use vortex_error::VortexResult;
5+
use vortex_scalar::Scalar;
6+
7+
use crate::array::{ListArray, ListEncoding};
8+
use crate::compute::{scalar_at, slice, ComputeVTable, ScalarAtFn, SliceFn};
9+
use crate::{ArrayDType, ArrayData, IntoArrayData};
10+
11+
impl ComputeVTable for ListEncoding {
12+
fn scalar_at_fn(&self) -> Option<&dyn ScalarAtFn<ArrayData>> {
13+
Some(self)
14+
}
15+
16+
fn slice_fn(&self) -> Option<&dyn SliceFn<ArrayData>> {
17+
Some(self)
18+
}
19+
}
20+
21+
impl ScalarAtFn<ListArray> for ListEncoding {
22+
fn scalar_at(&self, array: &ListArray, index: usize) -> VortexResult<Scalar> {
23+
let elem = array.elements_at(index)?;
24+
let scalars: Vec<Scalar> = (0..elem.len()).map(|i| scalar_at(&elem, i)).try_collect()?;
25+
26+
Ok(Scalar::list(Arc::new(elem.dtype().clone()), scalars))
27+
}
28+
}
29+
30+
impl SliceFn<ListArray> for ListEncoding {
31+
fn slice(&self, array: &ListArray, start: usize, stop: usize) -> VortexResult<ArrayData> {
32+
Ok(ListArray::try_new(
33+
array.elements(),
34+
slice(array.offsets(), start, stop + 1)?,
35+
array.validity().slice(start, stop)?,
36+
)?
37+
.into_array())
38+
}
39+
}

0 commit comments

Comments
 (0)