Skip to content

Commit b329570

Browse files
committed
feat: teach utf8-type SparseArray to canonicalize
Signed-off-by: Daniel King <[email protected]>
1 parent 4aa8940 commit b329570

File tree

1 file changed

+218
-9
lines changed

1 file changed

+218
-9
lines changed

encodings/sparse/src/canonical.rs

Lines changed: 218 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,25 @@
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

44
use itertools::Itertools;
5+
use num_traits::NumCast;
56
use vortex_array::arrays::{
6-
BoolArray, BooleanBuffer, ConstantArray, NullArray, PrimitiveArray, StructArray,
7-
smallest_storage_type,
7+
BinaryView, BoolArray, BooleanBuffer, ConstantArray, NullArray, PrimitiveArray, StructArray,
8+
VarBinViewArray, smallest_storage_type,
89
};
910
use vortex_array::builders::{ArrayBuilder as _, DecimalBuilder};
1011
use vortex_array::patches::Patches;
1112
use vortex_array::validity::Validity;
1213
use vortex_array::vtable::CanonicalVTable;
13-
use vortex_array::{Array, Canonical};
14-
use vortex_buffer::buffer;
14+
use vortex_array::{Array, Canonical, ToCanonical as _};
15+
use vortex_buffer::{Buffer, BufferMut, buffer};
1516
use vortex_dtype::{
16-
DType, DecimalDType, NativePType, Nullability, StructFields, match_each_native_ptype,
17+
DType, DecimalDType, NativePType, Nullability, StructFields, match_each_integer_ptype,
18+
match_each_native_ptype,
1719
};
1820
use vortex_error::{VortexError, VortexExpect as _, VortexResult, vortex_err};
1921
use vortex_scalar::{
20-
DecimalScalar, NativeDecimalType, Scalar, StructScalar, match_each_decimal_value_type,
22+
DecimalScalar, NativeDecimalType, Scalar, StructScalar, Utf8Scalar,
23+
match_each_decimal_value_type,
2124
};
2225

2326
use crate::{SparseArray, SparseVTable};
@@ -63,7 +66,21 @@ impl CanonicalVTable<SparseVTable> for SparseVTable {
6366
)
6467
})
6568
}
66-
DType::Utf8(_nullability) => todo!(),
69+
DType::Utf8(nullability) => {
70+
let patches = array.resolved_patches()?;
71+
let indices = patches.indices().to_primitive()?;
72+
let values = patches.values().to_varbinview()?;
73+
let fill_value = array.fill_scalar().as_utf8();
74+
let validity = array
75+
.validity_mask()
76+
.map(|x| Validity::from_mask(x, *nullability))?;
77+
let len = array.len();
78+
79+
match_each_integer_ptype!(indices.ptype(), |I| {
80+
let indices = indices.buffer::<I>();
81+
canonicalize_utf8::<I>(fill_value, indices, values, validity, len)
82+
})
83+
}
6784
DType::Binary(_nullability) => todo!(),
6885
DType::List(_dtype, _nullability) => todo!(),
6986
DType::Extension(_ext_dtype) => todo!(),
@@ -199,12 +216,59 @@ fn canonicalize_sparse_decimal<D: NativeDecimalType>(
199216
Ok(Canonical::Decimal(array))
200217
}
201218

219+
fn canonicalize_utf8<I: NativePType>(
220+
fill_value: Utf8Scalar,
221+
indices: Buffer<I>,
222+
values: VarBinViewArray,
223+
validity: Validity,
224+
len: usize,
225+
) -> VortexResult<Canonical> {
226+
let n_patch_buffers = values.buffers().len();
227+
let mut buffers = values.buffers().to_vec();
228+
229+
let fill = if let Some(buffer) = &fill_value.value() {
230+
buffers.push(buffer.inner().clone());
231+
BinaryView::make_view(
232+
buffer.as_ref(),
233+
u32::try_from(n_patch_buffers).vortex_expect("too many buffers"),
234+
0,
235+
)
236+
} else {
237+
// any <=12 character value will do
238+
BinaryView::make_view("FILL_VALUE".as_ref(), 0, 0)
239+
};
240+
241+
let mut view_builder = BufferMut::<BinaryView>::with_capacity(len);
242+
let mut i = 0usize;
243+
for (patch_index, &patch) in indices.into_iter().zip_eq(values.views().iter()) {
244+
let patch_index_usize = <usize as NumCast>::from(patch_index)
245+
.vortex_expect("var bin view indices must fit in usize");
246+
for _ in i..patch_index_usize {
247+
view_builder.push(fill)
248+
}
249+
view_builder.push(patch);
250+
i = patch_index_usize + 1;
251+
}
252+
for _ in i..len {
253+
view_builder.push(fill)
254+
}
255+
256+
let array = VarBinViewArray::try_new(
257+
view_builder.freeze(),
258+
buffers,
259+
DType::Utf8(validity.nullability()),
260+
validity,
261+
)?;
262+
263+
Ok(Canonical::VarBinView(array))
264+
}
265+
202266
#[cfg(test)]
203267
mod test {
204-
268+
use itertools::Itertools as _;
205269
use rstest::rstest;
206270
use vortex_array::arrays::{
207-
BoolArray, BooleanBufferBuilder, DecimalArray, PrimitiveArray, StructArray,
271+
BoolArray, BooleanBufferBuilder, DecimalArray, PrimitiveArray, StructArray, VarBinViewArray,
208272
};
209273
use vortex_array::arrow::IntoArrowArray as _;
210274
use vortex_array::validity::Validity;
@@ -511,4 +575,149 @@ mod test {
511575
assert_eq!(expected.data_type(), actual.data_type());
512576
assert_eq!(&expected, &actual);
513577
}
578+
579+
/// A nullable UTF-8 sparse array with a non-null fill value ("123") canonicalizes to a
/// dense array where unpatched slots hold the fill and null patches stay null.
#[test]
fn test_sparse_varbinview_non_null_fill() {
    // Seven patch values, two of them null.
    let patch_values = [
        Some("hello"),
        Some("goodbye"),
        Some("hello"),
        None,
        Some("bonjour"),
        Some("你好"),
        None,
    ]
    .into_iter()
    .collect::<VarBinViewArray>()
    .into_array();
    let patch_indices = buffer![0u16, 3, 4, 5, 7, 9, 10].into_array();

    let sparse = SparseArray::try_new(
        patch_indices,
        patch_values,
        12,
        Scalar::from(Some("123".to_owned())),
    )
    .unwrap();

    // Dense form: patches at their indices, "123" everywhere else.
    let want = [
        Some("hello"),
        Some("123"),
        Some("123"),
        Some("goodbye"),
        Some("hello"),
        None,
        Some("123"),
        Some("bonjour"),
        Some("123"),
        Some("你好"),
        None,
        Some("123"),
    ]
    .into_iter()
    .collect::<VarBinViewArray>()
    .into_array()
    .into_arrow_preferred()
    .unwrap();

    let got = sparse
        .to_varbinview()
        .unwrap()
        .into_array()
        .into_arrow_preferred()
        .unwrap();

    assert_eq!(got.data_type(), want.data_type());
    assert_eq!(&got, &want);
}
623+
624+
/// A nullable UTF-8 sparse array with a null fill value canonicalizes to a dense array
/// where every unpatched slot (and every null patch) is null.
#[test]
fn test_sparse_varbinview_null_fill() {
    // Seven patch values, two of them null.
    let strings = [
        Some("hello"),
        Some("goodbye"),
        Some("hello"),
        None,
        Some("bonjour"),
        Some("你好"),
        None,
    ]
    .into_iter()
    .collect::<VarBinViewArray>()
    .into_array();

    // Diagnostic output: the patch values, comma separated.
    let rendered_values = (0..strings.len())
        .map(|idx| strings.scalar_at(idx).unwrap())
        .join(",");
    println!("strings: {}", rendered_values);

    // Diagnostic output: the patch validity as T / F / N (N when the bool has no value).
    let va = strings.validity_mask().unwrap().into_array();
    let rendered_validity = (0..va.len())
        .map(|idx| {
            match va.scalar_at(idx).unwrap().as_bool().value() {
                Some(true) => "T",
                Some(false) => "F",
                None => "N",
            }
            .to_owned()
        })
        .join(",");
    println!("strings_validity: {}", rendered_validity);

    let patch_indices = buffer![0u16, 3, 4, 5, 7, 9, 10].into_array();
    let sparse = SparseArray::try_new(
        patch_indices,
        strings,
        12,
        Scalar::null(DType::Utf8(Nullable)),
    )
    .unwrap();

    // Dense form: patches at their indices, null everywhere else.
    let want = [
        Some("hello"),
        None,
        None,
        Some("goodbye"),
        Some("hello"),
        None,
        None,
        Some("bonjour"),
        None,
        Some("你好"),
        None,
        None,
    ]
    .into_iter()
    .collect::<VarBinViewArray>()
    .into_array()
    .into_arrow_preferred()
    .unwrap();

    let got = sparse
        .to_varbinview()
        .unwrap()
        .into_array()
        .into_arrow_preferred()
        .unwrap();

    assert_eq!(got.data_type(), want.data_type());
    assert_eq!(&got, &want);
}
688+
689+
/// A sparse array built from `from_iter_str` patches and a plain (non-`Option`) "123"
/// fill scalar canonicalizes to a dense array with the fill in every unpatched slot.
#[test]
fn test_sparse_varbinview_non_nullable() {
    let patch_values =
        VarBinViewArray::from_iter_str(["hello", "goodbye", "hello", "bonjour", "你好"])
            .into_array();
    let patch_indices = buffer![0u16, 3, 4, 5, 8].into_array();

    let sparse = SparseArray::try_new(
        patch_indices,
        patch_values,
        9,
        Scalar::from("123".to_owned()),
    )
    .unwrap();

    // Dense form: patches at indices 0, 3, 4, 5 and 8; "123" in the remaining slots.
    let want = [
        Some("hello"),
        Some("123"),
        Some("123"),
        Some("goodbye"),
        Some("hello"),
        Some("bonjour"),
        Some("123"),
        Some("123"),
        Some("你好"),
    ]
    .into_iter()
    .collect::<VarBinViewArray>()
    .into_array()
    .into_arrow_preferred()
    .unwrap();

    let got = sparse
        .to_varbinview()
        .unwrap()
        .into_array()
        .into_arrow_preferred()
        .unwrap();

    assert_eq!(got.data_type(), want.data_type());
    assert_eq!(&got, &want);
}
514723
}

0 commit comments

Comments
 (0)