Skip to content

Commit 82ae16c

Browse files
authored
feat: teach utf8-type SparseArray to canonicalize (#3848)
Waiting on #3846 --------- Signed-off-by: Daniel King <[email protected]>
1 parent 4aa8940 commit 82ae16c

File tree

1 file changed

+235
-9
lines changed

1 file changed

+235
-9
lines changed

encodings/sparse/src/canonical.rs

Lines changed: 235 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,25 @@
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

44
use itertools::Itertools;
5+
use num_traits::NumCast;
56
use vortex_array::arrays::{
6-
BoolArray, BooleanBuffer, ConstantArray, NullArray, PrimitiveArray, StructArray,
7-
smallest_storage_type,
7+
BinaryView, BoolArray, BooleanBuffer, ConstantArray, NullArray, PrimitiveArray, StructArray,
8+
VarBinViewArray, smallest_storage_type,
89
};
910
use vortex_array::builders::{ArrayBuilder as _, DecimalBuilder};
1011
use vortex_array::patches::Patches;
1112
use vortex_array::validity::Validity;
1213
use vortex_array::vtable::CanonicalVTable;
13-
use vortex_array::{Array, Canonical};
14-
use vortex_buffer::buffer;
14+
use vortex_array::{Array, Canonical, ToCanonical as _};
15+
use vortex_buffer::{Buffer, buffer, buffer_mut};
1516
use vortex_dtype::{
16-
DType, DecimalDType, NativePType, Nullability, StructFields, match_each_native_ptype,
17+
DType, DecimalDType, NativePType, Nullability, StructFields, match_each_integer_ptype,
18+
match_each_native_ptype,
1719
};
1820
use vortex_error::{VortexError, VortexExpect as _, VortexResult, vortex_err};
1921
use vortex_scalar::{
20-
DecimalScalar, NativeDecimalType, Scalar, StructScalar, match_each_decimal_value_type,
22+
DecimalScalar, NativeDecimalType, Scalar, StructScalar, Utf8Scalar,
23+
match_each_decimal_value_type,
2124
};
2225

2326
use crate::{SparseArray, SparseVTable};
@@ -63,7 +66,21 @@ impl CanonicalVTable<SparseVTable> for SparseVTable {
6366
)
6467
})
6568
}
66-
DType::Utf8(_nullability) => todo!(),
69+
DType::Utf8(nullability) => {
70+
let patches = array.resolved_patches()?;
71+
let indices = patches.indices().to_primitive()?;
72+
let values = patches.values().to_varbinview()?;
73+
let fill_value = array.fill_scalar().as_utf8();
74+
let validity = array
75+
.validity_mask()
76+
.map(|x| Validity::from_mask(x, *nullability))?;
77+
let len = array.len();
78+
79+
match_each_integer_ptype!(indices.ptype(), |I| {
80+
let indices = indices.buffer::<I>();
81+
canonicalize_utf8::<I>(fill_value, indices, values, validity, len)
82+
})
83+
}
6784
DType::Binary(_nullability) => todo!(),
6885
DType::List(_dtype, _nullability) => todo!(),
6986
DType::Extension(_ext_dtype) => todo!(),
@@ -199,12 +216,51 @@ fn canonicalize_sparse_decimal<D: NativeDecimalType>(
199216
Ok(Canonical::Decimal(array))
200217
}
201218

219+
fn canonicalize_utf8<I: NativePType>(
220+
fill_value: Utf8Scalar,
221+
indices: Buffer<I>,
222+
values: VarBinViewArray,
223+
validity: Validity,
224+
len: usize,
225+
) -> VortexResult<Canonical> {
226+
let n_patch_buffers = values.buffers().len();
227+
let mut buffers = values.buffers().to_vec();
228+
229+
let fill = if let Some(buffer) = &fill_value.value() {
230+
buffers.push(buffer.inner().clone());
231+
BinaryView::make_view(
232+
buffer.as_ref(),
233+
u32::try_from(n_patch_buffers).vortex_expect("too many buffers"),
234+
0,
235+
)
236+
} else {
237+
// any <=12 character value will do
238+
BinaryView::make_view(&[], 0, 0)
239+
};
240+
241+
let mut views = buffer_mut![fill; len];
242+
for (patch_index, &patch) in indices.into_iter().zip_eq(values.views().iter()) {
243+
let patch_index_usize = <usize as NumCast>::from(patch_index)
244+
.vortex_expect("var bin view indices must fit in usize");
245+
views[patch_index_usize] = patch;
246+
}
247+
248+
let array = VarBinViewArray::try_new(
249+
views.freeze(),
250+
buffers,
251+
DType::Utf8(validity.nullability()),
252+
validity,
253+
)?;
254+
255+
Ok(Canonical::VarBinView(array))
256+
}
257+
202258
#[cfg(test)]
203259
mod test {
204-
205260
use rstest::rstest;
206261
use vortex_array::arrays::{
207-
BoolArray, BooleanBufferBuilder, DecimalArray, PrimitiveArray, StructArray,
262+
BoolArray, BooleanBufferBuilder, DecimalArray, PrimitiveArray, StructArray, VarBinArray,
263+
VarBinViewArray,
208264
};
209265
use vortex_array::arrow::IntoArrowArray as _;
210266
use vortex_array::validity::Validity;
@@ -511,4 +567,174 @@ mod test {
511567
assert_eq!(expected.data_type(), actual.data_type());
512568
assert_eq!(&expected, &actual);
513569
}
570+
571+
// A nullable view-encoded patch array combined with a non-null fill value ("123"):
// unpatched positions must read back as the fill value, patched nulls stay null.
#[test]
fn test_sparse_varbinview_non_null_fill() {
    let patch_values = [
        Some("hello"),
        Some("goodbye"),
        Some("hello"),
        None,
        Some("bonjour"),
        Some("你好"),
        None,
    ]
    .into_iter()
    .collect::<VarBinViewArray>()
    .into_array();
    let patch_indices = buffer![0u16, 3, 4, 5, 7, 9, 10].into_array();

    let sparse = SparseArray::try_new(
        patch_indices,
        patch_values,
        12,
        Scalar::from(Some("123".to_owned())),
    )
    .unwrap();

    let canonical = sparse.to_varbinview().unwrap().into_array();
    let dense = [
        Some("hello"),
        Some("123"),
        Some("123"),
        Some("goodbye"),
        Some("hello"),
        None,
        Some("123"),
        Some("bonjour"),
        Some("123"),
        Some("你好"),
        None,
        Some("123"),
    ]
    .into_iter()
    .collect::<VarBinViewArray>()
    .into_array();

    // Compare through Arrow so both the data type and the contents are checked.
    let actual = canonical.into_arrow_preferred().unwrap();
    let expected = dense.into_arrow_preferred().unwrap();

    assert_eq!(actual.data_type(), expected.data_type());
    assert_eq!(&actual, &expected);
}
615+
616+
// A nullable view-encoded patch array combined with a null fill value:
// every unpatched position must read back as null.
#[test]
fn test_sparse_varbinview_null_fill() {
    let patch_values = [
        Some("hello"),
        Some("goodbye"),
        Some("hello"),
        None,
        Some("bonjour"),
        Some("你好"),
        None,
    ]
    .into_iter()
    .collect::<VarBinViewArray>()
    .into_array();
    let patch_indices = buffer![0u16, 3, 4, 5, 7, 9, 10].into_array();

    let sparse = SparseArray::try_new(
        patch_indices,
        patch_values,
        12,
        Scalar::null(DType::Utf8(Nullable)),
    )
    .unwrap();

    let canonical = sparse.to_varbinview().unwrap().into_array();
    let dense = [
        Some("hello"),
        None,
        None,
        Some("goodbye"),
        Some("hello"),
        None,
        None,
        Some("bonjour"),
        None,
        Some("你好"),
        None,
        None,
    ]
    .into_iter()
    .collect::<VarBinViewArray>()
    .into_array();

    // Compare through Arrow so both the data type and the contents are checked.
    let actual = canonical.into_arrow_preferred().unwrap();
    let expected = dense.into_arrow_preferred().unwrap();

    assert_eq!(actual.data_type(), expected.data_type());
    assert_eq!(&actual, &expected);
}
660+
661+
// Patches built with `from_iter_str` (no null entries) and a plain string fill value:
// unpatched positions must read back as "123".
#[test]
fn test_sparse_varbinview_non_nullable() {
    let patch_values =
        VarBinViewArray::from_iter_str(["hello", "goodbye", "hello", "bonjour", "你好"])
            .into_array();
    let patch_indices = buffer![0u16, 3, 4, 5, 8].into_array();

    let sparse = SparseArray::try_new(
        patch_indices,
        patch_values,
        9,
        Scalar::from("123".to_owned()),
    )
    .unwrap();

    let canonical = sparse.to_varbinview().unwrap().into_array();
    let dense = [
        Some("hello"),
        Some("123"),
        Some("123"),
        Some("goodbye"),
        Some("hello"),
        Some("bonjour"),
        Some("123"),
        Some("123"),
        Some("你好"),
    ]
    .into_iter()
    .collect::<VarBinViewArray>()
    .into_array();

    // Compare through Arrow so both the data type and the contents are checked.
    let actual = canonical.into_arrow_preferred().unwrap();
    let expected = dense.into_arrow_preferred().unwrap();

    assert_eq!(actual.data_type(), expected.data_type());
    assert_eq!(&actual, &expected);
}
695+
696+
// Same scenario as the null-fill view test, but the patch values are supplied as a
// `VarBinArray`; the result is still requested via `to_varbinview`.
#[test]
fn test_sparse_varbin_null_fill() {
    let patch_values = [
        Some("hello"),
        Some("goodbye"),
        Some("hello"),
        None,
        Some("bonjour"),
        Some("你好"),
        None,
    ]
    .into_iter()
    .collect::<VarBinArray>()
    .into_array();
    let patch_indices = buffer![0u16, 3, 4, 5, 7, 9, 10].into_array();

    let sparse = SparseArray::try_new(
        patch_indices,
        patch_values,
        12,
        Scalar::null(DType::Utf8(Nullable)),
    )
    .unwrap();

    let canonical = sparse.to_varbinview().unwrap().into_array();
    let dense = [
        Some("hello"),
        None,
        None,
        Some("goodbye"),
        Some("hello"),
        None,
        None,
        Some("bonjour"),
        None,
        Some("你好"),
        None,
        None,
    ]
    .into_iter()
    .collect::<VarBinViewArray>()
    .into_array();

    // Compare through Arrow so both the data type and the contents are checked.
    let actual = canonical.into_arrow_preferred().unwrap();
    let expected = dense.into_arrow_preferred().unwrap();

    assert_eq!(actual.data_type(), expected.data_type());
    assert_eq!(&actual, &expected);
}
514740
}

0 commit comments

Comments
 (0)