Skip to content

Commit c05480e

Browse files
committed
fix[vortex-dict]: avoid full materialization in min_max
Prior to this commit, min_max on dictionaries would materialize the full dictionary. This commit iterates over the codes and checks whether they fully cover the values slice, in which case min_max can be run on the values slice. Even if the codes do not fully cover the values, we can materialize without duplicates, by filtering the values slice to only the covered indices. Signed-off-by: Alfonso Subiotto Marques <[email protected]>
1 parent 32d0834 commit c05480e

File tree

3 files changed

+125
-7
lines changed

3 files changed

+125
-7
lines changed

Cargo.lock

Lines changed: 0 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

encodings/dict/Cargo.toml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,6 @@ arrow = ["dep:arrow-array"]
2020

2121
[dependencies]
2222
arrow-array = { workspace = true, optional = true }
23-
arrow-buffer = { workspace = true }
24-
num-traits = { workspace = true }
2523
prost = { workspace = true }
2624
# test-harness
2725
rand = { workspace = true, optional = true }
Lines changed: 125 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,138 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

4-
use vortex_array::compute::{MinMaxKernel, MinMaxKernelAdapter, MinMaxResult, min_max, take};
5-
use vortex_array::register_kernel;
4+
use vortex_array::compute::{MinMaxKernel, MinMaxKernelAdapter, MinMaxResult, mask, min_max};
5+
use vortex_array::{Array as _, ToCanonical, register_kernel};
6+
use vortex_buffer::BitBuffer;
7+
use vortex_dtype::match_each_unsigned_integer_ptype;
68
use vortex_error::VortexResult;
9+
use vortex_mask::Mask;
710

811
use crate::{DictArray, DictVTable};
912

1013
impl MinMaxKernel for DictVTable {
1114
fn min_max(&self, array: &DictArray) -> VortexResult<Option<MinMaxResult>> {
12-
min_max(&take(array.values(), array.codes())?)
15+
let codes_validity = array.codes().validity_mask();
16+
if codes_validity.all_false() {
17+
return Ok(None);
18+
}
19+
20+
let codes_primitive = array.codes().to_primitive();
21+
let values_len = array.values().len();
22+
match_each_unsigned_integer_ptype!(codes_primitive.ptype(), |P| {
23+
codes_validity.iter_bools(|validity_iter| {
24+
// mask() sets values to null where the mask is true, so we start
25+
// with a fully-set bool buffer.
26+
let mut unreferenced = vec![true; values_len];
27+
#[allow(clippy::cast_possible_truncation)]
28+
for (&code, is_valid) in codes_primitive.as_slice::<P>().iter().zip(validity_iter) {
29+
if is_valid {
30+
unreferenced[code as usize] = false;
31+
}
32+
}
33+
34+
let unreferenced_mask =
35+
Mask::from_buffer(BitBuffer::collect_bool(values_len, |i| unreferenced[i]));
36+
min_max(&mask(array.values(), &unreferenced_mask)?)
37+
})
38+
})
1339
}
1440
}
1541

1642
register_kernel!(MinMaxKernelAdapter(DictVTable).lift());
43+
44+
#[cfg(test)]
45+
mod tests {
46+
use rstest::rstest;
47+
use vortex_array::arrays::PrimitiveArray;
48+
use vortex_array::compute::min_max;
49+
use vortex_array::{Array, IntoArray};
50+
use vortex_buffer::buffer;
51+
52+
use crate::DictArray;
53+
use crate::builders::dict_encode;
54+
55+
fn assert_min_max(array: &dyn Array, expected: Option<(i32, i32)>) {
56+
match (min_max(array).unwrap(), expected) {
57+
(Some(result), Some((expected_min, expected_max))) => {
58+
assert_eq!(i32::try_from(result.min).unwrap(), expected_min);
59+
assert_eq!(i32::try_from(result.max).unwrap(), expected_max);
60+
}
61+
(None, None) => {}
62+
(got, expected) => panic!(
63+
"min_max mismatch: expected {:?}, got {:?}",
64+
expected,
65+
got.as_ref().map(|r| (
66+
i32::try_from(r.min.clone()).ok(),
67+
i32::try_from(r.max.clone()).ok()
68+
))
69+
),
70+
}
71+
}
72+
73+
#[rstest]
74+
#[case::covering(
75+
DictArray::try_new(
76+
buffer![0u32, 1, 2, 3, 0, 1].into_array(),
77+
buffer![10i32, 20, 30, 40].into_array(),
78+
).unwrap(),
79+
(10, 40)
80+
)]
81+
#[case::non_covering_duplicates(
82+
DictArray::try_new(
83+
buffer![1u32, 1, 1, 3, 3].into_array(),
84+
buffer![1i32, 2, 3, 4, 5].into_array(),
85+
).unwrap(),
86+
(2, 4)
87+
)]
88+
// Non-covering: codes with gaps
89+
#[case::non_covering_gaps(
90+
DictArray::try_new(
91+
buffer![0u32, 2, 4].into_array(),
92+
buffer![1i32, 2, 3, 4, 5].into_array(),
93+
).unwrap(),
94+
(1, 5)
95+
)]
96+
#[case::single(dict_encode(&buffer![42i32].into_array()).unwrap(), (42, 42))]
97+
#[case::nullable_codes(
98+
DictArray::try_new(
99+
PrimitiveArray::from_option_iter([Some(0u32), None, Some(1), Some(2)]).into_array(),
100+
buffer![10i32, 20, 30].into_array(),
101+
).unwrap(),
102+
(10, 30)
103+
)]
104+
#[case::nullable_values(
105+
dict_encode(
106+
PrimitiveArray::from_option_iter([Some(1i32), None, Some(2), Some(1), None]).as_ref()
107+
).unwrap(),
108+
(1, 2)
109+
)]
110+
fn test_min_max(#[case] dict: DictArray, #[case] expected: (i32, i32)) {
111+
assert_min_max(dict.as_ref(), Some(expected));
112+
}
113+
114+
#[test]
115+
fn test_sliced_dict() {
116+
let reference = PrimitiveArray::from_iter([1, 5, 10, 50, 100]);
117+
let dict = dict_encode(reference.as_ref()).unwrap();
118+
let sliced = dict.slice(1..3);
119+
assert_min_max(&sliced, Some((5, 10)));
120+
}
121+
122+
#[rstest]
123+
#[case::empty(
124+
DictArray::try_new(
125+
PrimitiveArray::from_iter(Vec::<u32>::new()).into_array(),
126+
buffer![10i32, 20, 30].into_array(),
127+
).unwrap()
128+
)]
129+
#[case::all_null_codes(
130+
DictArray::try_new(
131+
PrimitiveArray::from_option_iter([Option::<u32>::None, None, None]).into_array(),
132+
buffer![10i32, 20, 30].into_array(),
133+
).unwrap()
134+
)]
135+
fn test_min_max_none(#[case] dict: DictArray) {
136+
assert_min_max(dict.as_ref(), None);
137+
}
138+
}

0 commit comments

Comments
 (0)