Skip to content

Commit 1cedf0a

Browse files
authored
Dict compute to use codes if values are too large (#2804)
Fix #2796
1 parent 991ba02 commit 1cedf0a

File tree

4 files changed

+63
-5
lines changed

4 files changed

+63
-5
lines changed

encodings/dict/benches/dict_compare.rs

Lines changed: 50 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,15 @@ use std::str::from_utf8;
44

55
use vortex_array::accessor::ArrayAccessor;
66
use vortex_array::arrays::{ConstantArray, VarBinArray, VarBinViewArray};
7-
use vortex_array::compute::{Operator, compare};
7+
use vortex_array::compute::{Operator, compare, slice};
88
use vortex_dict::builders::dict_encode;
99
use vortex_dict::test::{gen_primitive_for_dict, gen_varbin_words};
1010

1111
fn main() {
1212
divan::main();
1313
}
1414

15-
const BENCH_ARGS: &[(usize, usize)] = &[
15+
const LENGTH_AND_UNIQUE_VALUES: &[(usize, usize)] = &[
1616
// length, unique_values
1717
(10_000, 2),
1818
(10_000, 4),
@@ -30,7 +30,7 @@ const BENCH_ARGS: &[(usize, usize)] = &[
3030
(100_000, 2048),
3131
];
3232

33-
#[divan::bench(args = BENCH_ARGS)]
33+
#[divan::bench(args = LENGTH_AND_UNIQUE_VALUES)]
3434
fn bench_compare_primitive(bencher: divan::Bencher, (len, uniqueness): (usize, usize)) {
3535
let primitive_arr = gen_primitive_for_dict::<i32>(len, uniqueness);
3636
let dict = dict_encode(&primitive_arr).unwrap();
@@ -41,7 +41,7 @@ fn bench_compare_primitive(bencher: divan::Bencher, (len, uniqueness): (usize, u
4141
.bench_refs(|dict| compare(dict, &ConstantArray::new(value, len), Operator::Eq).unwrap())
4242
}
4343

44-
#[divan::bench(args = BENCH_ARGS)]
44+
#[divan::bench(args = LENGTH_AND_UNIQUE_VALUES)]
4545
fn bench_compare_varbin(bencher: divan::Bencher, (len, uniqueness): (usize, usize)) {
4646
let varbin_arr = VarBinArray::from(gen_varbin_words(len, uniqueness));
4747
let dict = dict_encode(&varbin_arr).unwrap();
@@ -55,7 +55,7 @@ fn bench_compare_varbin(bencher: divan::Bencher, (len, uniqueness): (usize, usiz
5555
.bench_refs(|dict| compare(dict, &ConstantArray::new(value, len), Operator::Eq).unwrap())
5656
}
5757

58-
#[divan::bench(args = BENCH_ARGS)]
58+
#[divan::bench(args = LENGTH_AND_UNIQUE_VALUES)]
5959
fn bench_compare_varbinview(bencher: divan::Bencher, (len, uniqueness): (usize, usize)) {
6060
let varbinview_arr = VarBinViewArray::from_iter_str(gen_varbin_words(len, uniqueness));
6161
let dict = dict_encode(&varbinview_arr).unwrap();
@@ -67,3 +67,48 @@ fn bench_compare_varbinview(bencher: divan::Bencher, (len, uniqueness): (usize,
6767
.with_inputs(|| dict.clone())
6868
.bench_refs(|dict| compare(dict, &ConstantArray::new(value, len), Operator::Eq).unwrap())
6969
}
70+
71+
/// `(codes_len, values_len)` argument pairs for the sliced-dictionary benchmarks.
///
/// `values_len` is fixed at 10_000 while `codes_len` ranges from 1_000 to 20_000, so the
/// cases cover fewer codes than values, (nearly) equal counts, and more codes than values.
const CODES_AND_VALUES_LENGTHS: &[(usize, usize)] = &[
72+
(1_000, 10_000),
73+
(2_000, 10_000),
74+
(2_500, 10_000),
75+
(3_333, 10_000),
76+
(5_000, 10_000),
77+
(7_500, 10_000),
78+
(9_999, 10_000),
79+
(10_000, 10_000),
80+
(20_000, 10_000),
81+
];
82+
83+
/// Benchmarks `compare(.., Operator::Eq)` of a sliced `i32` dictionary against a constant.
///
/// The dictionary is built from `max(codes_len, values_len)` generated elements and then
/// sliced down to its first `codes_len` elements before being compared.
#[divan::bench(args = CODES_AND_VALUES_LENGTHS)]
84+
fn bench_compare_sliced_dict_primitive(
85+
bencher: divan::Bencher,
86+
(codes_len, values_len): (usize, usize),
87+
) {
88+
// Generate at least `values_len` elements; `values_len` is passed as the second argument,
// presumably the number of unique values — TODO confirm against `gen_primitive_for_dict`.
let primitive_arr = gen_primitive_for_dict::<i32>(codes_len.max(values_len), values_len);
89+
let dict = dict_encode(&primitive_arr).unwrap();
90+
// NOTE(review): presumably slicing shortens only the codes and leaves the dictionary
// values intact, which is what lets values outnumber codes — confirm.
let dict = slice(&dict, 0, codes_len).unwrap();
91+
// Compare against the first generated element.
let value = primitive_arr.as_slice::<i32>()[0];
92+
93+
bencher.with_inputs(|| dict.clone()).bench_refs(|dict| {
94+
compare(dict, &ConstantArray::new(value, codes_len), Operator::Eq).unwrap()
95+
})
96+
}
97+
98+
/// Benchmarks `compare(.., Operator::Eq)` of a sliced string dictionary against a constant.
///
/// The dictionary is built from `max(codes_len, values_len)` generated words and then
/// sliced down to its first `codes_len` elements before being compared.
///
/// NOTE(review): despite the `varbinview` suffix, the input is built with `VarBinArray::from`
/// rather than `VarBinViewArray::from_iter_str` (as `bench_compare_varbinview` does) —
/// confirm whether this is intentional.
#[divan::bench(args = CODES_AND_VALUES_LENGTHS)]
99+
fn bench_compare_sliced_dict_varbinview(
100+
bencher: divan::Bencher,
101+
(codes_len, values_len): (usize, usize),
102+
) {
103+
let varbin_arr = VarBinArray::from(gen_varbin_words(codes_len.max(values_len), values_len));
104+
let dict = dict_encode(&varbin_arr).unwrap();
105+
let dict = slice(&dict, 0, codes_len).unwrap();
106+
// Copy the bytes of the first generated word; it becomes the constant to compare against.
let bytes = varbin_arr
107+
.with_iterator(|i| i.next().unwrap().unwrap().to_vec())
108+
.unwrap();
109+
let value = from_utf8(bytes.as_slice()).unwrap();
110+
111+
bencher.with_inputs(|| dict.clone()).bench_refs(|dict| {
112+
compare(dict, &ConstantArray::new(value, codes_len), Operator::Eq).unwrap()
113+
})
114+
}

encodings/dict/src/compute/binary_numeric.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,11 @@ impl BinaryNumericFn<&DictArray> for DictEncoding {
1313
rhs: &dyn Array,
1414
op: BinaryNumericOperator,
1515
) -> VortexResult<Option<ArrayRef>> {
16+
// If the dictionary has more values than codes, it is faster to canonicalise first, so return `None` instead of operating on the values.
17+
if array.values().len() > array.codes().len() {
18+
return Ok(None);
19+
}
20+
1621
let Some(rhs_scalar) = rhs.as_constant() else {
1722
return Ok(None);
1823
};

encodings/dict/src/compute/compare.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@ impl CompareFn<&DictArray> for DictEncoding {
1717
rhs: &dyn Array,
1818
operator: Operator,
1919
) -> VortexResult<Option<ArrayRef>> {
20+
// If the dictionary has more values than codes, it is faster to canonicalise first, so return `None` instead of comparing the values.
21+
if lhs.values().len() > lhs.codes().len() {
22+
return Ok(None);
23+
}
2024
// If the RHS is constant, then we just need to compare against our encoded values.
2125
if let Some(rhs) = rhs.as_constant() {
2226
let compare_result = compare(

encodings/dict/src/compute/like.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@ impl LikeFn<&DictArray> for DictEncoding {
1212
pattern: &dyn Array,
1313
options: LikeOptions,
1414
) -> VortexResult<Option<ArrayRef>> {
15+
// If the dictionary has more values than codes, it is faster to canonicalise first, so return `None` instead of matching the values.
16+
if array.values().len() > array.codes().len() {
17+
return Ok(None);
18+
}
1519
if let Some(pattern) = pattern.as_constant() {
1620
let pattern = ConstantArray::new(pattern, array.values().len()).into_array();
1721
let values = like(array.values(), &pattern, options)?;

0 commit comments

Comments
 (0)