Skip to content

Commit 4e7bba4

Browse files
Extend in_list benchmark coverage (#19376)
## Which issue does this PR close? - Related to #19241 ## Rationale for this change This PR enhances the `in_list` benchmark suite to provide more comprehensive performance measurements across a wider range of data types and list sizes. These improvements are necessary groundwork for evaluating optimizations proposed in #19241. The current benchmarks were limited in scope, making it difficult to assess the performance impact of potential `in_list` optimizations across different data types and scenarios. ## What changes are included in this PR? - Added benchmarks for `UInt8Array`, `Int16Array`, and `TimestampNanosecondArray` - Added `28` to `IN_LIST_LENGTHS` (now `[3, 8, 28, 100]`) to better cover the range between small and large lists - Increased `ARRAY_LENGTH` from `1024` to `8192` to be aligned with the default DataFusion batch size - Configured criterion with shorter warm-up (100ms) and measurement times (500ms) for faster iteration ## Are these changes tested? Yes, this PR adds benchmark coverage. The benchmarks can be run with: ```bash cargo bench --bench in_list ``` The benchmarks verify that the `in_list` expression evaluates correctly for all the new data types. ## Are there any user-facing changes? No user-facing changes. This PR only affects the benchmark suite used for performance testing and development.
1 parent 14cd71e commit 4e7bba4

File tree

1 file changed

+37
-5
lines changed

1 file changed

+37
-5
lines changed

datafusion/physical-expr/benches/in_list.rs

Lines changed: 37 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@
1616
// under the License.
1717

1818
use arrow::array::{
19-
Array, ArrayRef, Float32Array, Int32Array, StringArray, StringViewArray,
19+
Array, ArrayRef, Float32Array, Int16Array, Int32Array, StringArray, StringViewArray,
20+
TimestampNanosecondArray, UInt8Array,
2021
};
2122
use arrow::datatypes::{Field, Schema};
2223
use arrow::record_batch::RecordBatch;
@@ -28,6 +29,7 @@ use rand::prelude::*;
2829
use std::any::TypeId;
2930
use std::hint::black_box;
3031
use std::sync::Arc;
32+
use std::time::Duration;
3133

3234
/// Measures how long `in_list(col("a"), exprs)` takes to evaluate against a single RecordBatch.
3335
fn do_bench(c: &mut Criterion, name: &str, values: ArrayRef, exprs: &[ScalarValue]) {
@@ -47,10 +49,10 @@ fn random_string(rng: &mut StdRng, len: usize) -> String {
4749
String::from_utf8(value).unwrap()
4850
}
4951

50-
const IN_LIST_LENGTHS: [usize; 3] = [3, 8, 100];
52+
const IN_LIST_LENGTHS: [usize; 4] = [3, 8, 28, 100];
5153
const NULL_PERCENTS: [f64; 2] = [0., 0.2];
5254
const STRING_LENGTHS: [usize; 3] = [3, 12, 100];
53-
const ARRAY_LENGTH: usize = 1024;
55+
const ARRAY_LENGTH: usize = 8192;
5456

5557
/// Returns a friendly type name for the array type.
5658
fn array_type_name<A: 'static>() -> &'static str {
@@ -61,8 +63,14 @@ fn array_type_name<A: 'static>() -> &'static str {
6163
"Utf8View"
6264
} else if id == TypeId::of::<Float32Array>() {
6365
"Float32"
66+
} else if id == TypeId::of::<Int16Array>() {
67+
"Int16"
6468
} else if id == TypeId::of::<Int32Array>() {
6569
"Int32"
70+
} else if id == TypeId::of::<TimestampNanosecondArray>() {
71+
"TimestampNs"
72+
} else if id == TypeId::of::<UInt8Array>() {
73+
"UInt8"
6674
} else {
6775
"Unknown"
6876
}
@@ -142,7 +150,7 @@ fn bench_numeric_type<T, A>(
142150
}
143151
}
144152

145-
/// Entry point: registers in_list benchmarks for Utf8, Utf8View, Float32, and Int32 arrays.
153+
/// Entry point: registers in_list benchmarks for string and numeric array types.
146154
fn criterion_benchmark(c: &mut Criterion) {
147155
let mut rng = StdRng::seed_from_u64(120320);
148156

@@ -151,6 +159,18 @@ fn criterion_benchmark(c: &mut Criterion) {
151159
bench_string_type::<StringViewArray>(c, &mut rng, |s| ScalarValue::Utf8View(Some(s)));
152160

153161
// Benchmarks for numeric types
162+
bench_numeric_type::<u8, UInt8Array>(
163+
c,
164+
&mut rng,
165+
|rng| rng.random(),
166+
|v| ScalarValue::UInt8(Some(v)),
167+
);
168+
bench_numeric_type::<i16, Int16Array>(
169+
c,
170+
&mut rng,
171+
|rng| rng.random(),
172+
|v| ScalarValue::Int16(Some(v)),
173+
);
154174
bench_numeric_type::<f32, Float32Array>(
155175
c,
156176
&mut rng,
@@ -163,7 +183,19 @@ fn criterion_benchmark(c: &mut Criterion) {
163183
|rng| rng.random(),
164184
|v| ScalarValue::Int32(Some(v)),
165185
);
186+
bench_numeric_type::<i64, TimestampNanosecondArray>(
187+
c,
188+
&mut rng,
189+
|rng| rng.random(),
190+
|v| ScalarValue::TimestampNanosecond(Some(v), None),
191+
);
166192
}
167193

168-
criterion_group!(benches, criterion_benchmark);
194+
criterion_group! {
195+
name = benches;
196+
config = Criterion::default()
197+
.warm_up_time(Duration::from_millis(100))
198+
.measurement_time(Duration::from_millis(500));
199+
targets = criterion_benchmark
200+
}
169201
criterion_main!(benches);

0 commit comments

Comments
 (0)