Skip to content

Commit 3230048

Browse files
zhuqi-lucas2010YOUY01martin-g
authored
Add benchmark for array_has/array_has_all/array_has_any (#18729)
## Which issue does this PR close? Add benchmark for array_has/array_has_all/array_has_any before optimization. - Closes part of [#18727](#18727) ## Rationale for this change <!-- Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed. Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes. --> ## What changes are included in this PR? <!-- There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR. --> ## Are these changes tested? <!-- We typically require tests for all PRs in order to: 1. Prevent the code from being accidentally broken by subsequent changes 2. Serve as another way to document the expected behavior of the code If tests are not included in your PR, please explain why (for example, are they covered by existing tests)? --> ## Are there any user-facing changes? <!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. --> <!-- If there are any breaking changes to public APIs, please add the `api change` label. --> --------- Co-authored-by: Yongting You <[email protected]> Co-authored-by: Martin Grigorov <[email protected]>
1 parent 76b9e12 commit 3230048

File tree

2 files changed

+381
-0
lines changed

2 files changed

+381
-0
lines changed

datafusion/functions-nested/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,10 @@ rand = { workspace = true }
6969
harness = false
7070
name = "array_expression"
7171

72+
[[bench]]
73+
harness = false
74+
name = "array_has"
75+
7276
[[bench]]
7377
harness = false
7478
name = "array_reverse"
Lines changed: 377 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,377 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#[macro_use]
19+
extern crate criterion;
20+
21+
use criterion::{BenchmarkId, Criterion};
22+
use datafusion_expr::lit;
23+
use datafusion_functions_nested::expr_fn::{
24+
array_has, array_has_all, array_has_any, make_array,
25+
};
26+
use std::hint::black_box;
27+
28+
// If not explicitly stated, `array` and `array_size` refer to the haystack array.
29+
fn criterion_benchmark(c: &mut Criterion) {
30+
// Test different array sizes
31+
let array_sizes = vec![1, 10, 100, 1000, 10000];
32+
33+
for &size in &array_sizes {
34+
bench_array_has(c, size);
35+
bench_array_has_all(c, size);
36+
bench_array_has_any(c, size);
37+
}
38+
39+
// Specific benchmarks for string arrays (common use case)
40+
bench_array_has_strings(c);
41+
bench_array_has_all_strings(c);
42+
bench_array_has_any_strings(c);
43+
44+
// Edge cases
45+
bench_array_has_edge_cases(c);
46+
}
47+
48+
fn bench_array_has(c: &mut Criterion, array_size: usize) {
49+
let mut group = c.benchmark_group("array_has_i64");
50+
51+
// Benchmark: element found at beginning
52+
group.bench_with_input(
53+
BenchmarkId::new("found_at_start", array_size),
54+
&array_size,
55+
|b, &size| {
56+
let array = (0..size).map(|i| lit(i as i64)).collect::<Vec<_>>();
57+
let list_array = make_array(array);
58+
let needle = lit(0_i64);
59+
60+
b.iter(|| black_box(array_has(list_array.clone(), needle.clone())))
61+
},
62+
);
63+
64+
// Benchmark: element found at end
65+
group.bench_with_input(
66+
BenchmarkId::new("found_at_end", array_size),
67+
&array_size,
68+
|b, &size| {
69+
let array = (0..size).map(|i| lit(i as i64)).collect::<Vec<_>>();
70+
let list_array = make_array(array);
71+
let needle = lit((size - 1) as i64);
72+
73+
b.iter(|| black_box(array_has(list_array.clone(), needle.clone())))
74+
},
75+
);
76+
77+
// Benchmark: element not found
78+
group.bench_with_input(
79+
BenchmarkId::new("not_found", array_size),
80+
&array_size,
81+
|b, &size| {
82+
let array = (0..size).map(|i| lit(i as i64)).collect::<Vec<_>>();
83+
let list_array = make_array(array);
84+
let needle = lit(-1_i64); // Not in array
85+
86+
b.iter(|| black_box(array_has(list_array.clone(), needle.clone())))
87+
},
88+
);
89+
90+
group.finish();
91+
}
92+
93+
fn bench_array_has_all(c: &mut Criterion, array_size: usize) {
94+
let mut group = c.benchmark_group("array_has_all");
95+
96+
// Benchmark: all elements found (small needle)
97+
group.bench_with_input(
98+
BenchmarkId::new("all_found_small_needle", array_size),
99+
&array_size,
100+
|b, &size| {
101+
let array = (0..size).map(|i| lit(i as i64)).collect::<Vec<_>>();
102+
let list_array = make_array(array);
103+
let needle_array = make_array(vec![lit(0_i64), lit(1_i64), lit(2_i64)]);
104+
105+
b.iter(|| black_box(array_has_all(list_array.clone(), needle_array.clone())))
106+
},
107+
);
108+
109+
// Benchmark: all elements found (medium needle - 10% of haystack)
110+
group.bench_with_input(
111+
BenchmarkId::new("all_found_medium_needle", array_size),
112+
&array_size,
113+
|b, &size| {
114+
let array = (0..size).map(|i| lit(i as i64)).collect::<Vec<_>>();
115+
let list_array = make_array(array);
116+
let needle_size = (size / 10).max(1);
117+
let needle = (0..needle_size).map(|i| lit(i as i64)).collect::<Vec<_>>();
118+
let needle_array = make_array(needle);
119+
120+
b.iter(|| black_box(array_has_all(list_array.clone(), needle_array.clone())))
121+
},
122+
);
123+
124+
// Benchmark: not all found (early exit)
125+
group.bench_with_input(
126+
BenchmarkId::new("early_exit", array_size),
127+
&array_size,
128+
|b, &size| {
129+
let array = (0..size).map(|i| lit(i as i64)).collect::<Vec<_>>();
130+
let list_array = make_array(array);
131+
let needle_array = make_array(vec![lit(0_i64), lit(-1_i64)]); // -1 not in array
132+
133+
b.iter(|| black_box(array_has_all(list_array.clone(), needle_array.clone())))
134+
},
135+
);
136+
137+
group.finish();
138+
}
139+
140+
fn bench_array_has_any(c: &mut Criterion, array_size: usize) {
141+
let mut group = c.benchmark_group("array_has_any");
142+
143+
// Benchmark: first element matches (best case)
144+
group.bench_with_input(
145+
BenchmarkId::new("first_match", array_size),
146+
&array_size,
147+
|b, &size| {
148+
let array = (0..size).map(|i| lit(i as i64)).collect::<Vec<_>>();
149+
let list_array = make_array(array);
150+
let needle_array = make_array(vec![lit(0_i64), lit(-1_i64), lit(-2_i64)]);
151+
152+
b.iter(|| black_box(array_has_any(list_array.clone(), needle_array.clone())))
153+
},
154+
);
155+
156+
// Benchmark: last element matches (worst case)
157+
group.bench_with_input(
158+
BenchmarkId::new("last_match", array_size),
159+
&array_size,
160+
|b, &size| {
161+
let array = (0..size).map(|i| lit(i as i64)).collect::<Vec<_>>();
162+
let list_array = make_array(array);
163+
let needle_array = make_array(vec![lit(-1_i64), lit(-2_i64), lit(0_i64)]);
164+
165+
b.iter(|| black_box(array_has_any(list_array.clone(), needle_array.clone())))
166+
},
167+
);
168+
169+
// Benchmark: no match
170+
group.bench_with_input(
171+
BenchmarkId::new("no_match", array_size),
172+
&array_size,
173+
|b, &size| {
174+
let array = (0..size).map(|i| lit(i as i64)).collect::<Vec<_>>();
175+
let list_array = make_array(array);
176+
let needle_array = make_array(vec![lit(-1_i64), lit(-2_i64), lit(-3_i64)]);
177+
178+
b.iter(|| black_box(array_has_any(list_array.clone(), needle_array.clone())))
179+
},
180+
);
181+
182+
group.finish();
183+
}
184+
185+
fn bench_array_has_strings(c: &mut Criterion) {
186+
let mut group = c.benchmark_group("array_has_strings");
187+
188+
// Benchmark with string arrays (common use case for tickers, tags, etc.)
189+
let sizes = vec![10, 100, 1000];
190+
191+
for &size in &sizes {
192+
group.bench_with_input(BenchmarkId::new("found", size), &size, |b, &size| {
193+
let array = (0..size)
194+
.map(|i| lit(format!("TICKER{i:04}")))
195+
.collect::<Vec<_>>();
196+
let list_array = make_array(array);
197+
let needle = lit("TICKER0005");
198+
199+
b.iter(|| black_box(array_has(list_array.clone(), needle.clone())))
200+
});
201+
202+
group.bench_with_input(BenchmarkId::new("not_found", size), &size, |b, &size| {
203+
let array = (0..size)
204+
.map(|i| lit(format!("TICKER{i:04}")))
205+
.collect::<Vec<_>>();
206+
let list_array = make_array(array);
207+
let needle = lit("NOTFOUND");
208+
209+
b.iter(|| black_box(array_has(list_array.clone(), needle.clone())))
210+
});
211+
}
212+
213+
group.finish();
214+
}
215+
216+
fn bench_array_has_all_strings(c: &mut Criterion) {
217+
let mut group = c.benchmark_group("array_has_all_strings");
218+
219+
// Realistic scenario: checking if a portfolio contains certain tickers
220+
let portfolio_size = 100;
221+
let check_sizes = vec![1, 3, 5, 10];
222+
223+
for &check_size in &check_sizes {
224+
group.bench_with_input(
225+
BenchmarkId::new("all_found", check_size),
226+
&check_size,
227+
|b, &check_size| {
228+
let portfolio = (0..portfolio_size)
229+
.map(|i| lit(format!("TICKER{i:04}")))
230+
.collect::<Vec<_>>();
231+
let list_array = make_array(portfolio);
232+
233+
let checking = (0..check_size)
234+
.map(|i| lit(format!("TICKER{i:04}")))
235+
.collect::<Vec<_>>();
236+
let needle_array = make_array(checking);
237+
238+
b.iter(|| {
239+
black_box(array_has_all(list_array.clone(), needle_array.clone()))
240+
})
241+
},
242+
);
243+
244+
group.bench_with_input(
245+
BenchmarkId::new("some_missing", check_size),
246+
&check_size,
247+
|b, &check_size| {
248+
let portfolio = (0..portfolio_size)
249+
.map(|i| lit(format!("TICKER{i:04}")))
250+
.collect::<Vec<_>>();
251+
let list_array = make_array(portfolio);
252+
253+
let mut checking = (0..check_size - 1)
254+
.map(|i| lit(format!("TICKER{i:04}")))
255+
.collect::<Vec<_>>();
256+
checking.push(lit("NOTFOUND".to_string()));
257+
let needle_array = make_array(checking);
258+
259+
b.iter(|| {
260+
black_box(array_has_all(list_array.clone(), needle_array.clone()))
261+
})
262+
},
263+
);
264+
}
265+
266+
group.finish();
267+
}
268+
269+
fn bench_array_has_any_strings(c: &mut Criterion) {
270+
let mut group = c.benchmark_group("array_has_any_strings");
271+
272+
let portfolio_size = 100;
273+
let check_sizes = vec![1, 3, 5, 10];
274+
275+
for &check_size in &check_sizes {
276+
group.bench_with_input(
277+
BenchmarkId::new("first_matches", check_size),
278+
&check_size,
279+
|b, &check_size| {
280+
let portfolio = (0..portfolio_size)
281+
.map(|i| lit(format!("TICKER{i:04}")))
282+
.collect::<Vec<_>>();
283+
let list_array = make_array(portfolio);
284+
285+
let mut checking = vec![lit("TICKER0000".to_string())];
286+
checking.extend((1..check_size).map(|_| lit("NOTFOUND".to_string())));
287+
let needle_array = make_array(checking);
288+
289+
b.iter(|| {
290+
black_box(array_has_any(list_array.clone(), needle_array.clone()))
291+
})
292+
},
293+
);
294+
295+
group.bench_with_input(
296+
BenchmarkId::new("none_match", check_size),
297+
&check_size,
298+
|b, &check_size| {
299+
let portfolio = (0..portfolio_size)
300+
.map(|i| lit(format!("TICKER{i:04}")))
301+
.collect::<Vec<_>>();
302+
let list_array = make_array(portfolio);
303+
304+
let checking = (0..check_size)
305+
.map(|i| lit(format!("NOTFOUND{i}")))
306+
.collect::<Vec<_>>();
307+
let needle_array = make_array(checking);
308+
309+
b.iter(|| {
310+
black_box(array_has_any(list_array.clone(), needle_array.clone()))
311+
})
312+
},
313+
);
314+
}
315+
316+
group.finish();
317+
}
318+
319+
fn bench_array_has_edge_cases(c: &mut Criterion) {
320+
let mut group = c.benchmark_group("array_has_edge_cases");
321+
322+
// Empty array
323+
group.bench_function("empty_array", |b| {
324+
let list_array = make_array(vec![]);
325+
let needle = lit(1_i64);
326+
327+
b.iter(|| black_box(array_has(list_array.clone(), needle.clone())))
328+
});
329+
330+
// Single element array - found
331+
group.bench_function("single_element_found", |b| {
332+
let list_array = make_array(vec![lit(1_i64)]);
333+
let needle = lit(1_i64);
334+
335+
b.iter(|| black_box(array_has(list_array.clone(), needle.clone())))
336+
});
337+
338+
// Single element array - not found
339+
group.bench_function("single_element_not_found", |b| {
340+
let list_array = make_array(vec![lit(1_i64)]);
341+
let needle = lit(2_i64);
342+
343+
b.iter(|| black_box(array_has(list_array.clone(), needle.clone())))
344+
});
345+
346+
// Array with duplicates
347+
group.bench_function("array_with_duplicates", |b| {
348+
let array = vec![lit(1_i64); 1000];
349+
let list_array = make_array(array);
350+
let needle = lit(1_i64);
351+
352+
b.iter(|| black_box(array_has(list_array.clone(), needle.clone())))
353+
});
354+
355+
// array_has_all: empty needle
356+
group.bench_function("array_has_all_empty_needle", |b| {
357+
let array = (0..1000).map(|i| lit(i as i64)).collect::<Vec<_>>();
358+
let list_array = make_array(array);
359+
let needle_array = make_array(vec![]);
360+
361+
b.iter(|| black_box(array_has_all(list_array.clone(), needle_array.clone())))
362+
});
363+
364+
// array_has_any: empty needle
365+
group.bench_function("array_has_any_empty_needle", |b| {
366+
let array = (0..1000).map(|i| lit(i as i64)).collect::<Vec<_>>();
367+
let list_array = make_array(array);
368+
let needle_array = make_array(vec![]);
369+
370+
b.iter(|| black_box(array_has_any(list_array.clone(), needle_array.clone())))
371+
});
372+
373+
group.finish();
374+
}
375+
376+
// Criterion wiring: collect `criterion_benchmark` into a group named
// `benches`, then hand that group to `criterion_main!`.
criterion_group!(benches, criterion_benchmark);
377+
criterion_main!(benches);

0 commit comments

Comments
 (0)