Skip to content

Commit a497074

Browse files
authored
bench: fix benchmark table to actually generate a lot of unique values (#17967)
* fix: actually generate a lot of unique values in benchmark table

  Also added a benchmark for testing pure grouping performance for more than 1 column.

  ----

  I ran this query on the data:

  ```
  SELECT
    COUNT(*) AS total_count,
    COUNT(DISTINCT u64_wide) AS unique_count,
    COUNT(DISTINCT u64_wide) * 1.0 / COUNT(*) AS cardinality
  FROM t;
  ```

  Before:

  ```
  | total_count | unique_count | cardinality |
  | ----------- | ------------ | ----------- |
  | 65536       | 2048         | 0.03125     |
  ```

  After:

  ```
  | total_count | unique_count | cardinality |
  | ----------- | ------------ | ----------- |
  | 65536       | 65536        | 1.0         |
  ```

* format

* added multi group by benchmark on primitive only columns
1 parent 590ad29 commit a497074

File tree

2 files changed

+38
-5
lines changed

2 files changed

+38
-5
lines changed

datafusion/core/benches/aggregate_query_sql.rs

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,38 @@ fn criterion_benchmark(c: &mut Criterion) {
153153
})
154154
});
155155

156+
c.bench_function(
157+
"aggregate_query_group_by_wide_u64_and_string_without_aggregate_expressions",
158+
|b| {
159+
b.iter(|| {
160+
query(
161+
ctx.clone(),
162+
&rt,
163+
// Due to the large number of distinct values in u64_wide,
164+
// this query tests the actual grouping performance for more than 1 column
165+
"SELECT u64_wide, utf8 \
166+
FROM t GROUP BY u64_wide, utf8",
167+
)
168+
})
169+
},
170+
);
171+
172+
c.bench_function(
173+
"aggregate_query_group_by_wide_u64_and_f32_without_aggregate_expressions",
174+
|b| {
175+
b.iter(|| {
176+
query(
177+
ctx.clone(),
178+
&rt,
179+
// Due to the large number of distinct values in u64_wide,
180+
// this query tests the actual grouping performance for more than 1 column
181+
"SELECT u64_wide, f32 \
182+
FROM t GROUP BY u64_wide, f32",
183+
)
184+
})
185+
},
186+
);
187+
156188
c.bench_function("aggregate_query_approx_percentile_cont_on_u64", |b| {
157189
b.iter(|| {
158190
query(

datafusion/core/benches/data_utils/mod.rs

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -81,10 +81,11 @@ fn create_data(size: usize, null_density: f64) -> Vec<Option<f64>> {
8181
.collect()
8282
}
8383

84-
fn create_integer_data(size: usize, value_density: f64) -> Vec<Option<u64>> {
85-
// use random numbers to avoid spurious compiler optimizations with respect to branching
86-
let mut rng = StdRng::seed_from_u64(42);
87-
84+
fn create_integer_data(
85+
rng: &mut StdRng,
86+
size: usize,
87+
value_density: f64,
88+
) -> Vec<Option<u64>> {
8889
(0..size)
8990
.map(|_| {
9091
if rng.random::<f64>() > value_density {
@@ -116,7 +117,7 @@ fn create_record_batch(
116117
let values = create_data(batch_size, 0.5);
117118

118119
// Integer values between [0, u64::MAX].
119-
let integer_values_wide = create_integer_data(batch_size, 9.0);
120+
let integer_values_wide = create_integer_data(rng, batch_size, 9.0);
120121

121122
// Integer values between [0, 9].
122123
let integer_values_narrow = (0..batch_size)

0 commit comments

Comments
 (0)