Skip to content

Commit 5467a28

Browse files
authored
feat: Add boolean column to aggregate queries for fuzz testing (#13331)
* add bool col * clippy fix * remove change * fmt fix * typo fix
1 parent f894c7d commit 5467a28

File tree

4 files changed

+106
-5
lines changed

4 files changed

+106
-5
lines changed

datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,7 @@ fn baseline_config() -> DatasetGeneratorConfig {
225225
// low cardinality columns
226226
ColumnDescr::new("u8_low", DataType::UInt8).with_max_num_distinct(10),
227227
ColumnDescr::new("utf8_low", DataType::Utf8).with_max_num_distinct(10),
228+
ColumnDescr::new("bool", DataType::Boolean),
228229
ColumnDescr::new("binary", DataType::Binary),
229230
ColumnDescr::new("large_binary", DataType::LargeBinary),
230231
ColumnDescr::new("binaryview", DataType::BinaryView),

datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs

Lines changed: 35 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,9 @@
1818
use std::sync::Arc;
1919

2020
use arrow::datatypes::{
21-
BinaryType, BinaryViewType, ByteArrayType, ByteViewType, Date32Type, Date64Type,
22-
Decimal128Type, Decimal256Type, Float32Type, Float64Type, Int16Type, Int32Type,
23-
Int64Type, Int8Type, IntervalDayTimeType, IntervalMonthDayNanoType,
21+
BinaryType, BinaryViewType, BooleanType, ByteArrayType, ByteViewType, Date32Type,
22+
Date64Type, Decimal128Type, Decimal256Type, Float32Type, Float64Type, Int16Type,
23+
Int32Type, Int64Type, Int8Type, IntervalDayTimeType, IntervalMonthDayNanoType,
2424
IntervalYearMonthType, LargeBinaryType, LargeUtf8Type, StringViewType,
2525
Time32MillisecondType, Time32SecondType, Time64MicrosecondType, Time64NanosecondType,
2626
TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType,
@@ -38,8 +38,8 @@ use rand::{
3838
};
3939
use test_utils::{
4040
array_gen::{
41-
BinaryArrayGenerator, DecimalArrayGenerator, PrimitiveArrayGenerator,
42-
StringArrayGenerator,
41+
BinaryArrayGenerator, BooleanArrayGenerator, DecimalArrayGenerator,
42+
PrimitiveArrayGenerator, StringArrayGenerator,
4343
},
4444
stagger_batch,
4545
};
@@ -269,6 +269,26 @@ macro_rules! generate_decimal_array {
269269
}};
270270
}
271271

272+
// Generates a `BooleanArray` column. Booleans get a dedicated macro because
// `BooleanArray` is a special (bit-packed) type in Arrow.
//
// Arguments:
// - `$SELF`: value exposing `candidate_null_pcts`, the null percentages to pick from
// - `$NUM_ROWS`: number of booleans to generate
// - `$MAX_NUM_DISTINCT`: upper bound on distinct values; anything `>= 2` yields 2, otherwise 1
// - `$BATCH_GEN_RNG`: RNG used only to choose the null percentage
// - `$ARRAY_GEN_RNG`: RNG moved into the `BooleanArrayGenerator`
// - `$ARROW_TYPE`: type forwarded as the type argument of `gen_data`
273+
macro_rules! generate_boolean_array {
274+
($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT:expr, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $ARROW_TYPE: ident) => {{
275+
// Select a null percentage from the candidate percentages
276+
let null_pct_idx = $BATCH_GEN_RNG.gen_range(0..$SELF.candidate_null_pcts.len());
277+
let null_pct = $SELF.candidate_null_pcts[null_pct_idx];
278+
279+
// A boolean column can hold at most two distinct values, so clamp the
// requested maximum to either 1 or 2.
let num_distinct_booleans = if $MAX_NUM_DISTINCT >= 2 { 2 } else { 1 };
280+
281+
let mut generator = BooleanArrayGenerator {
282+
num_booleans: $NUM_ROWS,
283+
num_distinct_booleans,
284+
null_pct,
285+
rng: $ARRAY_GEN_RNG,
286+
};
287+
288+
generator.gen_data::<$ARROW_TYPE>()
289+
}};
290+
}
291+
272292
macro_rules! generate_primitive_array {
273293
($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT:expr, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $ARROW_TYPE:ident) => {{
274294
let null_pct_idx = $BATCH_GEN_RNG.gen_range(0..$SELF.candidate_null_pcts.len());
@@ -689,6 +709,16 @@ impl RecordBatchGenerator {
689709
StringViewType
690710
)
691711
}
712+
DataType::Boolean => {
713+
generate_boolean_array! {
714+
self,
715+
num_rows,
716+
max_num_distinct,
717+
batch_gen_rng,
718+
array_gen_rng,
719+
BooleanType
720+
}
721+
}
692722
_ => {
693723
panic!("Unsupported data generator type: {}", col.column_type)
694724
}
test-utils/src/array_gen/boolean.rs

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use arrow::array::{ArrayRef, BooleanArray, BooleanBuilder, UInt32Array};
19+
use arrow::compute::take;
20+
use rand::rngs::StdRng;
21+
use rand::Rng;
22+
23+
/// Randomly generate boolean arrays
///
/// All configuration is held in public fields; `gen_data` reads them each
/// time it is called and advances `rng`.
24+
pub struct BooleanArrayGenerator {
25+
/// Number of index slots drawn by `gen_data`, i.e. the requested row count.
pub num_booleans: usize,
26+
/// Number of distinct boolean values to draw from. `gen_data` only
/// handles `1` or `2` and hits `unreachable!()` for any other value.
pub num_distinct_booleans: usize,
27+
/// Threshold compared against a random `f64` for every row; when the
/// random draw is below it, the row's index is `None`.
pub null_pct: f64,
28+
/// Source of randomness for both the value choice and the null decision.
pub rng: StdRng,
29+
}
30+
31+
impl BooleanArrayGenerator {
32+
/// Generate BooleanArray with bit-packed values
///
/// Builds a small lookup array holding the distinct boolean values, draws
/// `num_booleans` optional indices into it, and materializes the result
/// with `take`.
///
/// The type parameter `D` is not referenced anywhere in the body.
///
/// # Panics
///
/// Panics via `unreachable!()` if `num_distinct_booleans` is neither 1 nor
/// 2, and via `unwrap()` if `take` returns an error.
33+
pub fn gen_data<D>(&mut self) -> ArrayRef {
34+
// Table of booleans from which to draw (distinct means 1 or 2)
35+
let distinct_booleans: BooleanArray = match self.num_distinct_booleans {
36+
// Single distinct value: choose it at random once; every non-null
// row then refers to this one entry.
1 => {
37+
let value = self.rng.gen::<bool>();
38+
let mut builder = BooleanBuilder::with_capacity(1);
39+
builder.append_value(value);
40+
builder.finish()
41+
}
42+
// Two distinct values: index 0 is `true`, index 1 is `false`.
2 => {
43+
let mut builder = BooleanBuilder::with_capacity(2);
44+
builder.append_value(true);
45+
builder.append_value(false);
46+
builder.finish()
47+
}
48+
_ => unreachable!(),
49+
};
50+
51+
// Generate indices to select from the distinct booleans
52+
let indices: UInt32Array = (0..self.num_booleans)
53+
.map(|_| {
54+
// The null decision is made first, so a null row consumes one
// random draw and no value index is generated for it.
// NOTE(review): relies on `take` turning a `None` index into a
// null output slot — confirm against the Arrow documentation.
if self.rng.gen::<f64>() < self.null_pct {
55+
None
56+
} else if self.num_distinct_booleans > 1 {
57+
Some(self.rng.gen_range(0..self.num_distinct_booleans as u32))
58+
} else {
59+
// Only one entry exists in the table, so no random draw is needed.
Some(0)
60+
}
61+
})
62+
.collect();
63+
64+
// No explicit take options are supplied.
let options = None;
65+
66+
take(&distinct_booleans, &indices, options).unwrap()
67+
}
68+
}

test-utils/src/array_gen/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,14 @@
1616
// under the License.
1717

1818
mod binary;
19+
mod boolean;
1920
mod decimal;
2021
mod primitive;
2122
mod random_data;
2223
mod string;
2324

2425
pub use binary::BinaryArrayGenerator;
26+
pub use boolean::BooleanArrayGenerator;
2527
pub use decimal::DecimalArrayGenerator;
2628
pub use primitive::PrimitiveArrayGenerator;
2729
pub use string::StringArrayGenerator;

0 commit comments

Comments
 (0)