Skip to content

Commit f2344d2

Browse files
authored
feat: Add Time/Interval/Decimal/Utf8View in aggregate fuzz testing (apache#13226)
* support Time/Interval/Decimal types in data generator. * introduce RandomNativeData trait. * fix bug. * support utf8view type in data generator. * fix clippy. * fix bug.
1 parent cc43766 commit f2344d2

File tree

8 files changed

+433
-93
lines changed

8 files changed

+433
-93
lines changed

datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs

Lines changed: 42 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@ use arrow::datatypes::DataType;
2323
use arrow::record_batch::RecordBatch;
2424
use arrow::util::pretty::pretty_format_batches;
2525
use arrow_array::types::Int64Type;
26+
use arrow_schema::{
27+
IntervalUnit, TimeUnit, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE,
28+
DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE,
29+
};
2630
use datafusion::common::Result;
2731
use datafusion::datasource::MemTable;
2832
use datafusion::physical_expr::aggregate::AggregateExprBuilder;
@@ -45,7 +49,7 @@ use crate::fuzz_cases::aggregation_fuzzer::{
4549
use datafusion_common::HashMap;
4650
use datafusion_physical_expr_common::sort_expr::LexOrdering;
4751
use rand::rngs::StdRng;
48-
use rand::{Rng, SeedableRng};
52+
use rand::{thread_rng, Rng, SeedableRng};
4953
use tokio::task::JoinSet;
5054

5155
// ========================================================================
@@ -151,6 +155,7 @@ async fn test_count() {
151155
/// 1. Floating point numbers
152156
/// 1. structured types
153157
fn baseline_config() -> DatasetGeneratorConfig {
158+
let mut rng = thread_rng();
154159
let columns = vec![
155160
ColumnDescr::new("i8", DataType::Int8),
156161
ColumnDescr::new("i16", DataType::Int16),
@@ -162,13 +167,45 @@ fn baseline_config() -> DatasetGeneratorConfig {
162167
ColumnDescr::new("u64", DataType::UInt64),
163168
ColumnDescr::new("date32", DataType::Date32),
164169
ColumnDescr::new("date64", DataType::Date64),
165-
// TODO: date/time columns
166-
// todo decimal columns
170+
ColumnDescr::new("time32_s", DataType::Time32(TimeUnit::Second)),
171+
ColumnDescr::new("time32_ms", DataType::Time32(TimeUnit::Millisecond)),
172+
ColumnDescr::new("time64_us", DataType::Time64(TimeUnit::Microsecond)),
173+
ColumnDescr::new("time64_ns", DataType::Time64(TimeUnit::Nanosecond)),
174+
ColumnDescr::new(
175+
"interval_year_month",
176+
DataType::Interval(IntervalUnit::YearMonth),
177+
),
178+
ColumnDescr::new(
179+
"interval_day_time",
180+
DataType::Interval(IntervalUnit::DayTime),
181+
),
182+
ColumnDescr::new(
183+
"interval_month_day_nano",
184+
DataType::Interval(IntervalUnit::MonthDayNano),
185+
),
186+
// begin decimal columns
187+
ColumnDescr::new("decimal128", {
188+
// Generate valid precision and scale for Decimal128 randomly.
189+
let precision: u8 = rng.gen_range(1..=DECIMAL128_MAX_PRECISION);
190+
// It's safe to cast `precision` to i8 type directly.
191+
let scale: i8 = rng.gen_range(
192+
i8::MIN..=std::cmp::min(precision as i8, DECIMAL128_MAX_SCALE),
193+
);
194+
DataType::Decimal128(precision, scale)
195+
}),
196+
ColumnDescr::new("decimal256", {
197+
// Generate valid precision and scale for Decimal256 randomly.
198+
let precision: u8 = rng.gen_range(1..=DECIMAL256_MAX_PRECISION);
199+
// It's safe to cast `precision` to i8 type directly.
200+
let scale: i8 = rng.gen_range(
201+
i8::MIN..=std::cmp::min(precision as i8, DECIMAL256_MAX_SCALE),
202+
);
203+
DataType::Decimal256(precision, scale)
204+
}),
167205
// begin string columns
168206
ColumnDescr::new("utf8", DataType::Utf8),
169207
ColumnDescr::new("largeutf8", DataType::LargeUtf8),
170-
// TODO add support for utf8view in data generator
171-
// ColumnDescr::new("utf8view", DataType::Utf8View),
208+
ColumnDescr::new("utf8view", DataType::Utf8View),
172209
// todo binary
173210
// low cardinality columns
174211
ColumnDescr::new("u8_low", DataType::UInt8).with_max_num_distinct(10),

datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs

Lines changed: 151 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,14 @@
1818
use std::sync::Arc;
1919

2020
use arrow::datatypes::{
21-
Date32Type, Date64Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type,
22-
Int8Type, UInt16Type, UInt32Type, UInt64Type, UInt8Type,
21+
ByteArrayType, ByteViewType, Date32Type, Date64Type, Decimal128Type, Decimal256Type,
22+
Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type,
23+
IntervalDayTimeType, IntervalMonthDayNanoType, IntervalYearMonthType, LargeUtf8Type,
24+
StringViewType, Time32MillisecondType, Time32SecondType, Time64MicrosecondType,
25+
Time64NanosecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, Utf8Type,
2326
};
2427
use arrow_array::{ArrayRef, RecordBatch};
25-
use arrow_schema::{DataType, Field, Schema};
28+
use arrow_schema::{DataType, Field, IntervalUnit, Schema, TimeUnit};
2629
use datafusion_common::{arrow_datafusion_err, DataFusionError, Result};
2730
use datafusion_physical_expr::{expressions::col, PhysicalSortExpr};
2831
use datafusion_physical_expr_common::sort_expr::LexOrdering;
@@ -32,7 +35,7 @@ use rand::{
3235
thread_rng, Rng, SeedableRng,
3336
};
3437
use test_utils::{
35-
array_gen::{PrimitiveArrayGenerator, StringArrayGenerator},
38+
array_gen::{DecimalArrayGenerator, PrimitiveArrayGenerator, StringArrayGenerator},
3639
stagger_batch,
3740
};
3841

@@ -219,7 +222,7 @@ struct RecordBatchGenerator {
219222
}
220223

221224
macro_rules! generate_string_array {
222-
($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT:expr, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $OFFSET_TYPE:ty) => {{
225+
($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT:expr, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $ARROW_TYPE: ident) => {{
223226
let null_pct_idx = $BATCH_GEN_RNG.gen_range(0..$SELF.candidate_null_pcts.len());
224227
let null_pct = $SELF.candidate_null_pcts[null_pct_idx];
225228
let max_len = $BATCH_GEN_RNG.gen_range(1..50);
@@ -232,25 +235,47 @@ macro_rules! generate_string_array {
232235
rng: $ARRAY_GEN_RNG,
233236
};
234237

235-
generator.gen_data::<$OFFSET_TYPE>()
238+
match $ARROW_TYPE::DATA_TYPE {
239+
DataType::Utf8 => generator.gen_data::<i32>(),
240+
DataType::LargeUtf8 => generator.gen_data::<i64>(),
241+
DataType::Utf8View => generator.gen_string_view(),
242+
_ => unreachable!(),
243+
}
244+
}};
245+
}
246+
247+
/// Expands to an expression that produces a random decimal array.
///
/// A null percentage is picked from `$SELF.candidate_null_pcts` at an index
/// drawn from `$BATCH_GEN_RNG`. A `DecimalArrayGenerator` is then built from
/// `$PRECISION`, `$SCALE`, `$NUM_ROWS`, `$MAX_NUM_DISTINCT`, that null
/// percentage and `$ARRAY_GEN_RNG`, and its `gen_data` is invoked with
/// `$ARROW_TYPE` as the type argument.
macro_rules! generate_decimal_array {
    (
        $SELF:ident,
        $NUM_ROWS:ident,
        $MAX_NUM_DISTINCT:expr,
        $BATCH_GEN_RNG:ident,
        $ARRAY_GEN_RNG:ident,
        $PRECISION:ident,
        $SCALE:ident,
        $ARROW_TYPE:ident
    ) => {{
        // Choose which of the candidate null percentages applies to this array.
        let pct_slot = $BATCH_GEN_RNG.gen_range(0..$SELF.candidate_null_pcts.len());
        let chosen_null_pct = $SELF.candidate_null_pcts[pct_slot];

        let mut decimal_gen = DecimalArrayGenerator {
            precision: $PRECISION,
            scale: $SCALE,
            num_decimals: $NUM_ROWS,
            num_distinct_decimals: $MAX_NUM_DISTINCT,
            null_pct: chosen_null_pct,
            rng: $ARRAY_GEN_RNG,
        };

        decimal_gen.gen_data::<$ARROW_TYPE>()
    }};
}
238264

239265
/// Expands to an expression that produces a random primitive array.
///
/// A null percentage is picked from `$SELF.candidate_null_pcts` at an index
/// drawn from `$BATCH_GEN_RNG`. A `PrimitiveArrayGenerator` is then built from
/// `$NUM_ROWS`, `$MAX_NUM_DISTINCT`, that null percentage and
/// `$ARRAY_GEN_RNG`, and its `gen_data` is invoked with `$ARROW_TYPE` as the
/// type argument.
macro_rules! generate_primitive_array {
    (
        $SELF:ident,
        $NUM_ROWS:ident,
        $MAX_NUM_DISTINCT:expr,
        $BATCH_GEN_RNG:ident,
        $ARRAY_GEN_RNG:ident,
        $ARROW_TYPE:ident
    ) => {{
        // Choose which of the candidate null percentages applies to this array.
        let pct_slot = $BATCH_GEN_RNG.gen_range(0..$SELF.candidate_null_pcts.len());
        let chosen_null_pct = $SELF.candidate_null_pcts[pct_slot];

        let mut primitive_gen = PrimitiveArrayGenerator {
            num_primitives: $NUM_ROWS,
            num_distinct_primitives: $MAX_NUM_DISTINCT,
            null_pct: chosen_null_pct,
            rng: $ARRAY_GEN_RNG,
        };

        primitive_gen.gen_data::<$ARROW_TYPE>()
    }};
}
255280

256281
impl RecordBatchGenerator {
@@ -432,14 +457,108 @@ impl RecordBatchGenerator {
432457
Date64Type
433458
)
434459
}
460+
DataType::Time32(TimeUnit::Second) => {
461+
generate_primitive_array!(
462+
self,
463+
num_rows,
464+
max_num_distinct,
465+
batch_gen_rng,
466+
array_gen_rng,
467+
Time32SecondType
468+
)
469+
}
470+
DataType::Time32(TimeUnit::Millisecond) => {
471+
generate_primitive_array!(
472+
self,
473+
num_rows,
474+
max_num_distinct,
475+
batch_gen_rng,
476+
array_gen_rng,
477+
Time32MillisecondType
478+
)
479+
}
480+
DataType::Time64(TimeUnit::Microsecond) => {
481+
generate_primitive_array!(
482+
self,
483+
num_rows,
484+
max_num_distinct,
485+
batch_gen_rng,
486+
array_gen_rng,
487+
Time64MicrosecondType
488+
)
489+
}
490+
DataType::Time64(TimeUnit::Nanosecond) => {
491+
generate_primitive_array!(
492+
self,
493+
num_rows,
494+
max_num_distinct,
495+
batch_gen_rng,
496+
array_gen_rng,
497+
Time64NanosecondType
498+
)
499+
}
500+
DataType::Interval(IntervalUnit::YearMonth) => {
501+
generate_primitive_array!(
502+
self,
503+
num_rows,
504+
max_num_distinct,
505+
batch_gen_rng,
506+
array_gen_rng,
507+
IntervalYearMonthType
508+
)
509+
}
510+
DataType::Interval(IntervalUnit::DayTime) => {
511+
generate_primitive_array!(
512+
self,
513+
num_rows,
514+
max_num_distinct,
515+
batch_gen_rng,
516+
array_gen_rng,
517+
IntervalDayTimeType
518+
)
519+
}
520+
DataType::Interval(IntervalUnit::MonthDayNano) => {
521+
generate_primitive_array!(
522+
self,
523+
num_rows,
524+
max_num_distinct,
525+
batch_gen_rng,
526+
array_gen_rng,
527+
IntervalMonthDayNanoType
528+
)
529+
}
530+
DataType::Decimal128(precision, scale) => {
531+
generate_decimal_array!(
532+
self,
533+
num_rows,
534+
max_num_distinct,
535+
batch_gen_rng,
536+
array_gen_rng,
537+
precision,
538+
scale,
539+
Decimal128Type
540+
)
541+
}
542+
DataType::Decimal256(precision, scale) => {
543+
generate_decimal_array!(
544+
self,
545+
num_rows,
546+
max_num_distinct,
547+
batch_gen_rng,
548+
array_gen_rng,
549+
precision,
550+
scale,
551+
Decimal256Type
552+
)
553+
}
435554
DataType::Utf8 => {
436555
generate_string_array!(
437556
self,
438557
num_rows,
439558
max_num_distinct,
440559
batch_gen_rng,
441560
array_gen_rng,
442-
i32
561+
Utf8Type
443562
)
444563
}
445564
DataType::LargeUtf8 => {
@@ -449,7 +568,17 @@ impl RecordBatchGenerator {
449568
max_num_distinct,
450569
batch_gen_rng,
451570
array_gen_rng,
452-
i64
571+
LargeUtf8Type
572+
)
573+
}
574+
DataType::Utf8View => {
575+
generate_string_array!(
576+
self,
577+
num_rows,
578+
max_num_distinct,
579+
batch_gen_rng,
580+
array_gen_rng,
581+
StringViewType
453582
)
454583
}
455584
_ => {

datafusion/functions-aggregate/src/min_max/min_max_bytes.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,10 @@ impl GroupsAccumulator for MinMaxBytesAccumulator {
338338
/// This is a heuristic to avoid allocating too many small buffers
339339
fn capacity_to_view_block_size(data_capacity: usize) -> u32 {
340340
let max_block_size = 2 * 1024 * 1024;
341+
// Avoid block size equal to zero when calling `with_fixed_block_size()`.
342+
if data_capacity == 0 {
343+
return 1;
344+
}
341345
if let Ok(block_size) = u32::try_from(data_capacity) {
342346
block_size.min(max_block_size)
343347
} else {
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use arrow::array::{ArrayRef, PrimitiveArray, PrimitiveBuilder, UInt32Array};
19+
use arrow::datatypes::DecimalType;
20+
use rand::rngs::StdRng;
21+
use rand::Rng;
22+
23+
use super::random_data::RandomNativeData;
24+
25+
/// Randomly generate decimal arrays
26+
pub struct DecimalArrayGenerator {
27+
/// The precision of the decimal type
28+
pub precision: u8,
29+
/// The scale of the decimal type
30+
pub scale: i8,
31+
/// The total number of decimals in the output
32+
pub num_decimals: usize,
33+
/// The number of distinct decimals in the columns
34+
pub num_distinct_decimals: usize,
35+
/// The percentage of nulls in the columns
36+
pub null_pct: f64,
37+
/// Random number generator
38+
pub rng: StdRng,
39+
}
40+
41+
impl DecimalArrayGenerator {
42+
/// Create a Decimal128Array / Decimal256Array with random values.
43+
pub fn gen_data<D>(&mut self) -> ArrayRef
44+
where
45+
D: DecimalType + RandomNativeData,
46+
{
47+
// table of decimals from which to draw
48+
let distinct_decimals: PrimitiveArray<D> = {
49+
let mut decimal_builder =
50+
PrimitiveBuilder::<D>::with_capacity(self.num_distinct_decimals);
51+
for _ in 0..self.num_distinct_decimals {
52+
decimal_builder
53+
.append_option(Some(D::generate_random_native_data(&mut self.rng)));
54+
}
55+
56+
decimal_builder
57+
.finish()
58+
.with_precision_and_scale(self.precision, self.scale)
59+
.unwrap()
60+
};
61+
62+
// pick num_decimals randomly from the distinct decimal table
63+
let indicies: UInt32Array = (0..self.num_decimals)
64+
.map(|_| {
65+
if self.rng.gen::<f64>() < self.null_pct {
66+
None
67+
} else if self.num_distinct_decimals > 1 {
68+
let range = 1..(self.num_distinct_decimals as u32);
69+
Some(self.rng.gen_range(range))
70+
} else {
71+
Some(0)
72+
}
73+
})
74+
.collect();
75+
76+
let options = None;
77+
arrow::compute::take(&distinct_decimals, &indicies, options).unwrap()
78+
}
79+
}

0 commit comments

Comments
 (0)