Skip to content

Commit 31d27c2

Browse files
authored
feat: Added Timestamp/Binary/Float to fuzz (#13280)
* Added Timestamp/Binary/Float to fuzz * clippy fix * small fix * remove todo * remove todo
1 parent 1557fce commit 31d27c2

File tree

6 files changed

+244
-14
lines changed

6 files changed

+244
-14
lines changed

datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ use datafusion_common::HashMap;
5050
use datafusion_physical_expr_common::sort_expr::LexOrdering;
5151
use rand::rngs::StdRng;
5252
use rand::{thread_rng, Rng, SeedableRng};
53+
use std::str;
5354
use tokio::task::JoinSet;
5455

5556
// ========================================================================
@@ -171,6 +172,21 @@ fn baseline_config() -> DatasetGeneratorConfig {
171172
ColumnDescr::new("time32_ms", DataType::Time32(TimeUnit::Millisecond)),
172173
ColumnDescr::new("time64_us", DataType::Time64(TimeUnit::Microsecond)),
173174
ColumnDescr::new("time64_ns", DataType::Time64(TimeUnit::Nanosecond)),
175+
ColumnDescr::new("timestamp_s", DataType::Timestamp(TimeUnit::Second, None)),
176+
ColumnDescr::new(
177+
"timestamp_ms",
178+
DataType::Timestamp(TimeUnit::Millisecond, None),
179+
),
180+
ColumnDescr::new(
181+
"timestamp_us",
182+
DataType::Timestamp(TimeUnit::Microsecond, None),
183+
),
184+
ColumnDescr::new(
185+
"timestamp_ns",
186+
DataType::Timestamp(TimeUnit::Nanosecond, None),
187+
),
188+
ColumnDescr::new("float32", DataType::Float32),
189+
ColumnDescr::new("float64", DataType::Float64),
174190
ColumnDescr::new(
175191
"interval_year_month",
176192
DataType::Interval(IntervalUnit::YearMonth),
@@ -206,10 +222,12 @@ fn baseline_config() -> DatasetGeneratorConfig {
206222
ColumnDescr::new("utf8", DataType::Utf8),
207223
ColumnDescr::new("largeutf8", DataType::LargeUtf8),
208224
ColumnDescr::new("utf8view", DataType::Utf8View),
209-
// todo binary
210225
// low cardinality columns
211226
ColumnDescr::new("u8_low", DataType::UInt8).with_max_num_distinct(10),
212227
ColumnDescr::new("utf8_low", DataType::Utf8).with_max_num_distinct(10),
228+
ColumnDescr::new("binary", DataType::Binary),
229+
ColumnDescr::new("large_binary", DataType::LargeBinary),
230+
ColumnDescr::new("binaryview", DataType::BinaryView),
213231
];
214232

215233
let min_num_rows = 512;

datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs

Lines changed: 117 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,13 @@
1818
use std::sync::Arc;
1919

2020
use arrow::datatypes::{
21-
ByteArrayType, ByteViewType, Date32Type, Date64Type, Decimal128Type, Decimal256Type,
22-
Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type,
23-
IntervalDayTimeType, IntervalMonthDayNanoType, IntervalYearMonthType, LargeUtf8Type,
24-
StringViewType, Time32MillisecondType, Time32SecondType, Time64MicrosecondType,
25-
Time64NanosecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, Utf8Type,
21+
BinaryType, BinaryViewType, ByteArrayType, ByteViewType, Date32Type, Date64Type,
22+
Decimal128Type, Decimal256Type, Float32Type, Float64Type, Int16Type, Int32Type,
23+
Int64Type, Int8Type, IntervalDayTimeType, IntervalMonthDayNanoType,
24+
IntervalYearMonthType, LargeBinaryType, LargeUtf8Type, StringViewType,
25+
Time32MillisecondType, Time32SecondType, Time64MicrosecondType, Time64NanosecondType,
26+
TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType,
27+
TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, Utf8Type,
2628
};
2729
use arrow_array::{ArrayRef, RecordBatch};
2830
use arrow_schema::{DataType, Field, IntervalUnit, Schema, TimeUnit};
@@ -35,7 +37,10 @@ use rand::{
3537
thread_rng, Rng, SeedableRng,
3638
};
3739
use test_utils::{
38-
array_gen::{DecimalArrayGenerator, PrimitiveArrayGenerator, StringArrayGenerator},
40+
array_gen::{
41+
BinaryArrayGenerator, DecimalArrayGenerator, PrimitiveArrayGenerator,
42+
StringArrayGenerator,
43+
},
3944
stagger_batch,
4045
};
4146

@@ -71,17 +76,19 @@ pub struct DatasetGeneratorConfig {
7176
}
7277

7378
impl DatasetGeneratorConfig {
74-
/// return a list of all column names
79+
/// Return a list of all column names
7580
pub fn all_columns(&self) -> Vec<&str> {
7681
self.columns.iter().map(|d| d.name.as_str()).collect()
7782
}
7883

79-
/// return a list of column names that are "numeric"
84+
/// Return a list of column names that are "numeric"
8085
pub fn numeric_columns(&self) -> Vec<&str> {
8186
self.columns
8287
.iter()
8388
.filter_map(|d| {
84-
if d.column_type.is_numeric() {
89+
if d.column_type.is_numeric()
90+
&& !matches!(d.column_type, DataType::Float32 | DataType::Float64)
91+
{
8592
Some(d.name.as_str())
8693
} else {
8794
None
@@ -278,6 +285,37 @@ macro_rules! generate_primitive_array {
278285
}};
279286
}
280287

288+
/// Expands to an expression that builds one randomly populated binary column.
///
/// Arguments (all supplied by the caller's scope):
/// * `$SELF` - value exposing a `candidate_null_pcts` collection to pick from
/// * `$NUM_ROWS` - number of rows the generated array should contain
/// * `$MAX_NUM_DISTINCT` - number of distinct values to sample rows from
/// * `$BATCH_GEN_RNG` - RNG used to pick the null fraction and the maximum
///   element length
/// * `$ARRAY_GEN_RNG` - RNG that is moved into the [`BinaryArrayGenerator`]
/// * `$ARROW_TYPE` - arrow type marker whose `DATA_TYPE` selects the layout
///
/// The macro hits `unreachable!()` if `$ARROW_TYPE::DATA_TYPE` is not one of
/// `Binary`, `LargeBinary` or `BinaryView`.
macro_rules! generate_binary_array {
    (
        $SELF:ident,
        $NUM_ROWS:ident,
        $MAX_NUM_DISTINCT:expr,
        $BATCH_GEN_RNG:ident,
        $ARRAY_GEN_RNG:ident,
        $ARROW_TYPE:ident
    ) => {{
        // Draw order matters for reproducibility: first the null fraction,
        // then the maximum element length.
        let null_pct_idx = $BATCH_GEN_RNG.gen_range(0..$SELF.candidate_null_pcts.len());
        let null_pct = $SELF.candidate_null_pcts[null_pct_idx];

        // Half-open range: elements are at most 99 bytes long.
        let max_len = $BATCH_GEN_RNG.gen_range(1..100);

        let mut generator = BinaryArrayGenerator {
            max_len,
            num_binaries: $NUM_ROWS,
            num_distinct_binaries: $MAX_NUM_DISTINCT,
            null_pct,
            rng: $ARRAY_GEN_RNG,
        };

        // Pick the physical layout that matches the requested arrow type.
        match $ARROW_TYPE::DATA_TYPE {
            DataType::Binary => generator.gen_data::<i32>(),
            DataType::LargeBinary => generator.gen_data::<i64>(),
            DataType::BinaryView => generator.gen_binary_view(),
            _ => unreachable!(),
        }
    }};
}
318+
281319
impl RecordBatchGenerator {
282320
fn new(min_rows_nun: usize, max_rows_num: usize, columns: Vec<ColumnDescr>) -> Self {
283321
let candidate_null_pcts = vec![0.0, 0.01, 0.1, 0.5];
@@ -527,6 +565,76 @@ impl RecordBatchGenerator {
527565
IntervalMonthDayNanoType
528566
)
529567
}
568+
DataType::Timestamp(TimeUnit::Second, None) => {
569+
generate_primitive_array!(
570+
self,
571+
num_rows,
572+
max_num_distinct,
573+
batch_gen_rng,
574+
array_gen_rng,
575+
TimestampSecondType
576+
)
577+
}
578+
DataType::Timestamp(TimeUnit::Millisecond, None) => {
579+
generate_primitive_array!(
580+
self,
581+
num_rows,
582+
max_num_distinct,
583+
batch_gen_rng,
584+
array_gen_rng,
585+
TimestampMillisecondType
586+
)
587+
}
588+
DataType::Timestamp(TimeUnit::Microsecond, None) => {
589+
generate_primitive_array!(
590+
self,
591+
num_rows,
592+
max_num_distinct,
593+
batch_gen_rng,
594+
array_gen_rng,
595+
TimestampMicrosecondType
596+
)
597+
}
598+
DataType::Timestamp(TimeUnit::Nanosecond, None) => {
599+
generate_primitive_array!(
600+
self,
601+
num_rows,
602+
max_num_distinct,
603+
batch_gen_rng,
604+
array_gen_rng,
605+
TimestampNanosecondType
606+
)
607+
}
608+
DataType::Binary => {
609+
generate_binary_array!(
610+
self,
611+
num_rows,
612+
max_num_distinct,
613+
batch_gen_rng,
614+
array_gen_rng,
615+
BinaryType
616+
)
617+
}
618+
DataType::LargeBinary => {
619+
generate_binary_array!(
620+
self,
621+
num_rows,
622+
max_num_distinct,
623+
batch_gen_rng,
624+
array_gen_rng,
625+
LargeBinaryType
626+
)
627+
}
628+
DataType::BinaryView => {
629+
generate_binary_array!(
630+
self,
631+
num_rows,
632+
max_num_distinct,
633+
batch_gen_rng,
634+
array_gen_rng,
635+
BinaryViewType
636+
)
637+
}
530638
DataType::Decimal128(precision, scale) => {
531639
generate_decimal_array!(
532640
self,

test-utils/src/array_gen/binary.rs

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use arrow::array::{
19+
ArrayRef, BinaryViewArray, GenericBinaryArray, OffsetSizeTrait, UInt32Array,
20+
};
21+
use arrow::compute;
22+
use rand::rngs::StdRng;
23+
use rand::Rng;
24+
25+
/// Randomly generate binary arrays
26+
pub struct BinaryArrayGenerator {
27+
/// The maximum length of each binary element
28+
pub max_len: usize,
29+
/// The total number of binaries in the output
30+
pub num_binaries: usize,
31+
/// The number of distinct binary values in the columns
32+
pub num_distinct_binaries: usize,
33+
/// The percentage of nulls in the columns
34+
pub null_pct: f64,
35+
/// Random number generator
36+
pub rng: StdRng,
37+
}
38+
39+
impl BinaryArrayGenerator {
40+
/// Creates a BinaryArray or LargeBinaryArray with random binary data.
41+
pub fn gen_data<O: OffsetSizeTrait>(&mut self) -> ArrayRef {
42+
let distinct_binaries: GenericBinaryArray<O> = (0..self.num_distinct_binaries)
43+
.map(|_| Some(random_binary(&mut self.rng, self.max_len)))
44+
.collect();
45+
46+
// Pick num_binaries randomly from the distinct binary table
47+
let indices: UInt32Array = (0..self.num_binaries)
48+
.map(|_| {
49+
if self.rng.gen::<f64>() < self.null_pct {
50+
None
51+
} else if self.num_distinct_binaries > 1 {
52+
let range = 0..(self.num_distinct_binaries as u32);
53+
Some(self.rng.gen_range(range))
54+
} else {
55+
Some(0)
56+
}
57+
})
58+
.collect();
59+
60+
compute::take(&distinct_binaries, &indices, None).unwrap()
61+
}
62+
63+
/// Creates a BinaryViewArray with random binary data.
64+
pub fn gen_binary_view(&mut self) -> ArrayRef {
65+
let distinct_binary_views: BinaryViewArray = (0..self.num_distinct_binaries)
66+
.map(|_| Some(random_binary(&mut self.rng, self.max_len)))
67+
.collect();
68+
69+
let indices: UInt32Array = (0..self.num_binaries)
70+
.map(|_| {
71+
if self.rng.gen::<f64>() < self.null_pct {
72+
None
73+
} else if self.num_distinct_binaries > 1 {
74+
let range = 0..(self.num_distinct_binaries as u32);
75+
Some(self.rng.gen_range(range))
76+
} else {
77+
Some(0)
78+
}
79+
})
80+
.collect();
81+
82+
compute::take(&distinct_binary_views, &indices, None).unwrap()
83+
}
84+
}
85+
86+
/// Return a binary vector of random bytes of length 1..=max_len
87+
fn random_binary(rng: &mut StdRng, max_len: usize) -> Vec<u8> {
88+
if max_len == 0 {
89+
Vec::new()
90+
} else {
91+
let len = rng.gen_range(1..=max_len);
92+
(0..len).map(|_| rng.gen()).collect()
93+
}
94+
}

test-utils/src/array_gen/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,13 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
mod binary;
1819
mod decimal;
1920
mod primitive;
2021
mod random_data;
2122
mod string;
2223

24+
pub use binary::BinaryArrayGenerator;
2325
pub use decimal::DecimalArrayGenerator;
2426
pub use primitive::PrimitiveArrayGenerator;
2527
pub use string::StringArrayGenerator;

test-utils/src/array_gen/primitive.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,10 +56,13 @@ impl PrimitiveArrayGenerator {
5656
| DataType::Date64
5757
| DataType::Time32(_)
5858
| DataType::Time64(_)
59-
| DataType::Interval(_) => (0..self.num_distinct_primitives)
59+
| DataType::Interval(_)
60+
| DataType::Binary
61+
| DataType::LargeBinary
62+
| DataType::BinaryView
63+
| DataType::Timestamp(_, _) => (0..self.num_distinct_primitives)
6064
.map(|_| Some(A::generate_random_native_data(&mut self.rng)))
6165
.collect(),
62-
6366
_ => {
6467
let arrow_type = A::DATA_TYPE;
6568
panic!("Unsupported arrow data type: {arrow_type}")

test-utils/src/array_gen/random_data.rs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,9 @@ use arrow::datatypes::{
2121
Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, IntervalDayTime,
2222
IntervalDayTimeType, IntervalMonthDayNano, IntervalMonthDayNanoType,
2323
IntervalYearMonthType, Time32MillisecondType, Time32SecondType,
24-
Time64MicrosecondType, Time64NanosecondType, UInt16Type, UInt32Type, UInt64Type,
25-
UInt8Type,
24+
Time64MicrosecondType, Time64NanosecondType, TimestampMicrosecondType,
25+
TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, UInt16Type,
26+
UInt32Type, UInt64Type, UInt8Type,
2627
};
2728
use rand::distributions::Standard;
2829
use rand::prelude::Distribution;
@@ -66,6 +67,10 @@ basic_random_data!(Time64MicrosecondType);
6667
basic_random_data!(Time64NanosecondType);
6768
basic_random_data!(IntervalYearMonthType);
6869
basic_random_data!(Decimal128Type);
70+
basic_random_data!(TimestampSecondType);
71+
basic_random_data!(TimestampMillisecondType);
72+
basic_random_data!(TimestampMicrosecondType);
73+
basic_random_data!(TimestampNanosecondType);
6974

7075
impl RandomNativeData for Date64Type {
7176
fn generate_random_native_data(rng: &mut StdRng) -> Self::Native {

0 commit comments

Comments
 (0)