1818use std:: sync:: Arc ;
1919
2020use arrow:: datatypes:: {
21- ByteArrayType , ByteViewType , Date32Type , Date64Type , Decimal128Type , Decimal256Type ,
22- Float32Type , Float64Type , Int16Type , Int32Type , Int64Type , Int8Type ,
23- IntervalDayTimeType , IntervalMonthDayNanoType , IntervalYearMonthType , LargeUtf8Type ,
24- StringViewType , Time32MillisecondType , Time32SecondType , Time64MicrosecondType ,
25- Time64NanosecondType , UInt16Type , UInt32Type , UInt64Type , UInt8Type , Utf8Type ,
21+ BinaryType , BinaryViewType , ByteArrayType , ByteViewType , Date32Type , Date64Type ,
22+ Decimal128Type , Decimal256Type , Float32Type , Float64Type , Int16Type , Int32Type ,
23+ Int64Type , Int8Type , IntervalDayTimeType , IntervalMonthDayNanoType ,
24+ IntervalYearMonthType , LargeBinaryType , LargeUtf8Type , StringViewType ,
25+ Time32MillisecondType , Time32SecondType , Time64MicrosecondType , Time64NanosecondType ,
26+ TimestampMicrosecondType , TimestampMillisecondType , TimestampNanosecondType ,
27+ TimestampSecondType , UInt16Type , UInt32Type , UInt64Type , UInt8Type , Utf8Type ,
2628} ;
2729use arrow_array:: { ArrayRef , RecordBatch } ;
2830use arrow_schema:: { DataType , Field , IntervalUnit , Schema , TimeUnit } ;
@@ -35,7 +37,10 @@ use rand::{
3537 thread_rng, Rng , SeedableRng ,
3638} ;
3739use test_utils:: {
38- array_gen:: { DecimalArrayGenerator , PrimitiveArrayGenerator , StringArrayGenerator } ,
40+ array_gen:: {
41+ BinaryArrayGenerator , DecimalArrayGenerator , PrimitiveArrayGenerator ,
42+ StringArrayGenerator ,
43+ } ,
3944 stagger_batch,
4045} ;
4146
@@ -71,17 +76,19 @@ pub struct DatasetGeneratorConfig {
7176}
7277
7378impl DatasetGeneratorConfig {
74- /// return a list of all column names
79+ /// Return a list of all column names
7580 pub fn all_columns ( & self ) -> Vec < & str > {
7681 self . columns . iter ( ) . map ( |d| d. name . as_str ( ) ) . collect ( )
7782 }
7883
/// Return a list of column names that are "numeric", excluding floating-point (`Float32` / `Float64`) columns
8085 pub fn numeric_columns ( & self ) -> Vec < & str > {
8186 self . columns
8287 . iter ( )
8388 . filter_map ( |d| {
84- if d. column_type . is_numeric ( ) {
89+ if d. column_type . is_numeric ( )
90+ && !matches ! ( d. column_type, DataType :: Float32 | DataType :: Float64 )
91+ {
8592 Some ( d. name . as_str ( ) )
8693 } else {
8794 None
@@ -278,6 +285,37 @@ macro_rules! generate_primitive_array {
278285 } } ;
279286}
280287
/// Builds one randomly populated binary column.
///
/// Arguments, in order:
/// - `$SELF`: a value with a `candidate_null_pcts` field; one of its entries
///   is picked at random and used as the null percentage.
/// - `$NUM_ROWS`: forwarded as `num_binaries`.
/// - `$MAX_NUM_DISTINCT`: forwarded as `num_distinct_binaries`.
/// - `$BATCH_GEN_RNG`: RNG used for the batch-level choices made here
///   (the null-percentage index and `max_len`).
/// - `$ARRAY_GEN_RNG`: RNG moved into the `BinaryArrayGenerator`.
/// - `$ARROW_TYPE`: a type whose `DATA_TYPE` constant selects the generator
///   call: `Binary` -> `gen_data::<i32>()`, `LargeBinary` ->
///   `gen_data::<i64>()`, `BinaryView` -> `gen_binary_view()`.
///
/// Evaluates to whatever the selected generator method returns.
///
/// # Panics
///
/// Panics if `$ARROW_TYPE::DATA_TYPE` is not `Binary`, `LargeBinary` or
/// `BinaryView`.
macro_rules! generate_binary_array {
    (
        $SELF:ident,
        $NUM_ROWS:ident,
        $MAX_NUM_DISTINCT:expr,
        $BATCH_GEN_RNG:ident,
        $ARRAY_GEN_RNG:ident,
        $ARROW_TYPE:ident
    ) => {{
        // The null-percentage index is drawn before `max_len`. Keep this
        // order: swapping the two draws changes what a given RNG state
        // produces.
        let null_pct_idx = $BATCH_GEN_RNG.gen_range(0..$SELF.candidate_null_pcts.len());
        let null_pct = $SELF.candidate_null_pcts[null_pct_idx];

        // Upper bound is exclusive, so `max_len` is in 1..=99.
        let max_len = $BATCH_GEN_RNG.gen_range(1..100);

        let mut generator = BinaryArrayGenerator {
            max_len,
            num_binaries: $NUM_ROWS,
            num_distinct_binaries: $MAX_NUM_DISTINCT,
            null_pct,
            rng: $ARRAY_GEN_RNG,
        };

        match $ARROW_TYPE::DATA_TYPE {
            DataType::Binary => generator.gen_data::<i32>(),
            DataType::LargeBinary => generator.gen_data::<i64>(),
            DataType::BinaryView => generator.gen_binary_view(),
            _ => unreachable!(
                "generate_binary_array! only supports Binary, LargeBinary and BinaryView"
            ),
        }
    }};
}
318+
281319impl RecordBatchGenerator {
282320 fn new ( min_rows_nun : usize , max_rows_num : usize , columns : Vec < ColumnDescr > ) -> Self {
283321 let candidate_null_pcts = vec ! [ 0.0 , 0.01 , 0.1 , 0.5 ] ;
@@ -527,6 +565,76 @@ impl RecordBatchGenerator {
527565 IntervalMonthDayNanoType
528566 )
529567 }
568+ DataType :: Timestamp ( TimeUnit :: Second , None ) => {
569+ generate_primitive_array ! (
570+ self ,
571+ num_rows,
572+ max_num_distinct,
573+ batch_gen_rng,
574+ array_gen_rng,
575+ TimestampSecondType
576+ )
577+ }
578+ DataType :: Timestamp ( TimeUnit :: Millisecond , None ) => {
579+ generate_primitive_array ! (
580+ self ,
581+ num_rows,
582+ max_num_distinct,
583+ batch_gen_rng,
584+ array_gen_rng,
585+ TimestampMillisecondType
586+ )
587+ }
588+ DataType :: Timestamp ( TimeUnit :: Microsecond , None ) => {
589+ generate_primitive_array ! (
590+ self ,
591+ num_rows,
592+ max_num_distinct,
593+ batch_gen_rng,
594+ array_gen_rng,
595+ TimestampMicrosecondType
596+ )
597+ }
598+ DataType :: Timestamp ( TimeUnit :: Nanosecond , None ) => {
599+ generate_primitive_array ! (
600+ self ,
601+ num_rows,
602+ max_num_distinct,
603+ batch_gen_rng,
604+ array_gen_rng,
605+ TimestampNanosecondType
606+ )
607+ }
608+ DataType :: Binary => {
609+ generate_binary_array ! (
610+ self ,
611+ num_rows,
612+ max_num_distinct,
613+ batch_gen_rng,
614+ array_gen_rng,
615+ BinaryType
616+ )
617+ }
618+ DataType :: LargeBinary => {
619+ generate_binary_array ! (
620+ self ,
621+ num_rows,
622+ max_num_distinct,
623+ batch_gen_rng,
624+ array_gen_rng,
625+ LargeBinaryType
626+ )
627+ }
628+ DataType :: BinaryView => {
629+ generate_binary_array ! (
630+ self ,
631+ num_rows,
632+ max_num_distinct,
633+ batch_gen_rng,
634+ array_gen_rng,
635+ BinaryViewType
636+ )
637+ }
530638 DataType :: Decimal128 ( precision, scale) => {
531639 generate_decimal_array ! (
532640 self ,
0 commit comments