@@ -355,23 +355,22 @@ fn format_arrow_value(array: &dyn Array, idx: usize) -> String {
355
355
} else {
356
356
"false" . to_string( )
357
357
} ) ,
358
- DataType :: List ( _field) => {
359
- let list_array = array
360
- . as_any ( )
361
- . downcast_ref :: < arrow_array:: ListArray > ( )
362
- . expect ( "Expected ListArray" ) ;
363
-
364
- let child_array = list_array. values ( ) ;
365
- let offsets = list_array. value_offsets ( ) ;
366
- let start = offsets[ idx] as usize ;
367
- let end = offsets[ idx + 1 ] as usize ;
368
-
369
- let formatted_values: Vec < String > = ( start..end)
370
- . map ( |i| format_arrow_value ( child_array. as_ref ( ) , i) )
371
- . collect ( ) ;
372
-
373
- format ! ( "[{}]" , formatted_values. join( ", " ) )
374
- }
358
+ DataType :: List ( _field) => try_downcast ! (
359
+ arrow_array:: ListArray ,
360
+ array,
361
+ |list_array: & arrow_array:: ListArray | {
362
+ let child_array = list_array. values( ) ;
363
+ let offsets = list_array. value_offsets( ) ;
364
+ let start = offsets[ idx] as usize ;
365
+ let end = offsets[ idx + 1 ] as usize ;
366
+
367
+ let formatted_values: Vec <String > = ( start..end)
368
+ . map( |i| format_arrow_value( child_array. as_ref( ) , i) )
369
+ . collect( ) ;
370
+
371
+ format!( "[{}]" , formatted_values. join( ", " ) )
372
+ }
373
+ ) ,
375
374
DataType :: Null => "NULL" . to_string ( ) ,
376
375
_ => {
377
376
warn ! (
@@ -387,8 +386,10 @@ fn format_arrow_value(array: &dyn Array, idx: usize) -> String {
387
386
mod tests {
388
387
use std:: { fs:: OpenOptions , sync:: Arc } ;
389
388
389
+ use arrow:: buffer:: OffsetBuffer ;
390
390
use arrow_array:: {
391
- BooleanArray , Float64Array , Int64Array , RecordBatch , StringArray , TimestampMillisecondArray ,
391
+ BooleanArray , Float64Array , Int64Array , ListArray , RecordBatch , StringArray ,
392
+ TimestampMillisecondArray ,
392
393
} ;
393
394
use arrow_schema:: { DataType , Field , Schema , TimeUnit } ;
394
395
use datafusion:: prelude:: { ParquetReadOptions , SessionContext } ;
@@ -467,6 +468,35 @@ mod tests {
467
468
Some ( "constant" ) ,
468
469
] ) ;
469
470
471
+ // Create List<Int64> field
472
+ let int_list_data = Int64Array :: from ( vec ! [
473
+ 1 , 2 , 3 , 4 , 5 , 1 , 2 , 3 , 6 , 7 , 8 , 9 , 1 , 4 , 5 , 10 , 11 , 12 , 13 , 14 , 1 , 2 , 12 , 13 , 14 , 1 , 2 ,
474
+ ] ) ;
475
+ let int_list_offsets =
476
+ OffsetBuffer :: new ( vec ! [ 0 , 3 , 5 , 8 , 12 , 13 , 15 , 17 , 17 , 20 , 22 ] . into ( ) ) ;
477
+ let int_list_field = Arc :: new ( Field :: new ( "item" , DataType :: Int64 , false ) ) ;
478
+ let int_list_array = ListArray :: new (
479
+ int_list_field,
480
+ int_list_offsets,
481
+ Arc :: new ( int_list_data) ,
482
+ None ,
483
+ ) ;
484
+
485
+ // Create List<Float64> field
486
+ let float_list_data = Float64Array :: from ( vec ! [
487
+ 1.1 , 2.2 , 3.3 , 4.4 , 5.5 , 1.1 , 2.2 , 6.6 , 7.7 , 8.8 , 9.9 , 3.3 , 4.4 , 5.5 , 10.0 , 11.1 , 12.2 ,
488
+ 13.3 ,
489
+ ] ) ;
490
+ let float_list_offsets =
491
+ OffsetBuffer :: new ( vec ! [ 0 , 2 , 5 , 7 , 8 , 11 , 14 , 15 , 15 , 17 , 18 ] . into ( ) ) ;
492
+ let float_list_field = Arc :: new ( Field :: new ( "item" , DataType :: Float64 , false ) ) ;
493
+ let float_list_array = ListArray :: new (
494
+ float_list_field,
495
+ float_list_offsets,
496
+ Arc :: new ( float_list_data) ,
497
+ None ,
498
+ ) ;
499
+
470
500
let batch = RecordBatch :: try_new (
471
501
schema. clone ( ) ,
472
502
vec ! [
@@ -476,6 +506,8 @@ mod tests {
476
506
Arc :: new( active_array) ,
477
507
Arc :: new( timestamp_array) ,
478
508
Arc :: new( single_value_array) ,
509
+ Arc :: new( int_list_array) ,
510
+ Arc :: new( float_list_array) ,
479
511
] ,
480
512
)
481
513
. unwrap ( ) ;
@@ -505,6 +537,16 @@ mod tests {
505
537
true ,
506
538
) ,
507
539
Field :: new( "single_value" , DataType :: Utf8 , true ) ,
540
+ Field :: new_list(
541
+ "int_list" ,
542
+ Arc :: new( Field :: new( "item" , DataType :: Int64 , false ) ) ,
543
+ true ,
544
+ ) ,
545
+ Field :: new_list(
546
+ "float_list" ,
547
+ Arc :: new( Field :: new( "item" , DataType :: Float64 , false ) ) ,
548
+ true ,
549
+ ) ,
508
550
] )
509
551
}
510
552
@@ -910,4 +952,121 @@ mod tests {
910
952
assert_eq ! ( distinct_stat. count, 100 ) ;
911
953
}
912
954
}
955
+
956
+ #[ tokio:: test]
957
+ async fn test_calculate_single_field_stats_with_int_list_field ( ) {
958
+ let ( _temp_dir, parquet_path) = create_test_parquet_with_data ( ) . await ;
959
+
960
+ let ctx = SessionContext :: new ( ) ;
961
+ let table_name = ulid:: Ulid :: new ( ) . to_string ( ) ;
962
+
963
+ ctx. register_parquet (
964
+ & table_name,
965
+ parquet_path. to_str ( ) . unwrap ( ) ,
966
+ ParquetReadOptions :: default ( ) ,
967
+ )
968
+ . await
969
+ . unwrap ( ) ;
970
+
971
+ // Test int_list field (List<Int64>)
972
+ let result = calculate_single_field_stats ( ctx. clone ( ) , & table_name, "int_list" , 50 ) . await ;
973
+
974
+ assert ! ( result. is_some( ) ) ;
975
+ let stats = result. unwrap ( ) ;
976
+
977
+ assert_eq ! ( stats. field_name, "int_list" ) ;
978
+ assert_eq ! ( stats. count, 10 ) ;
979
+
980
+ // Verify we have the expected distinct lists
981
+ // Expected: [1, 2, 3], [4, 5], [6, 7, 8, 9], [1], [10, 11], [], [12, 13, 14], [1, 2]
982
+ assert_eq ! ( stats. distinct_count, 8 ) ;
983
+
984
+ // Check for duplicate lists - [1, 2, 3] appears twice, [4, 5] appears twice
985
+ let list_123_stat = stats
986
+ . distinct_stats
987
+ . iter ( )
988
+ . find ( |s| s. distinct_value == "[1, 2, 3]" ) ;
989
+ assert ! ( list_123_stat. is_some( ) ) ;
990
+ assert_eq ! ( list_123_stat. unwrap( ) . count, 2 ) ;
991
+
992
+ let list_45_stat = stats
993
+ . distinct_stats
994
+ . iter ( )
995
+ . find ( |s| s. distinct_value == "[4, 5]" ) ;
996
+ assert ! ( list_45_stat. is_some( ) ) ;
997
+ assert_eq ! ( list_45_stat. unwrap( ) . count, 2 ) ;
998
+
999
+ // Check single occurrence lists
1000
+ let list_6789_stat = stats
1001
+ . distinct_stats
1002
+ . iter ( )
1003
+ . find ( |s| s. distinct_value == "[6, 7, 8, 9]" ) ;
1004
+ assert ! ( list_6789_stat. is_some( ) ) ;
1005
+ assert_eq ! ( list_6789_stat. unwrap( ) . count, 1 ) ;
1006
+
1007
+ let empty_list_stat = stats
1008
+ . distinct_stats
1009
+ . iter ( )
1010
+ . find ( |s| s. distinct_value == "[]" ) ;
1011
+ assert ! ( empty_list_stat. is_some( ) ) ;
1012
+ assert_eq ! ( empty_list_stat. unwrap( ) . count, 1 ) ;
1013
+ }
1014
+
1015
+ #[ tokio:: test]
1016
+ async fn test_calculate_single_field_stats_with_float_list_field ( ) {
1017
+ let ( _temp_dir, parquet_path) = create_test_parquet_with_data ( ) . await ;
1018
+
1019
+ let ctx = SessionContext :: new ( ) ;
1020
+ let table_name = ulid:: Ulid :: new ( ) . to_string ( ) ;
1021
+
1022
+ ctx. register_parquet (
1023
+ & table_name,
1024
+ parquet_path. to_str ( ) . unwrap ( ) ,
1025
+ ParquetReadOptions :: default ( ) ,
1026
+ )
1027
+ . await
1028
+ . unwrap ( ) ;
1029
+
1030
+ // Test float_list field (List<Float64>)
1031
+ let result = calculate_single_field_stats ( ctx. clone ( ) , & table_name, "float_list" , 50 ) . await ;
1032
+
1033
+ assert ! ( result. is_some( ) ) ;
1034
+ let stats = result. unwrap ( ) ;
1035
+
1036
+ assert_eq ! ( stats. field_name, "float_list" ) ;
1037
+ assert_eq ! ( stats. count, 10 ) ;
1038
+
1039
+ // Expected distinct lists: [1.1, 2.2], [3.3, 4.4, 5.5], [6.6], [7.7, 8.8, 9.9], [10.0], [], [11.1, 12.2], [13.3]
1040
+ assert_eq ! ( stats. distinct_count, 8 ) ;
1041
+
1042
+ // Check for duplicate lists - [1.1, 2.2] appears twice, [3.3, 4.4, 5.5] appears twice
1043
+ let list_11_22_stat = stats
1044
+ . distinct_stats
1045
+ . iter ( )
1046
+ . find ( |s| s. distinct_value == "[1.1, 2.2]" ) ;
1047
+ assert ! ( list_11_22_stat. is_some( ) ) ;
1048
+ assert_eq ! ( list_11_22_stat. unwrap( ) . count, 2 ) ;
1049
+
1050
+ let list_33_44_55_stat = stats
1051
+ . distinct_stats
1052
+ . iter ( )
1053
+ . find ( |s| s. distinct_value == "[3.3, 4.4, 5.5]" ) ;
1054
+ assert ! ( list_33_44_55_stat. is_some( ) ) ;
1055
+ assert_eq ! ( list_33_44_55_stat. unwrap( ) . count, 2 ) ;
1056
+
1057
+ // Check single occurrence lists
1058
+ let list_66_stat = stats
1059
+ . distinct_stats
1060
+ . iter ( )
1061
+ . find ( |s| s. distinct_value == "[6.6]" ) ;
1062
+ assert ! ( list_66_stat. is_some( ) ) ;
1063
+ assert_eq ! ( list_66_stat. unwrap( ) . count, 1 ) ;
1064
+
1065
+ let empty_list_stat = stats
1066
+ . distinct_stats
1067
+ . iter ( )
1068
+ . find ( |s| s. distinct_value == "[]" ) ;
1069
+ assert ! ( empty_list_stat. is_some( ) ) ;
1070
+ assert_eq ! ( empty_list_stat. unwrap( ) . count, 1 ) ;
1071
+ }
913
1072
}
0 commit comments