Skip to content

Commit 65702b7

Browse files
refactor, add tests for list type fields
1 parent 93c82e9 commit 65702b7

File tree

1 file changed

+177
-18
lines changed

1 file changed

+177
-18
lines changed

src/storage/field_stats.rs

Lines changed: 177 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -355,23 +355,22 @@ fn format_arrow_value(array: &dyn Array, idx: usize) -> String {
355355
} else {
356356
"false".to_string()
357357
}),
358-
DataType::List(_field) => {
359-
let list_array = array
360-
.as_any()
361-
.downcast_ref::<arrow_array::ListArray>()
362-
.expect("Expected ListArray");
363-
364-
let child_array = list_array.values();
365-
let offsets = list_array.value_offsets();
366-
let start = offsets[idx] as usize;
367-
let end = offsets[idx + 1] as usize;
368-
369-
let formatted_values: Vec<String> = (start..end)
370-
.map(|i| format_arrow_value(child_array.as_ref(), i))
371-
.collect();
372-
373-
format!("[{}]", formatted_values.join(", "))
374-
}
358+
DataType::List(_field) => try_downcast!(
359+
arrow_array::ListArray,
360+
array,
361+
|list_array: &arrow_array::ListArray| {
362+
let child_array = list_array.values();
363+
let offsets = list_array.value_offsets();
364+
let start = offsets[idx] as usize;
365+
let end = offsets[idx + 1] as usize;
366+
367+
let formatted_values: Vec<String> = (start..end)
368+
.map(|i| format_arrow_value(child_array.as_ref(), i))
369+
.collect();
370+
371+
format!("[{}]", formatted_values.join(", "))
372+
}
373+
),
375374
DataType::Null => "NULL".to_string(),
376375
_ => {
377376
warn!(
@@ -387,8 +386,10 @@ fn format_arrow_value(array: &dyn Array, idx: usize) -> String {
387386
mod tests {
388387
use std::{fs::OpenOptions, sync::Arc};
389388

389+
use arrow::buffer::OffsetBuffer;
390390
use arrow_array::{
391-
BooleanArray, Float64Array, Int64Array, RecordBatch, StringArray, TimestampMillisecondArray,
391+
BooleanArray, Float64Array, Int64Array, ListArray, RecordBatch, StringArray,
392+
TimestampMillisecondArray,
392393
};
393394
use arrow_schema::{DataType, Field, Schema, TimeUnit};
394395
use datafusion::prelude::{ParquetReadOptions, SessionContext};
@@ -467,6 +468,35 @@ mod tests {
467468
Some("constant"),
468469
]);
469470

471+
// Create List<Int64> field
472+
let int_list_data = Int64Array::from(vec![
473+
1, 2, 3, 4, 5, 1, 2, 3, 6, 7, 8, 9, 1, 4, 5, 10, 11, 12, 13, 14, 1, 2, 12, 13, 14, 1, 2,
474+
]);
475+
let int_list_offsets =
476+
OffsetBuffer::new(vec![0, 3, 5, 8, 12, 13, 15, 17, 17, 20, 22].into());
477+
let int_list_field = Arc::new(Field::new("item", DataType::Int64, false));
478+
let int_list_array = ListArray::new(
479+
int_list_field,
480+
int_list_offsets,
481+
Arc::new(int_list_data),
482+
None,
483+
);
484+
485+
// Create List<Float64> field
486+
let float_list_data = Float64Array::from(vec![
487+
1.1, 2.2, 3.3, 4.4, 5.5, 1.1, 2.2, 6.6, 7.7, 8.8, 9.9, 3.3, 4.4, 5.5, 10.0, 11.1, 12.2,
488+
13.3,
489+
]);
490+
let float_list_offsets =
491+
OffsetBuffer::new(vec![0, 2, 5, 7, 8, 11, 14, 15, 15, 17, 18].into());
492+
let float_list_field = Arc::new(Field::new("item", DataType::Float64, false));
493+
let float_list_array = ListArray::new(
494+
float_list_field,
495+
float_list_offsets,
496+
Arc::new(float_list_data),
497+
None,
498+
);
499+
470500
let batch = RecordBatch::try_new(
471501
schema.clone(),
472502
vec![
@@ -476,6 +506,8 @@ mod tests {
476506
Arc::new(active_array),
477507
Arc::new(timestamp_array),
478508
Arc::new(single_value_array),
509+
Arc::new(int_list_array),
510+
Arc::new(float_list_array),
479511
],
480512
)
481513
.unwrap();
@@ -505,6 +537,16 @@ mod tests {
505537
true,
506538
),
507539
Field::new("single_value", DataType::Utf8, true),
540+
Field::new_list(
541+
"int_list",
542+
Arc::new(Field::new("item", DataType::Int64, false)),
543+
true,
544+
),
545+
Field::new_list(
546+
"float_list",
547+
Arc::new(Field::new("item", DataType::Float64, false)),
548+
true,
549+
),
508550
])
509551
}
510552

@@ -910,4 +952,121 @@ mod tests {
910952
assert_eq!(distinct_stat.count, 100);
911953
}
912954
}
955+
956+
#[tokio::test]
957+
async fn test_calculate_single_field_stats_with_int_list_field() {
958+
let (_temp_dir, parquet_path) = create_test_parquet_with_data().await;
959+
960+
let ctx = SessionContext::new();
961+
let table_name = ulid::Ulid::new().to_string();
962+
963+
ctx.register_parquet(
964+
&table_name,
965+
parquet_path.to_str().unwrap(),
966+
ParquetReadOptions::default(),
967+
)
968+
.await
969+
.unwrap();
970+
971+
// Test int_list field (List<Int64>)
972+
let result = calculate_single_field_stats(ctx.clone(), &table_name, "int_list", 50).await;
973+
974+
assert!(result.is_some());
975+
let stats = result.unwrap();
976+
977+
assert_eq!(stats.field_name, "int_list");
978+
assert_eq!(stats.count, 10);
979+
980+
// Verify we have the expected distinct lists
981+
// Expected: [1, 2, 3], [4, 5], [6, 7, 8, 9], [1], [10, 11], [], [12, 13, 14], [1, 2]
982+
assert_eq!(stats.distinct_count, 8);
983+
984+
// Check for duplicate lists - [1, 2, 3] appears twice, [4, 5] appears twice
985+
let list_123_stat = stats
986+
.distinct_stats
987+
.iter()
988+
.find(|s| s.distinct_value == "[1, 2, 3]");
989+
assert!(list_123_stat.is_some());
990+
assert_eq!(list_123_stat.unwrap().count, 2);
991+
992+
let list_45_stat = stats
993+
.distinct_stats
994+
.iter()
995+
.find(|s| s.distinct_value == "[4, 5]");
996+
assert!(list_45_stat.is_some());
997+
assert_eq!(list_45_stat.unwrap().count, 2);
998+
999+
// Check single occurrence lists
1000+
let list_6789_stat = stats
1001+
.distinct_stats
1002+
.iter()
1003+
.find(|s| s.distinct_value == "[6, 7, 8, 9]");
1004+
assert!(list_6789_stat.is_some());
1005+
assert_eq!(list_6789_stat.unwrap().count, 1);
1006+
1007+
let empty_list_stat = stats
1008+
.distinct_stats
1009+
.iter()
1010+
.find(|s| s.distinct_value == "[]");
1011+
assert!(empty_list_stat.is_some());
1012+
assert_eq!(empty_list_stat.unwrap().count, 1);
1013+
}
1014+
1015+
#[tokio::test]
1016+
async fn test_calculate_single_field_stats_with_float_list_field() {
1017+
let (_temp_dir, parquet_path) = create_test_parquet_with_data().await;
1018+
1019+
let ctx = SessionContext::new();
1020+
let table_name = ulid::Ulid::new().to_string();
1021+
1022+
ctx.register_parquet(
1023+
&table_name,
1024+
parquet_path.to_str().unwrap(),
1025+
ParquetReadOptions::default(),
1026+
)
1027+
.await
1028+
.unwrap();
1029+
1030+
// Test float_list field (List<Float64>)
1031+
let result = calculate_single_field_stats(ctx.clone(), &table_name, "float_list", 50).await;
1032+
1033+
assert!(result.is_some());
1034+
let stats = result.unwrap();
1035+
1036+
assert_eq!(stats.field_name, "float_list");
1037+
assert_eq!(stats.count, 10);
1038+
1039+
// Expected distinct lists: [1.1, 2.2], [3.3, 4.4, 5.5], [6.6], [7.7, 8.8, 9.9], [10.0], [], [11.1, 12.2], [13.3]
1040+
assert_eq!(stats.distinct_count, 8);
1041+
1042+
// Check for duplicate lists - [1.1, 2.2] appears twice, [3.3, 4.4, 5.5] appears twice
1043+
let list_11_22_stat = stats
1044+
.distinct_stats
1045+
.iter()
1046+
.find(|s| s.distinct_value == "[1.1, 2.2]");
1047+
assert!(list_11_22_stat.is_some());
1048+
assert_eq!(list_11_22_stat.unwrap().count, 2);
1049+
1050+
let list_33_44_55_stat = stats
1051+
.distinct_stats
1052+
.iter()
1053+
.find(|s| s.distinct_value == "[3.3, 4.4, 5.5]");
1054+
assert!(list_33_44_55_stat.is_some());
1055+
assert_eq!(list_33_44_55_stat.unwrap().count, 2);
1056+
1057+
// Check single occurrence lists
1058+
let list_66_stat = stats
1059+
.distinct_stats
1060+
.iter()
1061+
.find(|s| s.distinct_value == "[6.6]");
1062+
assert!(list_66_stat.is_some());
1063+
assert_eq!(list_66_stat.unwrap().count, 1);
1064+
1065+
let empty_list_stat = stats
1066+
.distinct_stats
1067+
.iter()
1068+
.find(|s| s.distinct_value == "[]");
1069+
assert!(empty_list_stat.is_some());
1070+
assert_eq!(empty_list_stat.unwrap().count, 1);
1071+
}
9131072
}

0 commit comments

Comments
 (0)