From 65a2b30ea8a77ad7d2f228e29a5520f6cd9f3fe4 Mon Sep 17 00:00:00 2001 From: Liam Bao Date: Wed, 27 Aug 2025 22:27:50 -0400 Subject: [PATCH 1/2] [Variant]: Implement `DataType::ListView/LargeListView` support for `cast_to_variant` kernel --- .../src/arrow_to_variant.rs | 90 +++++++++--- .../src/cast_to_variant.rs | 136 +++++++++++++++++- 2 files changed, 200 insertions(+), 26 deletions(-) diff --git a/parquet-variant-compute/src/arrow_to_variant.rs b/parquet-variant-compute/src/arrow_to_variant.rs index 26713ce8ee19..6210fad8e461 100644 --- a/parquet-variant-compute/src/arrow_to_variant.rs +++ b/parquet-variant-compute/src/arrow_to_variant.rs @@ -15,11 +15,10 @@ // specific language governing permissions and limitations // under the License. -use std::collections::HashMap; - use crate::type_conversion::{decimal_to_variant_decimal, CastOptions}; use arrow::array::{ - Array, AsArray, GenericBinaryArray, GenericStringArray, OffsetSizeTrait, PrimitiveArray, + Array, AsArray, GenericBinaryArray, GenericListArray, GenericListViewArray, GenericStringArray, + OffsetSizeTrait, PrimitiveArray, }; use arrow::compute::kernels::cast; use arrow::datatypes::{ @@ -36,6 +35,8 @@ use parquet_variant::{ ObjectFieldBuilder, Variant, VariantBuilderExt, VariantDecimal16, VariantDecimal4, VariantDecimal8, }; +use std::collections::HashMap; +use std::ops::Range; // ============================================================================ // Row-oriented builders for efficient Arrow-to-Variant conversion @@ -77,8 +78,10 @@ pub(crate) enum ArrowToVariantRowBuilder<'a> { Utf8(StringArrowToVariantBuilder<'a, i32>), LargeUtf8(StringArrowToVariantBuilder<'a, i64>), Utf8View(StringViewArrowToVariantBuilder<'a>), - List(ListArrowToVariantBuilder<'a, i32>), - LargeList(ListArrowToVariantBuilder<'a, i64>), + List(ListArrowToVariantBuilder<'a, GenericListArray>), + LargeList(ListArrowToVariantBuilder<'a, GenericListArray>), + ListView(ListArrowToVariantBuilder<'a, GenericListViewArray>), + LargeListView(ListArrowToVariantBuilder<'a, GenericListViewArray>), Struct(StructArrowToVariantBuilder<'a>), Map(MapArrowToVariantBuilder<'a>), Union(UnionArrowToVariantBuilder<'a>), @@ -133,6 +136,8 @@ impl<'a> ArrowToVariantRowBuilder<'a> { Utf8View(b) => b.append_row(builder, index), List(b) => b.append_row(builder, index), LargeList(b) => b.append_row(builder, index), + ListView(b) => b.append_row(builder, index), + LargeListView(b) => b.append_row(builder, index), Struct(b) => b.append_row(builder, index), Map(b) => b.append_row(builder, index), Union(b) => b.append_row(builder, index), @@ -238,8 +243,18 @@ pub(crate) fn make_arrow_to_variant_row_builder<'a>( DataType::Utf8 => Utf8(StringArrowToVariantBuilder::new(array)), DataType::LargeUtf8 => LargeUtf8(StringArrowToVariantBuilder::new(array)), DataType::Utf8View => Utf8View(StringViewArrowToVariantBuilder::new(array)), - DataType::List(_) => List(ListArrowToVariantBuilder::new(array, options)?), - DataType::LargeList(_) => LargeList(ListArrowToVariantBuilder::new(array, options)?), + DataType::List(_) => List(ListArrowToVariantBuilder::new(array.as_list(), options)?), + DataType::LargeList(_) => { + LargeList(ListArrowToVariantBuilder::new(array.as_list(), options)?) + } + DataType::ListView(_) => ListView(ListArrowToVariantBuilder::new( + array.as_list_view(), + options, + )?), + DataType::LargeListView(_) => LargeListView(ListArrowToVariantBuilder::new( + array.as_list_view(), + options, + )?), DataType::Struct(_) => Struct(StructArrowToVariantBuilder::new( array.as_struct(), options, @@ -425,7 +440,7 @@ define_row_builder!( options: &'a CastOptions, has_time_zone: bool, }, - |array| -> arrow::array::PrimitiveArray { array.as_primitive() }, + |array| -> PrimitiveArray { array.as_primitive() }, |value| -> Option<_> { // Convert using Arrow's temporal conversion functions as_datetime::(value).map(|naive_datetime| { @@ -508,21 +523,20 @@ impl NullArrowToVariantBuilder { } } -/// Generic list builder for List and LargeList types -pub(crate) struct ListArrowToVariantBuilder<'a, O: OffsetSizeTrait> { - list_array: &'a arrow::array::GenericListArray, +/// Generic list builder for List, LargeList, ListView, and LargeListView types +pub(crate) struct ListArrowToVariantBuilder<'a, L: ListLikeArray> { + list_array: &'a L, values_builder: Box>, } -impl<'a, O: OffsetSizeTrait> ListArrowToVariantBuilder<'a, O> { - pub(crate) fn new(array: &'a dyn Array, options: &'a CastOptions) -> Result { - let list_array = array.as_list(); - let values = list_array.values(); +impl<'a, L: ListLikeArray> ListArrowToVariantBuilder<'a, L> { + pub(crate) fn new(array: &'a L, options: &'a CastOptions) -> Result { + let values = array.values(); let values_builder = - make_arrow_to_variant_row_builder(values.data_type(), values.as_ref(), options)?; + make_arrow_to_variant_row_builder(values.data_type(), values, options)?; Ok(Self { - list_array, + list_array: array, values_builder: Box::new(values_builder), }) } @@ -537,12 +551,10 @@ impl<'a, O: OffsetSizeTrait> ListArrowToVariantBuilder<'a, O> { return Ok(()); } - let offsets = self.list_array.offsets(); - let start = offsets[index].as_usize(); - let end = offsets[index + 1].as_usize(); + let range = self.list_array.element_range(index); let mut list_builder = builder.try_new_list()?; - for value_index in start..end { + for value_index in range { self.values_builder .append_row(&mut list_builder, value_index)?; } @@ -551,6 +563,42 @@ impl<'a, O: OffsetSizeTrait> ListArrowToVariantBuilder<'a, O> { } } +/// Trait for list-like arrays that can provide element ranges +pub(crate) trait ListLikeArray: Array { + /// Get the values array + fn values(&self) -> &dyn Array; + + /// Get the start and end indices for a list element + fn element_range(&self, index: usize) -> Range; +} + +impl ListLikeArray for GenericListArray { + fn values(&self) -> &dyn Array { + self.values() + } + + fn element_range(&self, index: usize) -> Range { + let offsets = self.offsets(); + let start = offsets[index].as_usize(); + let end = offsets[index + 1].as_usize(); + start..end + } +} + +impl ListLikeArray for GenericListViewArray { + fn values(&self) -> &dyn Array { + self.values() + } + + fn element_range(&self, index: usize) -> Range { + let offsets = self.value_offsets(); + let sizes = self.value_sizes(); + let offset = offsets[index].as_usize(); + let size = sizes[index].as_usize(); + offset..(offset + size) + } +} + /// Struct builder for StructArray pub(crate) struct StructArrowToVariantBuilder<'a> { struct_array: &'a arrow::array::StructArray, diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 295019645f62..8e302ac77599 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -94,11 +94,11 @@ mod tests { FixedSizeBinaryBuilder, Float16Array, Float32Array, Float64Array, GenericByteBuilder, GenericByteViewBuilder, Int16Array, Int32Array, Int64Array, Int8Array, IntervalDayTimeArray, IntervalMonthDayNanoArray, IntervalYearMonthArray, LargeListArray, - LargeStringArray, ListArray, MapArray, NullArray, StringArray, StringRunBuilder, - StringViewArray, StructArray, Time32MillisecondArray, Time32SecondArray, - Time64MicrosecondArray, Time64NanosecondArray, TimestampMicrosecondArray, - TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt16Array, - UInt32Array, UInt64Array, UInt8Array, UnionArray, + LargeListViewBuilder, LargeStringArray, ListArray, ListViewBuilder, MapArray, NullArray, + StringArray, StringRunBuilder, StringViewArray, StructArray, Time32MillisecondArray, + Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, + TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, + TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, UnionArray, }; use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; use arrow::datatypes::{ @@ -1258,6 +1258,132 @@ mod tests { ); } + #[test] + fn test_cast_to_variant_list_view() { + // Create a ListViewArray with some data + let mut builder = ListViewBuilder::new(Int32Array::builder(0)); + builder.append_value(&Int32Array::from(vec![Some(0), Some(1), Some(2)])); + builder.append_value(&Int32Array::from(vec![Some(3), Some(4)])); + builder.append_null(); + let list_view_array = builder.finish(); + + // Expected values + let (metadata1, value1) = { + let mut builder = VariantBuilder::new(); + let mut list = builder.new_list(); + list.append_value(0i32); + list.append_value(1i32); + list.append_value(2i32); + list.finish(); + builder.finish() + }; + let variant1 = Variant::new(&metadata1, &value1); + + let (metadata2, value2) = { + let mut builder = VariantBuilder::new(); + let mut list = builder.new_list(); + list.append_value(3i32); + list.append_value(4i32); + list.finish(); + builder.finish() + }; + let variant2 = Variant::new(&metadata2, &value2); + + run_test( + Arc::new(list_view_array), + vec![Some(variant1), Some(variant2), None], + ); + } + + #[test] + fn test_cast_to_variant_sliced_list_view() { + // Create a ListViewArray with some data + let mut builder = ListViewBuilder::new(Int32Array::builder(0)); + builder.append_value(&Int32Array::from(vec![Some(0), Some(1), Some(2)])); + builder.append_value(&Int32Array::from(vec![Some(3), Some(4)])); + builder.append_null(); + let list_view_array = builder.finish(); + + // Expected value for slice(1, 2) - should get the second and third elements + let (metadata, value) = { + let mut builder = VariantBuilder::new(); + let mut list = builder.new_list(); + list.append_value(3i32); + list.append_value(4i32); + list.finish(); + builder.finish() + }; + let variant = Variant::new(&metadata, &value); + + run_test( + Arc::new(list_view_array.slice(1, 2)), + vec![Some(variant), None], + ); + } + + #[test] + fn test_cast_to_variant_large_list_view() { + // Create a LargeListViewArray with some data + let mut builder = LargeListViewBuilder::new(Int64Array::builder(0)); + builder.append_value(&Int64Array::from(vec![Some(0), Some(1), Some(2)])); + builder.append_value(&Int64Array::from(vec![Some(3), Some(4)])); + builder.append_null(); + let large_list_view_array = builder.finish(); + + // Expected values + let (metadata1, value1) = { + let mut builder = VariantBuilder::new(); + let mut list = builder.new_list(); + list.append_value(0i64); + list.append_value(1i64); + list.append_value(2i64); + list.finish(); + builder.finish() + }; + let variant1 = Variant::new(&metadata1, &value1); + + let (metadata2, value2) = { + let mut builder = VariantBuilder::new(); + let mut list = builder.new_list(); + list.append_value(3i64); + list.append_value(4i64); + list.finish(); + builder.finish() + }; + let variant2 = Variant::new(&metadata2, &value2); + + run_test( + Arc::new(large_list_view_array), + vec![Some(variant1), Some(variant2), None], + ); + } + + #[test] + fn test_cast_to_variant_sliced_large_list_view() { + // Create a LargeListViewArray with some data + let mut builder = LargeListViewBuilder::new(Int64Array::builder(0)); + builder.append_value(&Int64Array::from(vec![Some(0), Some(1), Some(2)])); + builder.append_value(&Int64Array::from(vec![Some(3), Some(4)])); + builder.append_null(); + let large_list_view_array = builder.finish(); + + // Expected value for slice(1, 2) - should get the second and third elements + let (metadata, value) = { + let mut builder = VariantBuilder::new(); + let mut list = builder.new_list(); + list.append_value(3i64); + list.append_value(4i64); + list.finish(); + builder.finish() + }; + let variant = Variant::new(&metadata, &value); + + run_test( + Arc::new(large_list_view_array.slice(1, 2)), + vec![Some(variant), None], + ); + } + #[test] fn test_cast_to_variant_struct() { // Test a simple struct with two fields: id (int64) and age (int32) From 4979677173adb38ff36520bc8c64d0fd5c15ca25 Mon Sep 17 00:00:00 2001 From: Liam Bao Date: Tue, 23 Sep 2025 19:28:49 -0400 Subject: [PATCH 2/2] Add test cases where the array element is NULL --- .../src/cast_to_variant.rs | 61 +++++++++++++------ 1 file changed, 42 insertions(+), 19 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 8e302ac77599..7db5d2d3cda6 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -112,7 +112,8 @@ mod tests { use chrono::{DateTime, NaiveDate, NaiveTime}; use half::f16; use parquet_variant::{ - Variant, VariantBuilder, VariantDecimal16, VariantDecimal4, VariantDecimal8, + Variant, VariantBuilder, VariantBuilderExt, VariantDecimal16, VariantDecimal4, + VariantDecimal8, }; use std::{sync::Arc, vec}; @@ -1262,24 +1263,25 @@ mod tests { fn test_cast_to_variant_list_view() { // Create a ListViewArray with some data let mut builder = ListViewBuilder::new(Int32Array::builder(0)); - builder.append_value(&Int32Array::from(vec![Some(0), Some(1), Some(2)])); + builder.append_value(&Int32Array::from(vec![Some(0), None, Some(2)])); builder.append_value(&Int32Array::from(vec![Some(3), Some(4)])); builder.append_null(); + builder.append_value(&Int32Array::from(vec![None, None])); let list_view_array = builder.finish(); // Expected values - let (metadata1, value1) = { + let (metadata, value) = { let mut builder = VariantBuilder::new(); let mut list = builder.new_list(); list.append_value(0i32); - list.append_value(1i32); + list.append_null(); list.append_value(2i32); list.finish(); builder.finish() }; - let variant1 = Variant::new(&metadata1, &value1); + let variant0 = Variant::new(&metadata, &value); - let (metadata2, value2) = { + let (metadata, value) = { let mut builder = VariantBuilder::new(); let mut list = builder.new_list(); list.append_value(3i32); @@ -1287,11 +1289,21 @@ mod tests { list.finish(); builder.finish() }; - let variant2 = Variant::new(&metadata2, &value2); + let variant1 = Variant::new(&metadata, &value); + + let (metadata, value) = { + let mut builder = VariantBuilder::new(); + let mut list = builder.new_list(); + list.append_null(); + list.append_null(); + list.finish(); + builder.finish() + }; + let variant3 = Variant::new(&metadata, &value); run_test( Arc::new(list_view_array), - vec![Some(variant1), Some(variant2), None], + vec![Some(variant0), Some(variant1), None, Some(variant3)], ); } @@ -1300,7 +1312,7 @@ mod tests { // Create a ListViewArray with some data let mut builder = ListViewBuilder::new(Int32Array::builder(0)); builder.append_value(&Int32Array::from(vec![Some(0), Some(1), Some(2)])); - builder.append_value(&Int32Array::from(vec![Some(3), Some(4)])); + builder.append_value(&Int32Array::from(vec![Some(3), None])); builder.append_null(); let list_view_array = builder.finish(); @@ -1309,7 +1321,7 @@ mod tests { let mut builder = VariantBuilder::new(); let mut list = builder.new_list(); list.append_value(3i32); - list.append_value(4i32); + list.append_null(); list.finish(); builder.finish() }; @@ -1325,24 +1337,25 @@ mod tests { fn test_cast_to_variant_large_list_view() { // Create a LargeListViewArray with some data let mut builder = LargeListViewBuilder::new(Int64Array::builder(0)); - builder.append_value(&Int64Array::from(vec![Some(0), Some(1), Some(2)])); + builder.append_value(&Int64Array::from(vec![Some(0), None, Some(2)])); builder.append_value(&Int64Array::from(vec![Some(3), Some(4)])); builder.append_null(); + builder.append_value(&Int64Array::from(vec![None, None])); let large_list_view_array = builder.finish(); // Expected values - let (metadata1, value1) = { + let (metadata, value) = { let mut builder = VariantBuilder::new(); let mut list = builder.new_list(); list.append_value(0i64); - list.append_value(1i64); + list.append_null(); list.append_value(2i64); list.finish(); builder.finish() }; - let variant1 = Variant::new(&metadata1, &value1); + let variant0 = Variant::new(&metadata, &value); - let (metadata2, value2) = { + let (metadata, value) = { let mut builder = VariantBuilder::new(); let mut list = builder.new_list(); list.append_value(3i64); @@ -1350,11 +1363,21 @@ mod tests { list.finish(); builder.finish() }; - let variant2 = Variant::new(&metadata2, &value2); + let variant1 = Variant::new(&metadata, &value); + + let (metadata, value) = { + let mut builder = VariantBuilder::new(); + let mut list = builder.new_list(); + list.append_null(); + list.append_null(); + list.finish(); + builder.finish() + }; + let variant3 = Variant::new(&metadata, &value); run_test( Arc::new(large_list_view_array), - vec![Some(variant1), Some(variant2), None], + vec![Some(variant0), Some(variant1), None, Some(variant3)], ); } @@ -1363,7 +1386,7 @@ mod tests { // Create a LargeListViewArray with some data let mut builder = LargeListViewBuilder::new(Int64Array::builder(0)); builder.append_value(&Int64Array::from(vec![Some(0), Some(1), Some(2)])); - builder.append_value(&Int64Array::from(vec![Some(3), Some(4)])); + builder.append_value(&Int64Array::from(vec![Some(3), None])); builder.append_null(); let large_list_view_array = builder.finish(); @@ -1372,7 +1395,7 @@ mod tests { let mut builder = VariantBuilder::new(); let mut list = builder.new_list(); list.append_value(3i64); - list.append_value(4i64); + list.append_null(); list.finish(); builder.finish() };