-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Support Shredded Lists/Array in variant_get
#8354
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 9 commits
9c25cc4
ed961a4
03ecb95
158d6d7
d53c831
174e429
69de7d7
cc6d787
8f6ad1b
bc8abd9
c0d2065
85aaa3f
40b6311
f6e88ef
61ed178
1fb612d
2b6d280
398b52d
defa07b
5022acd
ed66007
196b5d4
642d192
76b3c80
35785d6
5914218
216d401
3aa6cf3
15fc8be
04b9941
857f0e2
1edfeca
6d6793d
da528c2
71412b8
0c32647
5b899d8
9cd01d2
cecd39f
a776982
cfe7c00
fc99bf0
ccbf59b
cbfa058
cf94d43
91589ad
e8e7fb1
28ec53c
279b634
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,8 +15,11 @@ | |
| // specific language governing permissions and limitations | ||
| // under the License. | ||
| use arrow::{ | ||
| array::{self, Array, ArrayRef, BinaryViewArray, StructArray}, | ||
| compute::CastOptions, | ||
| array::{ | ||
| self, Array, ArrayRef, BinaryViewArray, GenericListArray, StructArray, | ||
| UInt32Array, | ||
| }, | ||
| compute::{take, CastOptions}, | ||
| datatypes::Field, | ||
| error::Result, | ||
| }; | ||
|
|
@@ -100,12 +103,56 @@ pub(crate) fn follow_shredded_path_element( | |
|
|
||
| Ok(ShreddedPathStep::Success(struct_array.into())) | ||
| } | ||
| VariantPathElement::Index { .. } => { | ||
| VariantPathElement::Index { index } => { | ||
| // TODO: Support array indexing. Among other things, it will require slicing not | ||
| // only the array we have here, but also the corresponding metadata and null masks. | ||
| Err(ArrowError::NotYetImplemented( | ||
| "Pathing into shredded variant array index".into(), | ||
| )) | ||
| let Some(list_array) = typed_value.as_any().downcast_ref::<GenericListArray<i64>>()// <- shouldn't be just i64 | ||
| else { | ||
| // Downcast failure - if strict cast options are enabled, this should be an error | ||
| if !cast_options.safe { | ||
| return Err(ArrowError::CastError(format!( | ||
| "Cannot access index '{}' on non-list type: {}", | ||
| index, | ||
| typed_value.data_type() | ||
| ))); | ||
| } | ||
| // With safe cast options, return NULL (missing_path_step) | ||
| return Ok(missing_path_step()); | ||
| }; | ||
|
|
||
| let offsets = list_array.offsets(); | ||
| let list_len = list_array.len(); // number of lists | ||
| let values = list_array.values(); // This is a StructArray | ||
|
|
||
| let Some(struct_array) = values.as_any().downcast_ref::<StructArray>() else { | ||
| return Ok(missing_path_step()); | ||
| }; | ||
|
|
||
| let Some(field_array) = struct_array.column_by_name("typed_value") else { | ||
| return Ok(missing_path_step()); | ||
| }; | ||
|
|
||
| // Build the list of indices to take | ||
| let mut take_indices = Vec::with_capacity(list_len); | ||
| for i in 0..list_len { | ||
| let start = offsets[i] as usize; | ||
| let end = offsets[i + 1] as usize; | ||
| let len = end - start; | ||
|
|
||
| if *index < len { | ||
|
||
| take_indices.push(Some((start + index) as u32)); | ||
| } else { | ||
| take_indices.push(None); | ||
sdf-jkl marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
| } | ||
|
|
||
| let index_array = UInt32Array::from(take_indices); | ||
|
|
||
| // Use Arrow compute kernel to gather elements | ||
| let taken = take(field_array, &index_array, None)?; | ||
|
||
|
|
||
| let state = ShreddingState::try_new(None, Some(Arc::new(taken)))?; | ||
| Ok(ShreddedPathStep::Success(&state)) | ||
| } | ||
| } | ||
| } | ||
|
|
@@ -296,18 +343,18 @@ impl<'a> GetOptions<'a> { | |
| mod test { | ||
| use std::sync::Arc; | ||
|
|
||
| use crate::{json_to_variant, VariantValueArrayBuilder}; | ||
| use arrow::array::{ | ||
| Array, ArrayRef, AsArray, BinaryViewArray, BooleanArray, Date32Array, FixedSizeBinaryArray, | ||
| Float16Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, | ||
| Float16Array, GenericListArray, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, | ||
| StringArray, StructArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, | ||
| }; | ||
| use arrow::buffer::NullBuffer; | ||
| use arrow::buffer::{NullBuffer, OffsetBuffer}; | ||
| use arrow::compute::CastOptions; | ||
| use arrow::datatypes::DataType::{Int16, Int32, Int64, UInt16, UInt32, UInt64, UInt8}; | ||
| use arrow_schema::{DataType, Field, FieldRef, Fields}; | ||
| use parquet_variant::{Variant, VariantPath, EMPTY_VARIANT_METADATA_BYTES}; | ||
|
|
||
| use crate::json_to_variant; | ||
| use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder}; | ||
| use crate::VariantArray; | ||
|
|
||
|
|
@@ -1305,7 +1352,101 @@ mod test { | |
| let expected: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(42)])); | ||
| assert_eq!(&result, &expected); | ||
| } | ||
| /// This test manually constructs a shredded variant array representing lists | ||
| /// like ["comedy", "drama"] and ["horror", 123] | ||
| /// as VariantArray using variant_get. | ||
| #[test] | ||
| fn test_shredded_list_field_access() { | ||
| let array = shredded_list_variant_array(); | ||
|
|
||
| // Test: Extract the 0 index field as VariantArray first | ||
| let options = GetOptions::new_with_path(VariantPath::from(0)); | ||
| let result = variant_get(&array, options).unwrap(); | ||
|
|
||
| let result_variant: &VariantArray = result.as_any().downcast_ref().unwrap(); | ||
| assert_eq!(result_variant.len(), 3); | ||
|
|
||
| // Row 0: expect 0 index = "comedy" | ||
| assert_eq!(result_variant.value(0), Variant::from("comedy")); | ||
sdf-jkl marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| // Row 1: expect 0 index = "horror" | ||
| assert_eq!(result_variant.value(1), Variant::from("horror")); | ||
| } | ||
sdf-jkl marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| /// Test extracting shredded list field with type conversion | ||
| #[test] | ||
| fn test_shredded_list_as_string() { | ||
| let array = shredded_list_variant_array(); | ||
|
|
||
| // Test: Extract the 0 index values as StringArray (type conversion) | ||
| let field = Field::new("typed_value", DataType::Utf8, false); | ||
| let options = GetOptions::new_with_path(VariantPath::from(0)) | ||
| .with_as_type(Some(FieldRef::from(field))); | ||
| let result = variant_get(&array, options).unwrap(); | ||
|
|
||
| // Should get StringArray | ||
| let expected: ArrayRef = | ||
| Arc::new(StringArray::from(vec![Some("comedy"), None, Some("drama")])); | ||
| assert_eq!(&result, &expected); | ||
| } | ||
| /// Helper function to create a shredded variant array representing lists | ||
| /// | ||
| /// This creates an array that represents: | ||
| /// Row 0: ["comedy", "drama"] ([0] is shredded, [1] is shredded - perfectly shredded) | ||
| /// Row 1: ["horror", 123] ([0] is shredded, [1] is int - partially shredded) | ||
| /// | ||
| /// The physical layout follows the shredding spec where: | ||
| /// - metadata: contains list metadata | ||
| /// - typed_value: StructArray with 0 index value | ||
| /// - value: contains fallback for | ||
| fn shredded_list_variant_array() -> ArrayRef { | ||
| // Create metadata array | ||
| let metadata_array = | ||
| BinaryViewArray::from_iter_values(std::iter::repeat_n(EMPTY_VARIANT_METADATA_BYTES, 2)); | ||
|
|
||
| // Building the typed_value ListArray | ||
|
|
||
| let mut variant_value_builder = VariantValueArrayBuilder::new(8); | ||
| variant_value_builder.append_null(); | ||
| variant_value_builder.append_null(); | ||
| variant_value_builder.append_null(); | ||
| variant_value_builder.append_value(Variant::from(123i32)); | ||
|
|
||
| let struct_array = StructArrayBuilder::new() | ||
| .with_field( | ||
| "value", | ||
| Arc::new(variant_value_builder.build().unwrap()), | ||
| true, | ||
| ) | ||
| .with_field( | ||
| "typed_value", | ||
| Arc::new(StringArray::from(vec![ | ||
| Some("comedy"), | ||
| Some("drama"), | ||
| Some("horror"), | ||
| None, | ||
| ])), | ||
| true, | ||
| ) | ||
| .build(); | ||
|
|
||
| let typed_value_array = GenericListArray::<i32>::new( | ||
| Arc::new(Field::new_list_field( | ||
| struct_array.data_type().clone(), | ||
| true, | ||
| )), | ||
| OffsetBuffer::from_lengths([2, 2]), | ||
| Arc::new(struct_array), | ||
| None, | ||
| ); | ||
|
|
||
| // Build the main VariantArray | ||
| let main_struct = crate::variant_array::StructArrayBuilder::new() | ||
| .with_field("metadata", Arc::new(metadata_array), false) | ||
| // .with_field("value", Arc::new(value_array), true) | ||
sdf-jkl marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| .with_field("typed_value", Arc::new(typed_value_array), true) | ||
| .build(); | ||
|
|
||
| Arc::new(VariantArray::try_new(Arc::new(main_struct)).expect("should create variant array")) | ||
| } | ||
| /// Helper function to create a shredded variant array representing objects | ||
| /// | ||
| /// This creates an array that represents: | ||
|
|
@@ -1360,7 +1501,7 @@ mod test { | |
| // Wrap the x field struct in a ShreddedVariantFieldArray | ||
| let x_field_shredded = ShreddedVariantFieldArray::try_new(&x_field_struct) | ||
| .expect("should create ShreddedVariantFieldArray"); | ||
|
|
||
| // Create the main typed_value as a struct containing the "x" field | ||
| let typed_value_fields = Fields::from(vec![Field::new( | ||
| "x", | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.