Skip to content
Open
Changes from 9 commits
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
9c25cc4
Add test shredded variant list array
sdf-jkl Sep 15, 2025
ed961a4
Add basic tests
sdf-jkl Sep 16, 2025
03ecb95
Merge branch 'apache:main' into shredded_list_support
sdf-jkl Sep 16, 2025
158d6d7
Merge branch 'apache:main' into shredded_list_support
sdf-jkl Sep 16, 2025
d53c831
Redo test shredded array
sdf-jkl Sep 17, 2025
174e429
Merge branch 'main' of https://github.com/apache/arrow-rs into shredd…
sdf-jkl Sep 18, 2025
69de7d7
Rebuild the shredded list array
sdf-jkl Sep 19, 2025
cc6d787
Use select::take to build the output array
sdf-jkl Sep 23, 2025
8f6ad1b
Merge branch 'main' of https://github.com/apache/arrow-rs into shredd…
sdf-jkl Sep 23, 2025
bc8abd9
Merge branch 'main' of https://github.com/apache/arrow-rs into shredd…
sdf-jkl Sep 25, 2025
c0d2065
Pass one test
sdf-jkl Sep 25, 2025
85aaa3f
Merge branch 'main' of https://github.com/apache/arrow-rs into shredd…
sdf-jkl Sep 25, 2025
40b6311
Get typed values directly
sdf-jkl Sep 25, 2025
f6e88ef
Added support for utf8, largeUtf8, utf8view
sdf-jkl Oct 13, 2025
61ed178
added tests for utf8, largeUtf8, utf8view
sdf-jkl Oct 13, 2025
1fb612d
fix tests
sdf-jkl Oct 13, 2025
2b6d280
Update parquet-variant-compute/src/variant_to_arrow.rs
sdf-jkl Oct 14, 2025
398b52d
Merge branch 'main' into variant_to_arrow_utf8
sdf-jkl Oct 14, 2025
defa07b
Update parquet-variant-compute/src/variant_to_arrow.rs
sdf-jkl Oct 20, 2025
5022acd
Support LargeUtf8, Utf8-View
sdf-jkl Oct 20, 2025
ed66007
Merge branch 'main' into variant_to_arrow_utf8
sdf-jkl Oct 20, 2025
196b5d4
Fix Merge errors
sdf-jkl Oct 20, 2025
642d192
Update arrow-array/src/builder/generic_bytes_builder.rs
sdf-jkl Oct 20, 2025
76b3c80
Add docs for AVERAGE_STRING_LENGTH const
sdf-jkl Oct 21, 2025
35785d6
Merge branch 'variant_to_arrow_utf8' of https://github.com/sdf-jkl/ar…
sdf-jkl Oct 21, 2025
5914218
cargo fmt
sdf-jkl Oct 21, 2025
216d401
cargo fmt
sdf-jkl Oct 21, 2025
3aa6cf3
Merge branch 'variant_to_arrow_utf8' into shredded_list_support
sdf-jkl Oct 22, 2025
15fc8be
Merge branch 'main' of https://github.com/apache/arrow-rs into shredd…
sdf-jkl Oct 22, 2025
04b9941
Quick fix variant_get
sdf-jkl Oct 24, 2025
857f0e2
Merge branch 'main' into shredded_list_support
sdf-jkl Oct 24, 2025
1edfeca
Merge branch 'main' into shredded_list_support
sdf-jkl Nov 11, 2025
6d6793d
fix merge errors
sdf-jkl Nov 11, 2025
da528c2
Merge branch 'main' of https://github.com/apache/arrow-rs into shredd…
sdf-jkl Dec 3, 2025
71412b8
Merge branch 'main' into shredded_list_support
sdf-jkl Dec 3, 2025
0c32647
Merge branch 'main' of https://github.com/apache/arrow-rs into shredd…
sdf-jkl Feb 19, 2026
5b899d8
Merge branch 'shredded_list_support' of https://github.com/sdf-jkl/ar…
sdf-jkl Feb 19, 2026
9cd01d2
Simplify tests using shred_variant
sdf-jkl Feb 20, 2026
cecd39f
Add tests suggested by @klion26
sdf-jkl Feb 21, 2026
a776982
Fix typed and untyped values logic
sdf-jkl Feb 21, 2026
cfe7c00
Add support for LargeListArray + OBB err when safe_cast
sdf-jkl Feb 21, 2026
fc99bf0
Use ShreddingState instead of BorrowedShreddingState in ShreddedPathS…
sdf-jkl Feb 21, 2026
ccbf59b
Reuse ShreddingState methods
sdf-jkl Feb 23, 2026
cbfa058
nit fix
sdf-jkl Feb 25, 2026
cf94d43
use else if chain
sdf-jkl Feb 25, 2026
91589ad
add cast_options.safe docs
sdf-jkl Feb 25, 2026
e8e7fb1
Merge branch 'main' of https://github.com/apache/arrow-rs into shredd…
sdf-jkl Feb 26, 2026
28ec53c
support list-like arrays
sdf-jkl Feb 26, 2026
279b634
match typed value instead of donwcast attempts
sdf-jkl Mar 2, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 151 additions & 10 deletions parquet-variant-compute/src/variant_get.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,11 @@
// specific language governing permissions and limitations
// under the License.
use arrow::{
array::{self, Array, ArrayRef, BinaryViewArray, StructArray},
compute::CastOptions,
array::{
self, Array, ArrayRef, BinaryViewArray, GenericListArray, StructArray,
UInt32Array,
},
compute::{take, CastOptions},
datatypes::Field,
error::Result,
};
Expand Down Expand Up @@ -100,12 +103,56 @@ pub(crate) fn follow_shredded_path_element(

Ok(ShreddedPathStep::Success(struct_array.into()))
}
VariantPathElement::Index { .. } => {
VariantPathElement::Index { index } => {
// TODO: Support array indexing. Among other things, it will require slicing not
// only the array we have here, but also the corresponding metadata and null masks.
Err(ArrowError::NotYetImplemented(
"Pathing into shredded variant array index".into(),
))
let Some(list_array) = typed_value.as_any().downcast_ref::<GenericListArray<i64>>()// <- shouldn't be just i64
else {
// Downcast failure - if strict cast options are enabled, this should be an error
if !cast_options.safe {
return Err(ArrowError::CastError(format!(
"Cannot access index '{}' on non-list type: {}",
index,
typed_value.data_type()
)));
}
// With safe cast options, return NULL (missing_path_step)
return Ok(missing_path_step());
};

let offsets = list_array.offsets();
let list_len = list_array.len(); // number of lists
let values = list_array.values(); // This is a StructArray

let Some(struct_array) = values.as_any().downcast_ref::<StructArray>() else {
return Ok(missing_path_step());
};

let Some(field_array) = struct_array.column_by_name("typed_value") else {
return Ok(missing_path_step());
};

// Build the list of indices to take
let mut take_indices = Vec::with_capacity(list_len);
for i in 0..list_len {
let start = offsets[i] as usize;
let end = offsets[i + 1] as usize;
let len = end - start;

if *index < len {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please correct me if I'm wrong. Here we assume that all the values will be in the typed_value column, and use the indices collected here to retrieve the final value.

What if the value is located in the value column instead of the typed_value column? (Changing the test test_shredded_list_as_string from VariantPath::from(0) to VariantPath::from(1) shows this.)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test is running variant_get with the as_type parameter, specifying which type we are looking for.

The example VariantArray is shredded by String type, therefore a String value cannot be outside the typed_value column and will return a null in this test case.

Given the ["comedy", "drama"], ["horror", 123] ListArray, if we try to take the value from index 1 instead of 0, we will get:

thread 'variant_get::test::test_shredded_list_as_string' (51616) panicked at parquet-variant-compute\src\variant_get.rs:1735:9:
assertion `left == right` failed
  left: StringArray
[
  "drama",
  null,
]
 right: StringArray
[
  "comedy",
  "horror",
]

Copy link
Member

@klion26 klion26 Dec 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for not describing it clearly.

The data ["comedy", "drama"], ["horror", 123] translated into a variant will look like this:

  • comedy, drama and horror in the typed_value column,
  • and 123 in the value column(it has an incompatible type).

Here, we retrieve all the results from the typed_value column (take in line 148), but ["horror", 123](1) (the second item in the list) should return null (if CastOptions::safe = true) and Err (if CastOptions::safe = false) -- currently, we return null in both cases.

It seems there may be something trickier here (maybe we need a design note for this, as in this comment), such as

  • If the target_type we request here is not a list/struct, then we can use logic like this and respect CastOptions::safe
  • Do we need to handle variant nesting here or somewhere else?
    • Here, I don't have any answer yet. I'll try to find some time next week for this.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NOTE: Other than (partially shredded) object fields, the shredding spec doesn't actually require any other type to shred merely because a compatible typed_value column exists. We have to assume that e.g. value could contain i8, i32, and i64 values even if typed_value is a 64-bit int. And AFAIK, we also have to assume that value could contain a variant array even if typed_value is a list. Super annoying.

Maybe this code already handles that case, but I wanted to make sure to flag it.

take_indices.push(Some((start + index) as u32));
} else {
take_indices.push(None);
}
}

let index_array = UInt32Array::from(take_indices);

// Use Arrow compute kernel to gather elements
let taken = take(field_array, &index_array, None)?;
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can see the basic idea here


let state = ShreddingState::try_new(None, Some(Arc::new(taken)))?;
Ok(ShreddedPathStep::Success(&state))
}
}
}
Expand Down Expand Up @@ -296,18 +343,18 @@ impl<'a> GetOptions<'a> {
mod test {
use std::sync::Arc;

use crate::{json_to_variant, VariantValueArrayBuilder};
use arrow::array::{
Array, ArrayRef, AsArray, BinaryViewArray, BooleanArray, Date32Array, FixedSizeBinaryArray,
Float16Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array,
Float16Array, GenericListArray, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array,
StringArray, StructArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
};
use arrow::buffer::NullBuffer;
use arrow::buffer::{NullBuffer, OffsetBuffer};
use arrow::compute::CastOptions;
use arrow::datatypes::DataType::{Int16, Int32, Int64, UInt16, UInt32, UInt64, UInt8};
use arrow_schema::{DataType, Field, FieldRef, Fields};
use parquet_variant::{Variant, VariantPath, EMPTY_VARIANT_METADATA_BYTES};

use crate::json_to_variant;
use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder};
use crate::VariantArray;

Expand Down Expand Up @@ -1305,7 +1352,101 @@ mod test {
let expected: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(42)]));
assert_eq!(&result, &expected);
}
/// Extracts list element 0 from every row of a manually constructed shredded
/// variant array (rows `["comedy", "drama"]` and `["horror", 123]`) and checks
/// the values of the `VariantArray` returned by `variant_get`.
#[test]
fn test_shredded_list_field_access() {
    let array = shredded_list_variant_array();

    // Only a path is given (no `with_as_type`); the result is downcast to a
    // VariantArray below.
    let options = GetOptions::new_with_path(VariantPath::from(0));
    let result = variant_get(&array, options).unwrap();

    let result_variant: &VariantArray = result.as_any().downcast_ref().unwrap();
    // The fixture contains exactly two rows (two lists), so indexing into each
    // row must produce two values.
    assert_eq!(result_variant.len(), 2);

    // Row 0: element 0 = "comedy"
    assert_eq!(result_variant.value(0), Variant::from("comedy"));
    // Row 1: element 0 = "horror"
    assert_eq!(result_variant.value(1), Variant::from("horror"));
}
/// Extracts list element 0 from every row of the shredded list fixture and
/// requests the result as a `Utf8` array.
#[test]
fn test_shredded_list_as_string() {
    let array = shredded_list_variant_array();

    // Ask for element 0 of each list, converted to Utf8.
    let field = Field::new("typed_value", DataType::Utf8, false);
    let options = GetOptions::new_with_path(VariantPath::from(0))
        .with_as_type(Some(FieldRef::from(field)));
    let result = variant_get(&array, options).unwrap();

    // The fixture has two rows, ["comedy", "drama"] and ["horror", 123]; the
    // first element of each row is a shredded string, so the expected output
    // has one entry per row rather than one entry per list element.
    let expected: ArrayRef = Arc::new(StringArray::from(vec![Some("comedy"), Some("horror")]));
    assert_eq!(&result, &expected);
}
/// Builds a two-row shredded variant array in which every row is a list.
///
/// Logical contents:
/// Row 0: ["comedy", "drama"] (both elements shredded as strings)
/// Row 1: ["horror", 123]     (element 0 shredded as a string, element 1 kept as a variant)
///
/// Physical layout:
/// - metadata: one `EMPTY_VARIANT_METADATA_BYTES` entry per row
/// - typed_value: a list array (i32 offsets, two elements per row) of structs; each struct
///   describes one list element through two nullable fields:
///   - value: `Variant::from(123i32)` for the last element, null for the other three
///   - typed_value: the element as a string, null for the last element
/// - no top-level `value` column is added
fn shredded_list_variant_array() -> ArrayRef {
    // Element-level fallback column: only the fourth element is populated.
    let mut fallback_values = VariantValueArrayBuilder::new(8);
    for _ in 0..3 {
        fallback_values.append_null();
    }
    fallback_values.append_value(Variant::from(123i32));
    let fallback_column = fallback_values.build().unwrap();

    // Element-level shredded column: the three string elements, then a null.
    let string_column = StringArray::from(vec![
        Some("comedy"),
        Some("drama"),
        Some("horror"),
        None,
    ]);

    // One struct per list element, pairing the fallback and shredded columns.
    let elements = StructArrayBuilder::new()
        .with_field("value", Arc::new(fallback_column), true)
        .with_field("typed_value", Arc::new(string_column), true)
        .build();

    // Group the four elements into two lists of two elements each.
    let element_field = Arc::new(Field::new_list_field(elements.data_type().clone(), true));
    let row_offsets = OffsetBuffer::from_lengths([2, 2]);
    let list_column = GenericListArray::<i32>::new(element_field, row_offsets, Arc::new(elements), None);

    // Top-level metadata column, one entry per row.
    let metadata_column =
        BinaryViewArray::from_iter_values(std::iter::repeat_n(EMPTY_VARIANT_METADATA_BYTES, 2));

    // Assemble the outer struct backing the VariantArray.
    let outer_struct = StructArrayBuilder::new()
        .with_field("metadata", Arc::new(metadata_column), false)
        .with_field("typed_value", Arc::new(list_column), true)
        .build();

    let variant_array =
        VariantArray::try_new(Arc::new(outer_struct)).expect("should create variant array");
    Arc::new(variant_array)
}
/// Helper function to create a shredded variant array representing objects
///
/// This creates an array that represents:
Expand Down Expand Up @@ -1360,7 +1501,7 @@ mod test {
// Wrap the x field struct in a ShreddedVariantFieldArray
let x_field_shredded = ShreddedVariantFieldArray::try_new(&x_field_struct)
.expect("should create ShreddedVariantFieldArray");

// Create the main typed_value as a struct containing the "x" field
let typed_value_fields = Fields::from(vec![Field::new(
"x",
Expand Down
Loading