@@ -7,7 +7,8 @@ use vortex_dtype::{IntegerPType, Nullability, match_each_integer_ptype};
77use vortex_error:: VortexExpect ;
88use vortex_scalar:: Scalar ;
99
10- use crate :: arrays:: { ChunkedArray , ListViewArray } ;
10+ use crate :: arrays:: ListViewArray ;
11+ use crate :: builders:: builder_with_capacity;
1112use crate :: vtable:: ValidityHelper ;
1213use crate :: { Array , IntoArray , ToCanonical , compute} ;
1314
@@ -74,6 +75,13 @@ impl ListViewArray {
7475 let offsets_ptype = self . offsets ( ) . dtype ( ) . as_ptype ( ) ;
7576 let sizes_ptype = self . sizes ( ) . dtype ( ) . as_ptype ( ) ;
7677
78+ // One of the main purposes behind adding this "zero-copyable to `ListArray`" optimization
79+ // is that we want to pass data to systems that expect Arrow data.
80+ // The Arrow specification only allows `i32` and `i64` offset and size types, so in
81+ // order to also make `ListView` zero-copyable to **Arrow**'s `ListArray` (not just Vortex's
82+ // `ListArray`), we rebuild the offsets as 32-bit or 64-bit integer types.
83+ // TODO(connor)[ListView]: The same restriction applies to `sizes`, so we should do this
84+ // conversion for sizes as well.
7785 match_each_integer_ptype ! ( sizes_ptype, |S | {
7886 match offsets_ptype {
7987 PType :: U8 => self . naive_rebuild:: <u8 , u32 , S >( ) ,
@@ -89,8 +97,10 @@ impl ListViewArray {
8997 } )
9098 }
9199
92- /// The inner function for `rebuild_zero_copy_to_list`, which naively rebuilds a `ListViewArray`
93- /// via `append_scalar`.
100+ // TODO(connor)[ListView]: We should benchmark if it is faster to use `take` on the elements
101+ // instead of using a builder.
102+ /// The inner function for `rebuild_zero_copy_to_list`, which rebuilds a `ListViewArray` piece
103+ /// by piece.
94104 fn naive_rebuild < O : IntegerPType , NewOffset : IntegerPType , S : IntegerPType > (
95105 & self ,
96106 ) -> ListViewArray {
@@ -99,59 +109,72 @@ impl ListViewArray {
99109 . as_list_element_opt ( )
100110 . vortex_expect ( "somehow had a canonical list that was not a list" ) ;
101111
102- // Upfront canonicalize the list elements, we're going to be doing a lot of
103- // slicing with them.
104- let elements_canonical = self . elements ( ) . to_canonical ( ) . into_array ( ) ;
105112 let offsets_canonical = self . offsets ( ) . to_primitive ( ) ;
113+ let offsets_slice = offsets_canonical. as_slice :: < O > ( ) ;
106114 let sizes_canonical = self . sizes ( ) . to_primitive ( ) ;
115+ let sizes_slice = sizes_canonical. as_slice :: < S > ( ) ;
107116
108- let offsets_canonical = offsets_canonical. as_slice :: < O > ( ) ;
109- let sizes_canonical = sizes_canonical. as_slice :: < S > ( ) ;
117+ let len = offsets_slice. len ( ) ;
110118
111- let mut offsets = BufferMut :: < NewOffset > :: with_capacity ( self . len ( ) ) ;
112- let mut sizes = BufferMut :: < S > :: with_capacity ( self . len ( ) ) ;
119+ let mut new_offsets = BufferMut :: < NewOffset > :: with_capacity ( len) ;
120+ // TODO(connor)[ListView]: Do we really need to do this?
121+ // The only reason we need to rebuild the sizes here is that the validity may indicate that
122+ // a list is null even though it has a non-zero size. This rebuild will set the size of all
123+ // null lists to 0.
124+ let mut new_sizes = BufferMut :: < S > :: with_capacity ( len) ;
113125
114- let mut chunks = Vec :: with_capacity ( self . len ( ) ) ;
126+ // Canonicalize the elements up front as we will be slicing the elements quite a lot.
127+ let elements_canonical = self . elements ( ) . to_canonical ( ) . into_array ( ) ;
115128
116- let mut n_elements = NewOffset :: zero ( ) ;
129+ // Note that we do not know what the exact capacity should be of the new elements since
130+ // there could be overlaps in the existing `ListViewArray`.
131+ let mut new_elements_builder =
132+ builder_with_capacity ( element_dtype. as_ref ( ) , self . elements ( ) . len ( ) ) ;
117133
118- for index in 0 ..self . len ( ) {
134+ let mut n_elements = NewOffset :: zero ( ) ;
135+ for index in 0 ..len {
119136 if !self . is_valid ( index) {
120- offsets. push ( offsets. last ( ) . copied ( ) . unwrap_or_default ( ) ) ;
121- sizes. push ( S :: zero ( ) ) ;
137+ // For NULL lists, place them after the previous item's data to maintain the
138+ // no-overlap invariant for zero-copy to `ListArray` arrays.
139+ new_offsets. push ( n_elements) ;
140+ new_sizes. push ( S :: zero ( ) ) ;
122141 continue ;
123142 }
124143
125- let offset = offsets_canonical [ index] ;
126- let size = sizes_canonical [ index] ;
144+ let offset = offsets_slice [ index] ;
145+ let size = sizes_slice [ index] ;
127146
128147 let start = offset. as_ ( ) ;
129148 let stop = start + size. as_ ( ) ;
130149
131- chunks . push ( elements_canonical . slice ( start..stop ) ) ;
132- offsets . push ( n_elements ) ;
133- sizes . push ( size ) ;
150+ new_offsets . push ( n_elements ) ;
151+ new_sizes . push ( size ) ;
152+ new_elements_builder . extend_from_array ( & elements_canonical . slice ( start..stop ) ) ;
134153
135- n_elements += num_traits:: cast ( size) . vortex_expect ( "cast " ) ;
154+ n_elements += num_traits:: cast ( size) . vortex_expect ( "Cast failed " ) ;
136155 }
137156
138- let offsets = offsets. into_array ( ) ;
139- let sizes = sizes. into_array ( ) ;
157+ let offsets = new_offsets. into_array ( ) ;
158+ let sizes = new_sizes. into_array ( ) ;
159+ let elements = new_elements_builder. finish ( ) ;
140160
141- // SAFETY: all chunks were sliced from the same array so have same DType.
142- let elements =
143- unsafe { ChunkedArray :: new_unchecked ( chunks, element_dtype. as_ref ( ) . clone ( ) ) } ;
161+ debug_assert_eq ! (
162+ n_elements. as_( ) ,
163+ elements. len( ) ,
164+ "The accumulated elements somehow had the wrong length"
165+ ) ;
144166
145- // SAFETY: elements are contiguous, offsets and sizes hand-built to be zero copy
146- // to list.
167+ // SAFETY:
168+ // - All offsets are sequential and non-overlapping (`n_elements` tracks running total).
169+ // - Each `offset[i] + size[i]` equals `offset[i+1]` for all valid indices (including null
170+ // lists).
171+ // - All elements referenced by (offset, size) pairs exist within the new `elements` array.
172+ // - The validity array is preserved from the original array unchanged
173+ // - The array satisfies the zero-copy-to-list property by having sorted offsets, no gaps,
174+ // and no overlaps.
147175 unsafe {
148- ListViewArray :: new_unchecked (
149- elements. to_canonical ( ) . into_array ( ) ,
150- offsets,
151- sizes,
152- self . validity . clone ( ) ,
153- )
154- . with_zero_copy_to_list ( true )
176+ ListViewArray :: new_unchecked ( elements, offsets, sizes, self . validity . clone ( ) )
177+ . with_zero_copy_to_list ( true )
155178 }
156179 }
157180
@@ -342,4 +365,53 @@ mod tests {
342365 let all_elements = trimmed. elements ( ) . to_primitive ( ) ;
343366 assert_eq ! ( all_elements. scalar_at( 2 ) , 97i32 . into( ) ) ;
344367 }
368+
#[test]
fn test_rebuild_with_trailing_nulls_regression() {
    // Regression coverage for issue #5412: rebuilding a list view that ends in NULL lists must
    // give those NULL lists offsets that keep the no-overlap invariant, and a subsequent
    // `MakeExact` rebuild must not panic.

    // Two valid lists (`[1, 2]` and `[3, 4]`) followed by two trailing NULL lists.
    let listview = ListViewArray::new(
        PrimitiveArray::from_iter(vec![1i32, 2, 3, 4]).into_array(),
        PrimitiveArray::from_iter(vec![0u32, 2, 0, 0]).into_array(),
        PrimitiveArray::from_iter(vec![2u32, 2, 0, 0]).into_array(),
        Validity::from_iter(vec![true, true, false, false]),
    );

    // Step 1: rebuild into the zero-copy-to-list layout.
    let rebuilt = listview.rebuild(ListViewRebuildMode::MakeZeroCopyToList);
    assert!(rebuilt.is_zero_copy_to_list());

    // NULL lists must be placed at the end of the data written so far (offset 4) instead of
    // reusing an earlier offset.
    let expected_offsets = [0, 2, 4, 4];
    for (index, expected) in expected_offsets.into_iter().enumerate() {
        assert_eq!(rebuilt.offset_at(index), expected);
    }

    // Valid lists keep their sizes; NULL lists have size 0.
    let expected_sizes = [2, 2, 0, 0];
    for (index, expected) in expected_sizes.into_iter().enumerate() {
        assert_eq!(rebuilt.size_at(index), expected);
    }

    // Step 2: rebuilding the already-rebuilt array with `MakeExact` is the call that used to
    // panic (issue #5412).
    let exact = rebuilt.rebuild(ListViewRebuildMode::MakeExact);

    // Validity survives both rebuilds.
    let expected_validity = [true, true, false, false];
    for (index, expected) in expected_validity.into_iter().enumerate() {
        assert_eq!(exact.is_valid(index), expected);
    }

    // The contents of the valid lists are preserved.
    let expected_lists = [[1i32, 2], [3, 4]];
    for (index, expected) in expected_lists.iter().enumerate() {
        let list = exact.list_elements_at(index).to_primitive();
        assert_eq!(list.as_slice::<i32>(), expected);
    }
}
345417}
0 commit comments