@@ -109,6 +109,106 @@ fn vector_as_string_blob(vector: &mut Vector, len: usize, dtype: DType) -> Array
     builder.finish()
 }
 
+/// Converts a valid [`duckdb_list_entry`] to `(offset, size)`, updating tracking state.
+///
+/// Updates `child_max_length` with the maximum end offset seen so far, and `previous_end` with this
+/// entry's end offset (for use as the offset of subsequent null entries).
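+///
+/// For example, an entry with `offset = 3` and `length = 2` returns `(3, 2)`, sets `previous_end`
+/// to `5`, and raises `child_max_length` to at least `5`.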
+///
+/// Panics if the offset or size is negative or does not fit in the expected types.
+fn convert_valid_list_entry(
+    entry: &duckdb_list_entry,
+    child_max_length: &mut usize,
+    previous_end: &mut i64,
+) -> (i64, i64) {
+    let offset = i64::try_from(entry.offset).vortex_expect("offset must fit i64");
+    assert!(offset >= 0, "offset must be non-negative");
+    let size = i64::try_from(entry.length).vortex_expect("length must fit i64");
+    assert!(size >= 0, "size must be non-negative");
+
+    let end = usize::try_from(offset + size)
+        .vortex_expect("child vector length did not fit into a 32-bit `usize` type");
+
+    *child_max_length = (*child_max_length).max(end);
+    *previous_end = offset + size;
+
+    (offset, size)
+}
+
+/// Processes DuckDB list entries with validity to produce Vortex-compatible `ListView` offsets and
+/// sizes.
+///
+/// Returns `(offsets, sizes, child_max_length)`, where `child_max_length` is the maximum end offset
+/// across all valid list entries, used to determine the length of the child vector coming from
+/// DuckDB.
+///
+/// Null list views in DuckDB may contain garbage offset/size values (unlike the Arrow
+/// specification), so we must check validity before reading them.
+///
+/// For null entries, we set the offset to the previous list's end and the size to 0 so that:
+/// 1. We don't accidentally read garbage data from the child vector.
+/// 2. The null entries remain in (mostly) sorted offset order, which can simplify downstream
+///    operations such as converting `ListView` to `List`.
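+///
+/// For example, the entries `[(0, 2), <null>, (2, 2)]` produce offsets `[0, 2, 2]`, sizes
+/// `[2, 0, 2]`, and a `child_max_length` of 4.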
+fn process_duckdb_lists(
+    entries: &[duckdb_list_entry],
+    validity: &Validity,
+) -> (Buffer<i64>, Buffer<i64>, usize) {
+    let len = entries.len();
+    let mut offsets = BufferMut::with_capacity(len);
+    let mut sizes = BufferMut::with_capacity(len);
+
+    match validity {
+        Validity::NonNullable | Validity::AllValid => {
+            // All entries are valid, so there is no need to check the validity.
+            let mut child_max_length = 0;
+            let mut previous_end = 0;
+            for entry in entries {
+                let (offset, size) =
+                    convert_valid_list_entry(entry, &mut child_max_length, &mut previous_end);
+
+                // SAFETY: We allocated enough capacity above.
+                unsafe {
+                    offsets.push_unchecked(offset);
+                    sizes.push_unchecked(size);
+                }
+            }
+            (offsets.freeze(), sizes.freeze(), child_max_length)
+        }
+        Validity::AllInvalid => {
+            // All entries are null, so we can just set offset=0 and size=0.
+            // SAFETY: We allocated enough capacity above.
+            unsafe {
+                offsets.push_n_unchecked(0, len);
+                sizes.push_n_unchecked(0, len);
+            }
+            (offsets.freeze(), sizes.freeze(), 0)
+        }
+        Validity::Array(_) => {
+            // We have some number of nulls, so make sure to check validity before updating info.
+            let mask = validity.to_mask(len);
+            let child_max_length = mask.iter_bools(|validity_iter| {
+                let mut child_max_length = 0;
+                let mut previous_end = 0;
+
+                for (entry, is_valid) in entries.iter().zip(validity_iter) {
+                    let (offset, size) = if is_valid {
+                        convert_valid_list_entry(entry, &mut child_max_length, &mut previous_end)
+                    } else {
+                        (previous_end, 0)
+                    };
+
+                    // SAFETY: We allocated enough capacity above.
+                    unsafe {
+                        offsets.push_unchecked(offset);
+                        sizes.push_unchecked(size);
+                    }
+                }
+
+                child_max_length
+            });
+            (offsets.freeze(), sizes.freeze(), child_max_length)
+        }
+    }
+}
+
 /// Converts flat vector to a vortex array
 pub fn flat_vector_to_vortex(vector: &mut Vector, len: usize) -> VortexResult<ArrayRef> {
     let type_id = vector.logical_type().as_type_id();
@@ -221,33 +321,18 @@ pub fn flat_vector_to_vortex(vector: &mut Vector, len: usize) -> VortexResult<Ar
             .map(|a| a.into_array())
         }
         DUCKDB_TYPE::DUCKDB_TYPE_LIST => {
-            let mut offsets = BufferMut::with_capacity(len);
-            let mut lengths = BufferMut::with_capacity(len);
-            for duckdb_list_entry { offset, length } in
-                vector.as_slice_with_len::<duckdb_list_entry>(len)
-            {
-                unsafe {
-                    offsets.push_unchecked(
-                        i64::try_from(*offset).vortex_expect("offset must fit i64"),
-                    );
-                    lengths.push_unchecked(
-                        i64::try_from(*length).vortex_expect("length must fit i64"),
-                    );
-                }
-            }
-            let offsets = offsets.freeze();
-            let lengths = lengths.freeze();
-            let child_data = flat_vector_to_vortex(
-                &mut vector.list_vector_get_child(),
-                usize::try_from(offsets[len - 1] + lengths[len - 1])
-                    .vortex_expect("last offset and length sum must fit in usize"),
-            )?;
+            let validity = vector.validity_ref(len).to_validity();
+            let entries = vector.as_slice_with_len::<duckdb_list_entry>(len);
+
+            let (offsets, sizes, child_max_length) = process_duckdb_lists(entries, &validity);
+            let child_data =
+                flat_vector_to_vortex(&mut vector.list_vector_get_child(), child_max_length)?;
 
             ListViewArray::try_new(
                 child_data,
                 offsets.into_array(),
-                lengths.into_array(),
-                vector.validity_ref(len).to_validity(),
+                sizes.into_array(),
+                validity,
             )
             .map(|a| a.into_array())
         }
@@ -651,4 +736,180 @@ mod tests {
             &[5, 6, 7, 8]
         );
     }
+
+    #[test]
+    fn test_list_with_trailing_null() {
+        // Regression test: when the last list entry is null, its offset/length may be 0/0,
+        // so we can't use the last entry to compute child vector length.
+        let child_values = vec![1i32, 2, 3, 4];
+        let len = 2;
+
+        let logical_type =
+            LogicalType::list_type(LogicalType::new(DUCKDB_TYPE::DUCKDB_TYPE_INTEGER))
+                .vortex_unwrap();
+        let mut vector = Vector::with_capacity(logical_type, len);
+
+        // Entry 0: offset=0, length=4 -> all elements (end=4)
+        // Entry 1: null, offset=0, length=0 (end=0)
+        unsafe {
+            let entries = vector.as_slice_mut::<duckdb_list_entry>(len);
+            entries[0] = duckdb_list_entry {
+                offset: 0,
+                length: child_values.len() as u64,
+            };
+            entries[1] = duckdb_list_entry {
+                offset: 0,
+                length: 0,
+            };
+            let mut child = vector.list_vector_get_child();
+            let slice = child.as_slice_mut::<i32>(child_values.len());
+            slice.copy_from_slice(&child_values);
+        }
+
+        // Set the second entry as null.
+        let validity_slice = unsafe { vector.ensure_validity_bitslice(len) };
+        validity_slice.set(1, false);
+
+        // Test conversion - the old bug would compute child length as 0+0=0 instead of
+        // max(4,0)=4.
+        let result = flat_vector_to_vortex(&mut vector, len).unwrap();
+        let vortex_array = result.to_listview();
+
+        assert_eq!(vortex_array.len(), len);
+        assert_eq!(
+            vortex_array
+                .list_elements_at(0)
+                .to_primitive()
+                .as_slice::<i32>(),
+            &[1, 2, 3, 4]
+        );
+        assert_eq!(vortex_array.validity_mask(), Mask::from_indices(2, vec![0]));
+    }
+
+    #[test]
+    fn test_list_out_of_order() {
+        // Regression test: list views can be out of order in DuckDB. The child vector length
+        // must be computed as the maximum end offset, not just the last entry's end offset.
+        let child_values = vec![1i32, 2, 3, 4];
+        let len = 2;
+
+        let logical_type =
+            LogicalType::list_type(LogicalType::new(DUCKDB_TYPE::DUCKDB_TYPE_INTEGER))
+                .vortex_unwrap();
+        let mut vector = Vector::with_capacity(logical_type, len);
+
+        // Populate with out-of-order list entries:
+        // - Entry 0: offset=2, length=2 -> elements [3, 4] (end=4)
+        // - Entry 1: offset=0, length=2 -> elements [1, 2] (end=2)
+        unsafe {
+            let entries = vector.as_slice_mut::<duckdb_list_entry>(len);
+            entries[0] = duckdb_list_entry {
+                offset: 2,
+                length: 2,
+            };
+            entries[1] = duckdb_list_entry {
+                offset: 0,
+                length: 2,
+            };
+            let mut child = vector.list_vector_get_child();
+            let slice = child.as_slice_mut::<i32>(child_values.len());
+            slice.copy_from_slice(&child_values);
+        }
+
+        // Test conversion - the old bug would compute child length as 0+2=2 instead of
+        // max(4,2)=4.
+        let result = flat_vector_to_vortex(&mut vector, len).unwrap();
+        let vortex_array = result.to_listview();
+
+        assert_eq!(vortex_array.len(), len);
+        assert_eq!(
+            vortex_array
+                .list_elements_at(0)
+                .to_primitive()
+                .as_slice::<i32>(),
+            &[3, 4]
+        );
+        assert_eq!(
+            vortex_array
+                .list_elements_at(1)
+                .to_primitive()
+                .as_slice::<i32>(),
+            &[1, 2]
+        );
+    }
+
+    #[test]
+    fn test_list_null_garbage_data() {
+        // Test that null list entries with garbage offset/size values don't cause issues.
+        // DuckDB doesn't guarantee valid offset/size for null list views, so we must check
+        // validity before reading the offset/size values.
+        let child_values = vec![1i32, 2, 3, 4];
+        let len = 3;
+
+        let logical_type =
+            LogicalType::list_type(LogicalType::new(DUCKDB_TYPE::DUCKDB_TYPE_INTEGER))
+                .vortex_unwrap();
+        let mut vector = Vector::with_capacity(logical_type, len);
+
+        // Entry 0: valid, offset=0, length=2 -> elements [1, 2]
+        // Entry 1: null with garbage values (offset=9999, length=9999)
+        // Entry 2: valid, offset=2, length=2 -> elements [3, 4]
+        unsafe {
+            let entries = vector.as_slice_mut::<duckdb_list_entry>(len);
+            entries[0] = duckdb_list_entry {
+                offset: 0,
+                length: 2,
+            };
+            // Garbage values that would cause a panic if we tried to use them.
+            entries[1] = duckdb_list_entry {
+                offset: 9999,
+                length: 9999,
+            };
+            entries[2] = duckdb_list_entry {
+                offset: 2,
+                length: 2,
+            };
+            let mut child = vector.list_vector_get_child();
+            let slice = child.as_slice_mut::<i32>(child_values.len());
+            slice.copy_from_slice(&child_values);
+        }
+
+        // Set entry 1 as null.
+        let validity_slice = unsafe { vector.ensure_validity_bitslice(len) };
+        validity_slice.set(1, false);
+
+        // Test conversion. The old code would compute child_max_length as 9999+9999=19998, which
+        // would panic when trying to read that much data from the child vector.
+        let result = flat_vector_to_vortex(&mut vector, len).unwrap();
+        let vortex_array = result.to_listview();
+
+        assert_eq!(vortex_array.len(), len);
+
+        // Valid entries should work correctly.
+        assert_eq!(
+            vortex_array
+                .list_elements_at(0)
+                .to_primitive()
+                .as_slice::<i32>(),
+            &[1, 2]
+        );
+        assert_eq!(
+            vortex_array
+                .list_elements_at(2)
+                .to_primitive()
+                .as_slice::<i32>(),
+            &[3, 4]
+        );
+
+        // Verify the null entry has sanitized offset/size (offset=2, size=0) rather than garbage.
+        let offsets = vortex_array.offsets().to_primitive();
+        let sizes = vortex_array.sizes().to_primitive();
+        assert_eq!(offsets.as_slice::<i64>()[1], 2); // Previous end (0+2).
+        assert_eq!(sizes.as_slice::<i64>()[1], 0);
+
+        assert_eq!(
+            vortex_array.validity_mask(),
+            Mask::from_indices(3, vec![0, 2])
+        );
+    }
 }