@@ -109,6 +109,106 @@ fn vector_as_string_blob(vector: &mut Vector, len: usize, dtype: DType) -> Array
109109 builder. finish ( )
110110}
111111
112+ /// Converts a valid [`duckdb_list_entry`] to `(offset, size)`, updating tracking state.
113+ ///
114+ /// Updates `child_min_length` with the maximum end offset seen so far, and `previous_end` with this
115+ /// entry's end offset (for use as the offset of subsequent null entries).
116+ ///
117+ /// Panics if the offset or size are negative or don't fit in the expected types.
118+ fn convert_valid_list_entry (
119+ entry : & duckdb_list_entry ,
120+ child_min_length : & mut usize ,
121+ previous_end : & mut i64 ,
122+ ) -> ( i64 , i64 ) {
123+ let offset = i64:: try_from ( entry. offset ) . vortex_expect ( "list offset must fit i64" ) ;
124+ assert ! ( offset >= 0 , "list offset must be non-negative" ) ;
125+ let size = i64:: try_from ( entry. length ) . vortex_expect ( "list size must fit i64" ) ;
126+ assert ! ( size >= 0 , "list size must be non-negative" ) ;
127+
128+ let end = usize:: try_from ( offset + size)
129+ . vortex_expect ( "child vector length did not fit into a 32-bit `usize` type" ) ;
130+
131+ * child_min_length = ( * child_min_length) . max ( end) ;
132+ * previous_end = offset + size;
133+
134+ ( offset, size)
135+ }
136+
137+ /// Processes DuckDB list entries with validity to produce Vortex-compatible `ListView` offsets and
138+ /// sizes.
139+ ///
140+ /// Returns `(offsets, sizes, child_min_length)` where `child_min_length` is the maximum end offset
141+ /// across all valid list entries, used to determine the child vector length coming from DuckDB.
142+ ///
143+ /// Null list views in DuckDB may contain garbage offset/size values (which is different from the
144+ /// Arrow specification), so we must check validity before reading them.
145+ ///
146+ /// For null entries, we set the offset to the previous list's end and size to 0 so that:
147+ /// 1. We don't accidentally read garbage data from the child vector.
148+ /// 2. The null entries remain in (mostly) sorted offset order, which can potentially simplify
149+ /// downstream operations like converting `ListView` to `List`.
150+ fn process_duckdb_lists (
151+ entries : & [ duckdb_list_entry ] ,
152+ validity : & Validity ,
153+ ) -> ( Buffer < i64 > , Buffer < i64 > , usize ) {
154+ let len = entries. len ( ) ;
155+ let mut offsets = BufferMut :: with_capacity ( len) ;
156+ let mut sizes = BufferMut :: with_capacity ( len) ;
157+
158+ match validity {
159+ Validity :: NonNullable | Validity :: AllValid => {
160+ // All entries are valid, so there is no need to check the validity.
161+ let mut child_min_length = 0 ;
162+ let mut previous_end = 0 ;
163+ for entry in entries {
164+ let ( offset, size) =
165+ convert_valid_list_entry ( entry, & mut child_min_length, & mut previous_end) ;
166+
167+ // SAFETY: We allocated enough capacity above.
168+ unsafe {
169+ offsets. push_unchecked ( offset) ;
170+ sizes. push_unchecked ( size) ;
171+ }
172+ }
173+ ( offsets. freeze ( ) , sizes. freeze ( ) , child_min_length)
174+ }
175+ Validity :: AllInvalid => {
176+ // All entries are null, so we can just set offset=0 and size=0.
177+ // SAFETY: We allocated enough capacity above.
178+ unsafe {
179+ offsets. push_n_unchecked ( 0 , len) ;
180+ sizes. push_n_unchecked ( 0 , len) ;
181+ }
182+ ( offsets. freeze ( ) , sizes. freeze ( ) , 0 )
183+ }
184+ Validity :: Array ( _) => {
185+ // We have some number of nulls, so make sure to check validity before updating info.
186+ let mask = validity. to_mask ( len) ;
187+ let child_min_length = mask. iter_bools ( |validity_iter| {
188+ let mut child_min_length = 0 ;
189+ let mut previous_end = 0 ;
190+
191+ for ( entry, is_valid) in entries. iter ( ) . zip ( validity_iter) {
192+ let ( offset, size) = if is_valid {
193+ convert_valid_list_entry ( entry, & mut child_min_length, & mut previous_end)
194+ } else {
195+ ( previous_end, 0 )
196+ } ;
197+
198+ // SAFETY: We allocated enough capacity above.
199+ unsafe {
200+ offsets. push_unchecked ( offset) ;
201+ sizes. push_unchecked ( size) ;
202+ }
203+ }
204+
205+ child_min_length
206+ } ) ;
207+ ( offsets. freeze ( ) , sizes. freeze ( ) , child_min_length)
208+ }
209+ }
210+ }
211+
112212/// Converts flat vector to a vortex array
113213pub fn flat_vector_to_vortex ( vector : & mut Vector , len : usize ) -> VortexResult < ArrayRef > {
114214 let type_id = vector. logical_type ( ) . as_type_id ( ) ;
@@ -221,33 +321,18 @@ pub fn flat_vector_to_vortex(vector: &mut Vector, len: usize) -> VortexResult<Ar
221321 . map ( |a| a. into_array ( ) )
222322 }
223323 DUCKDB_TYPE :: DUCKDB_TYPE_LIST => {
224- let mut offsets = BufferMut :: with_capacity ( len) ;
225- let mut lengths = BufferMut :: with_capacity ( len) ;
226- for duckdb_list_entry { offset, length } in
227- vector. as_slice_with_len :: < duckdb_list_entry > ( len)
228- {
229- unsafe {
230- offsets. push_unchecked (
231- i64:: try_from ( * offset) . vortex_expect ( "list offset must fit in i64" ) ,
232- ) ;
233- lengths. push_unchecked (
234- i64:: try_from ( * length) . vortex_expect ( "list length must fit in i64" ) ,
235- ) ;
236- }
237- }
238- let offsets = offsets. freeze ( ) ;
239- let lengths = lengths. freeze ( ) ;
240- let child_data = flat_vector_to_vortex (
241- & mut vector. list_vector_get_child ( ) ,
242- usize:: try_from ( offsets[ len - 1 ] + lengths[ len - 1 ] )
243- . vortex_expect ( "list child data size must fit in usize" ) ,
244- ) ?;
324+ let validity = vector. validity_ref ( len) . to_validity ( ) ;
325+ let entries = vector. as_slice_with_len :: < duckdb_list_entry > ( len) ;
326+
327+ let ( offsets, sizes, child_min_length) = process_duckdb_lists ( entries, & validity) ;
328+ let child_data =
329+ flat_vector_to_vortex ( & mut vector. list_vector_get_child ( ) , child_min_length) ?;
245330
246331 ListViewArray :: try_new (
247332 child_data,
248333 offsets. into_array ( ) ,
249- lengths . into_array ( ) ,
250- vector . validity_ref ( len ) . to_validity ( ) ,
334+ sizes . into_array ( ) ,
335+ validity ,
251336 )
252337 . map ( |a| a. into_array ( ) )
253338 }
@@ -652,4 +737,177 @@ mod tests {
652737 & [ 5 , 6 , 7 , 8 ]
653738 ) ;
654739 }
740+
741+ #[ test]
742+ fn test_list_with_trailing_null ( ) {
743+ // Regression test: when the last list entry is null, its offset/length may be 0/0,
744+ // so we can't use the last entry to compute child vector length.
745+ let child_values = vec ! [ 1i32 , 2 , 3 , 4 ] ;
746+ let len = 2 ;
747+
748+ let logical_type =
749+ LogicalType :: list_type ( LogicalType :: new ( DUCKDB_TYPE :: DUCKDB_TYPE_INTEGER ) ) . unwrap ( ) ;
750+ let mut vector = Vector :: with_capacity ( logical_type, len) ;
751+
752+ // Entry 0: offset=0, length=4 -> all elements (end=4)
753+ // Entry 1: null, offset=0, length=0 (end=0)
754+ unsafe {
755+ let entries = vector. as_slice_mut :: < duckdb_list_entry > ( len) ;
756+ entries[ 0 ] = duckdb_list_entry {
757+ offset : 0 ,
758+ length : child_values. len ( ) as u64 ,
759+ } ;
760+ entries[ 1 ] = duckdb_list_entry {
761+ offset : 0 ,
762+ length : 0 ,
763+ } ;
764+ let mut child = vector. list_vector_get_child ( ) ;
765+ let slice = child. as_slice_mut :: < i32 > ( child_values. len ( ) ) ;
766+ slice. copy_from_slice ( & child_values) ;
767+ }
768+
769+ // Set the second entry as null.
770+ let validity_slice = unsafe { vector. ensure_validity_bitslice ( len) } ;
771+ validity_slice. set ( 1 , false ) ;
772+
773+ // Test conversion - the old bug would compute child length as 0+0=0 instead of
774+ // max(4,0)=4.
775+ let result = flat_vector_to_vortex ( & mut vector, len) . unwrap ( ) ;
776+ let vortex_array = result. to_listview ( ) ;
777+
778+ assert_eq ! ( vortex_array. len( ) , len) ;
779+ assert_eq ! (
780+ vortex_array
781+ . list_elements_at( 0 )
782+ . to_primitive( )
783+ . as_slice:: <i32 >( ) ,
784+ & [ 1 , 2 , 3 , 4 ]
785+ ) ;
786+ assert_eq ! ( vortex_array. validity_mask( ) , Mask :: from_indices( 2 , vec![ 0 ] ) ) ;
787+ }
788+
789+ #[ test]
790+ fn test_list_out_of_order ( ) {
791+ // Regression test: list views can be out of order in DuckDB. The child vector length
792+ // must be computed as the maximum end offset, not just the last entry's end offset.
793+ let child_values = vec ! [ 1i32 , 2 , 3 , 4 ] ;
794+ let len = 2 ;
795+
796+ let logical_type =
797+ LogicalType :: list_type ( LogicalType :: new ( DUCKDB_TYPE :: DUCKDB_TYPE_INTEGER ) ) . unwrap ( ) ;
798+ let mut vector = Vector :: with_capacity ( logical_type, len) ;
799+
800+ // Populate with out-of-order list entries:
801+ // - Entry 0: offset=2, length=2 -> elements [3, 4] (end=4)
802+ // - Entry 1: offset=0, length=2 -> elements [1, 2] (end=2)
803+ unsafe {
804+ let entries = vector. as_slice_mut :: < duckdb_list_entry > ( len) ;
805+ entries[ 0 ] = duckdb_list_entry {
806+ offset : 2 ,
807+ length : 2 ,
808+ } ;
809+ entries[ 1 ] = duckdb_list_entry {
810+ offset : 0 ,
811+ length : 2 ,
812+ } ;
813+ let mut child = vector. list_vector_get_child ( ) ;
814+ let slice = child. as_slice_mut :: < i32 > ( child_values. len ( ) ) ;
815+ slice. copy_from_slice ( & child_values) ;
816+ }
817+
818+ // Test conversion - the old bug would compute child length as 0+2=2 instead of
819+ // max(4,2)=4.
820+ let result = flat_vector_to_vortex ( & mut vector, len) . unwrap ( ) ;
821+ let vortex_array = result. to_listview ( ) ;
822+
823+ assert_eq ! ( vortex_array. len( ) , len) ;
824+ assert_eq ! (
825+ vortex_array
826+ . list_elements_at( 0 )
827+ . to_primitive( )
828+ . as_slice:: <i32 >( ) ,
829+ & [ 3 , 4 ]
830+ ) ;
831+ assert_eq ! (
832+ vortex_array
833+ . list_elements_at( 1 )
834+ . to_primitive( )
835+ . as_slice:: <i32 >( ) ,
836+ & [ 1 , 2 ]
837+ ) ;
838+ }
839+
840+ #[ test]
841+ fn test_list_null_garbage_data ( ) {
842+ // Test that null list entries with garbage offset/size values don't cause issues.
843+ // DuckDB doesn't guarantee valid offset/size for null list views, so we must check
844+ // validity before reading the offset/size values.
845+ let child_values = vec ! [ 1i32 , 2 , 3 , 4 ] ;
846+ let len = 3 ;
847+
848+ let logical_type =
849+ LogicalType :: list_type ( LogicalType :: new ( DUCKDB_TYPE :: DUCKDB_TYPE_INTEGER ) ) . unwrap ( ) ;
850+ let mut vector = Vector :: with_capacity ( logical_type, len) ;
851+
852+ // Entry 0: valid, offset=0, length=2 -> elements [1, 2]
853+ // Entry 1: null with garbage values (offset=9999, length=9999)
854+ // Entry 2: valid, offset=2, length=2 -> elements [3, 4]
855+ unsafe {
856+ let entries = vector. as_slice_mut :: < duckdb_list_entry > ( len) ;
857+ entries[ 0 ] = duckdb_list_entry {
858+ offset : 0 ,
859+ length : 2 ,
860+ } ;
861+ // Garbage values that would cause a panic if we tried to use them.
862+ entries[ 1 ] = duckdb_list_entry {
863+ offset : 9999 ,
864+ length : 9999 ,
865+ } ;
866+ entries[ 2 ] = duckdb_list_entry {
867+ offset : 2 ,
868+ length : 2 ,
869+ } ;
870+ let mut child = vector. list_vector_get_child ( ) ;
871+ let slice = child. as_slice_mut :: < i32 > ( child_values. len ( ) ) ;
872+ slice. copy_from_slice ( & child_values) ;
873+ }
874+
875+ // Set entry 1 as null.
876+ let validity_slice = unsafe { vector. ensure_validity_bitslice ( len) } ;
877+ validity_slice. set ( 1 , false ) ;
878+
879+ // Test conversion. The old code would compute child_min_length as 9999+9999=19998, which
880+ // would panic when trying to read that much data from the child vector.
881+ let result = flat_vector_to_vortex ( & mut vector, len) . unwrap ( ) ;
882+ let vortex_array = result. to_listview ( ) ;
883+
884+ assert_eq ! ( vortex_array. len( ) , len) ;
885+
886+ // Valid entries should work correctly.
887+ assert_eq ! (
888+ vortex_array
889+ . list_elements_at( 0 )
890+ . to_primitive( )
891+ . as_slice:: <i32 >( ) ,
892+ & [ 1 , 2 ]
893+ ) ;
894+ assert_eq ! (
895+ vortex_array
896+ . list_elements_at( 2 )
897+ . to_primitive( )
898+ . as_slice:: <i32 >( ) ,
899+ & [ 3 , 4 ]
900+ ) ;
901+
902+ // Verify the null entry has sanitized offset/size (offset=2, size=0) rather than garbage.
903+ let offsets = vortex_array. offsets ( ) . to_primitive ( ) ;
904+ let sizes = vortex_array. sizes ( ) . to_primitive ( ) ;
905+ assert_eq ! ( offsets. as_slice:: <i64 >( ) [ 1 ] , 2 ) ; // Previous end (0+2).
906+ assert_eq ! ( sizes. as_slice:: <i64 >( ) [ 1 ] , 0 ) ;
907+
908+ assert_eq ! (
909+ vortex_array. validity_mask( ) ,
910+ Mask :: from_indices( 3 , vec![ 0 , 2 ] )
911+ ) ;
912+ }
655913}
0 commit comments