Skip to content

Commit 38989db

Browse files
committed
fix incorrect child vector size in duckdb list array import
Signed-off-by: Connor Tsui <[email protected]>
1 parent 47d6ba7 commit 38989db

File tree

1 file changed

+134
-15
lines changed

1 file changed

+134
-15
lines changed

vortex-duckdb/src/convert/vector.rs

Lines changed: 134 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -222,26 +222,44 @@ pub fn flat_vector_to_vortex(vector: &mut Vector, len: usize) -> VortexResult<Ar
222222
}
223223
DUCKDB_TYPE::DUCKDB_TYPE_LIST => {
224224
let mut offsets = BufferMut::with_capacity(len);
225-
let mut lengths = BufferMut::with_capacity(len);
226-
for duckdb_list_entry { offset, length } in
227-
vector.as_slice_with_len::<duckdb_list_entry>(len)
225+
let mut sizes = BufferMut::with_capacity(len);
226+
227+
let mut child_max_length = 0;
228+
for duckdb_list_entry {
229+
offset,
230+
length: size,
231+
} in vector.as_slice_with_len::<duckdb_list_entry>(len)
228232
{
233+
let offset = i64::try_from(*offset).vortex_expect("offset must fit i64");
234+
assert!(offset >= 0, "offset must be non-negative");
235+
let size = i64::try_from(*size).vortex_expect("length must fit i64");
236+
assert!(size >= 0, "size must be non-negative");
237+
238+
// TODO(connor): Figure out if we need to worry about validity. In Arrow, null list
239+
// views must still have a valid offset and size, but this might not be true is
240+
// DuckDB? The public docs don't say anything about this...
241+
242+
// We have to track the ends of all list views to determine the length of the child
243+
// vector since 1) they can be arbitrarily out of order and 2) there might be a null
244+
// list at the end of the vector.
245+
let end = usize::try_from(offset + size).vortex_expect(
246+
"either math is broken or the child vector length did not \
247+
fit into a 32-bit `usize` type",
248+
);
249+
child_max_length = child_max_length.max(end);
250+
251+
// SAFETY: We just allocated enough capacity for both of these.
229252
unsafe {
230-
offsets.push_unchecked(
231-
i64::try_from(*offset).vortex_expect("offset must fit i64"),
232-
);
233-
lengths.push_unchecked(
234-
i64::try_from(*length).vortex_expect("length must fit i64"),
235-
);
253+
offsets.push_unchecked(offset);
254+
sizes.push_unchecked(size);
236255
}
237256
}
257+
238258
let offsets = offsets.freeze();
239-
let lengths = lengths.freeze();
240-
let child_data = flat_vector_to_vortex(
241-
&mut vector.list_vector_get_child(),
242-
usize::try_from(offsets[len - 1] + lengths[len - 1])
243-
.vortex_expect("last offset and length sum must fit in usize "),
244-
)?;
259+
let lengths = sizes.freeze();
260+
261+
let child_data =
262+
flat_vector_to_vortex(&mut vector.list_vector_get_child(), child_max_length)?;
245263

246264
ListViewArray::try_new(
247265
child_data,
@@ -651,4 +669,105 @@ mod tests {
651669
&[5, 6, 7, 8]
652670
);
653671
}
672+
673+
#[test]
674+
fn test_list_with_trailing_null() {
675+
// Regression test: when the last list entry is null, its offset/length may be 0/0,
676+
// so we can't use the last entry to compute child vector length.
677+
let child_values = vec![1i32, 2, 3, 4];
678+
let len = 2;
679+
680+
let logical_type =
681+
LogicalType::list_type(LogicalType::new(DUCKDB_TYPE::DUCKDB_TYPE_INTEGER))
682+
.vortex_unwrap();
683+
let mut vector = Vector::with_capacity(logical_type, len);
684+
685+
// Entry 0: offset=0, length=4 -> all elements (end=4)
686+
// Entry 1: null, offset=0, length=0 (end=0)
687+
unsafe {
688+
let entries = vector.as_slice_mut::<duckdb_list_entry>(len);
689+
entries[0] = duckdb_list_entry {
690+
offset: 0,
691+
length: child_values.len() as u64,
692+
};
693+
entries[1] = duckdb_list_entry {
694+
offset: 0,
695+
length: 0,
696+
};
697+
let mut child = vector.list_vector_get_child();
698+
let slice = child.as_slice_mut::<i32>(child_values.len());
699+
slice.copy_from_slice(&child_values);
700+
}
701+
702+
// Set the second entry as null.
703+
let validity_slice = unsafe { vector.ensure_validity_bitslice(len) };
704+
validity_slice.set(1, false);
705+
706+
// Test conversion - the old bug would compute child length as 0+0=0 instead of
707+
// max(4,0)=4.
708+
let result = flat_vector_to_vortex(&mut vector, len).unwrap();
709+
let vortex_array = result.to_listview();
710+
711+
assert_eq!(vortex_array.len(), len);
712+
assert_eq!(
713+
vortex_array
714+
.list_elements_at(0)
715+
.to_primitive()
716+
.as_slice::<i32>(),
717+
&[1, 2, 3, 4]
718+
);
719+
assert_eq!(vortex_array.validity_mask(), Mask::from_indices(2, vec![0]));
720+
}
721+
722+
#[test]
723+
fn test_list_out_of_order() {
724+
// Regression test: list views can be out of order in DuckDB. The child vector length
725+
// must be computed as the maximum end offset, not just the last entry's end offset.
726+
let child_values = vec![1i32, 2, 3, 4];
727+
let len = 2;
728+
729+
let logical_type =
730+
LogicalType::list_type(LogicalType::new(DUCKDB_TYPE::DUCKDB_TYPE_INTEGER))
731+
.vortex_unwrap();
732+
let mut vector = Vector::with_capacity(logical_type, len);
733+
734+
// Populate with out-of-order list entries:
735+
// - Entry 0: offset=2, length=2 -> elements [3, 4] (end=4)
736+
// - Entry 1: offset=0, length=2 -> elements [1, 2] (end=2)
737+
unsafe {
738+
let entries = vector.as_slice_mut::<duckdb_list_entry>(len);
739+
entries[0] = duckdb_list_entry {
740+
offset: 2,
741+
length: 2,
742+
};
743+
entries[1] = duckdb_list_entry {
744+
offset: 0,
745+
length: 2,
746+
};
747+
let mut child = vector.list_vector_get_child();
748+
let slice = child.as_slice_mut::<i32>(child_values.len());
749+
slice.copy_from_slice(&child_values);
750+
}
751+
752+
// Test conversion - the old bug would compute child length as 0+2=2 instead of
753+
// max(4,2)=4.
754+
let result = flat_vector_to_vortex(&mut vector, len).unwrap();
755+
let vortex_array = result.to_listview();
756+
757+
assert_eq!(vortex_array.len(), len);
758+
assert_eq!(
759+
vortex_array
760+
.list_elements_at(0)
761+
.to_primitive()
762+
.as_slice::<i32>(),
763+
&[3, 4]
764+
);
765+
assert_eq!(
766+
vortex_array
767+
.list_elements_at(1)
768+
.to_primitive()
769+
.as_slice::<i32>(),
770+
&[1, 2]
771+
);
772+
}
654773
}

0 commit comments

Comments
 (0)