Skip to content

Commit b3aa9d7

Browse files
authored
Fix: incorrect child vector size in duckdb list array import (#5740)
Fixes #5597 Since list views can be stored out of order and because there can be nulls at the end of the vector, we have to calculate the length of the child vector by looking at all views in the vector. Note that this does not take validity into account, as I have no idea if DuckDB allows for complete garbage in their list views if they are null. At least in Arrow, that is not allowed. Also adds 2 regression tests. Signed-off-by: Connor Tsui <[email protected]>
1 parent 87c9d07 commit b3aa9d7

File tree

2 files changed

+282
-23
lines changed

2 files changed

+282
-23
lines changed

vortex-duckdb/src/convert/vector.rs

Lines changed: 281 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,106 @@ fn vector_as_string_blob(vector: &mut Vector, len: usize, dtype: DType) -> Array
109109
builder.finish()
110110
}
111111

112+
/// Converts a valid [`duckdb_list_entry`] to `(offset, size)`, updating tracking state.
113+
///
114+
/// Updates `child_min_length` with the maximum end offset seen so far, and `previous_end` with this
115+
/// entry's end offset (for use as the offset of subsequent null entries).
116+
///
117+
/// Panics if the offset or size are negative or don't fit in the expected types.
118+
fn convert_valid_list_entry(
119+
entry: &duckdb_list_entry,
120+
child_min_length: &mut usize,
121+
previous_end: &mut i64,
122+
) -> (i64, i64) {
123+
let offset = i64::try_from(entry.offset).vortex_expect("list offset must fit i64");
124+
assert!(offset >= 0, "list offset must be non-negative");
125+
let size = i64::try_from(entry.length).vortex_expect("list size must fit i64");
126+
assert!(size >= 0, "list size must be non-negative");
127+
128+
let end = usize::try_from(offset + size)
129+
.vortex_expect("child vector length did not fit into a 32-bit `usize` type");
130+
131+
*child_min_length = (*child_min_length).max(end);
132+
*previous_end = offset + size;
133+
134+
(offset, size)
135+
}
136+
137+
/// Processes DuckDB list entries with validity to produce Vortex-compatible `ListView` offsets and
138+
/// sizes.
139+
///
140+
/// Returns `(offsets, sizes, child_min_length)` where `child_min_length` is the maximum end offset
141+
/// across all valid list entries, used to determine the child vector length coming from DuckDB.
142+
///
143+
/// Null list views in DuckDB may contain garbage offset/size values (which is different from the
144+
/// Arrow specification), so we must check validity before reading them.
145+
///
146+
/// For null entries, we set the offset to the previous list's end and size to 0 so that:
147+
/// 1. We don't accidentally read garbage data from the child vector.
148+
/// 2. The null entries remain in (mostly) sorted offset order, which can potentially simplify
149+
/// downstream operations like converting `ListView` to `List`.
150+
fn process_duckdb_lists(
151+
entries: &[duckdb_list_entry],
152+
validity: &Validity,
153+
) -> (Buffer<i64>, Buffer<i64>, usize) {
154+
let len = entries.len();
155+
let mut offsets = BufferMut::with_capacity(len);
156+
let mut sizes = BufferMut::with_capacity(len);
157+
158+
match validity {
159+
Validity::NonNullable | Validity::AllValid => {
160+
// All entries are valid, so there is no need to check the validity.
161+
let mut child_min_length = 0;
162+
let mut previous_end = 0;
163+
for entry in entries {
164+
let (offset, size) =
165+
convert_valid_list_entry(entry, &mut child_min_length, &mut previous_end);
166+
167+
// SAFETY: We allocated enough capacity above.
168+
unsafe {
169+
offsets.push_unchecked(offset);
170+
sizes.push_unchecked(size);
171+
}
172+
}
173+
(offsets.freeze(), sizes.freeze(), child_min_length)
174+
}
175+
Validity::AllInvalid => {
176+
// All entries are null, so we can just set offset=0 and size=0.
177+
// SAFETY: We allocated enough capacity above.
178+
unsafe {
179+
offsets.push_n_unchecked(0, len);
180+
sizes.push_n_unchecked(0, len);
181+
}
182+
(offsets.freeze(), sizes.freeze(), 0)
183+
}
184+
Validity::Array(_) => {
185+
// We have some number of nulls, so make sure to check validity before updating info.
186+
let mask = validity.to_mask(len);
187+
let child_min_length = mask.iter_bools(|validity_iter| {
188+
let mut child_min_length = 0;
189+
let mut previous_end = 0;
190+
191+
for (entry, is_valid) in entries.iter().zip(validity_iter) {
192+
let (offset, size) = if is_valid {
193+
convert_valid_list_entry(entry, &mut child_min_length, &mut previous_end)
194+
} else {
195+
(previous_end, 0)
196+
};
197+
198+
// SAFETY: We allocated enough capacity above.
199+
unsafe {
200+
offsets.push_unchecked(offset);
201+
sizes.push_unchecked(size);
202+
}
203+
}
204+
205+
child_min_length
206+
});
207+
(offsets.freeze(), sizes.freeze(), child_min_length)
208+
}
209+
}
210+
}
211+
112212
/// Converts flat vector to a vortex array
113213
pub fn flat_vector_to_vortex(vector: &mut Vector, len: usize) -> VortexResult<ArrayRef> {
114214
let type_id = vector.logical_type().as_type_id();
@@ -221,33 +321,18 @@ pub fn flat_vector_to_vortex(vector: &mut Vector, len: usize) -> VortexResult<Ar
221321
.map(|a| a.into_array())
222322
}
223323
DUCKDB_TYPE::DUCKDB_TYPE_LIST => {
224-
let mut offsets = BufferMut::with_capacity(len);
225-
let mut lengths = BufferMut::with_capacity(len);
226-
for duckdb_list_entry { offset, length } in
227-
vector.as_slice_with_len::<duckdb_list_entry>(len)
228-
{
229-
unsafe {
230-
offsets.push_unchecked(
231-
i64::try_from(*offset).vortex_expect("list offset must fit in i64"),
232-
);
233-
lengths.push_unchecked(
234-
i64::try_from(*length).vortex_expect("list length must fit in i64"),
235-
);
236-
}
237-
}
238-
let offsets = offsets.freeze();
239-
let lengths = lengths.freeze();
240-
let child_data = flat_vector_to_vortex(
241-
&mut vector.list_vector_get_child(),
242-
usize::try_from(offsets[len - 1] + lengths[len - 1])
243-
.vortex_expect("list child data size must fit in usize"),
244-
)?;
324+
let validity = vector.validity_ref(len).to_validity();
325+
let entries = vector.as_slice_with_len::<duckdb_list_entry>(len);
326+
327+
let (offsets, sizes, child_min_length) = process_duckdb_lists(entries, &validity);
328+
let child_data =
329+
flat_vector_to_vortex(&mut vector.list_vector_get_child(), child_min_length)?;
245330

246331
ListViewArray::try_new(
247332
child_data,
248333
offsets.into_array(),
249-
lengths.into_array(),
250-
vector.validity_ref(len).to_validity(),
334+
sizes.into_array(),
335+
validity,
251336
)
252337
.map(|a| a.into_array())
253338
}
@@ -652,4 +737,177 @@ mod tests {
652737
&[5, 6, 7, 8]
653738
);
654739
}
740+
741+
#[test]
742+
fn test_list_with_trailing_null() {
743+
// Regression test: when the last list entry is null, its offset/length may be 0/0,
744+
// so we can't use the last entry to compute child vector length.
745+
let child_values = vec![1i32, 2, 3, 4];
746+
let len = 2;
747+
748+
let logical_type =
749+
LogicalType::list_type(LogicalType::new(DUCKDB_TYPE::DUCKDB_TYPE_INTEGER)).unwrap();
750+
let mut vector = Vector::with_capacity(logical_type, len);
751+
752+
// Entry 0: offset=0, length=4 -> all elements (end=4)
753+
// Entry 1: null, offset=0, length=0 (end=0)
754+
unsafe {
755+
let entries = vector.as_slice_mut::<duckdb_list_entry>(len);
756+
entries[0] = duckdb_list_entry {
757+
offset: 0,
758+
length: child_values.len() as u64,
759+
};
760+
entries[1] = duckdb_list_entry {
761+
offset: 0,
762+
length: 0,
763+
};
764+
let mut child = vector.list_vector_get_child();
765+
let slice = child.as_slice_mut::<i32>(child_values.len());
766+
slice.copy_from_slice(&child_values);
767+
}
768+
769+
// Set the second entry as null.
770+
let validity_slice = unsafe { vector.ensure_validity_bitslice(len) };
771+
validity_slice.set(1, false);
772+
773+
// Test conversion - the old bug would compute child length as 0+0=0 instead of
774+
// max(4,0)=4.
775+
let result = flat_vector_to_vortex(&mut vector, len).unwrap();
776+
let vortex_array = result.to_listview();
777+
778+
assert_eq!(vortex_array.len(), len);
779+
assert_eq!(
780+
vortex_array
781+
.list_elements_at(0)
782+
.to_primitive()
783+
.as_slice::<i32>(),
784+
&[1, 2, 3, 4]
785+
);
786+
assert_eq!(vortex_array.validity_mask(), Mask::from_indices(2, vec![0]));
787+
}
788+
789+
#[test]
790+
fn test_list_out_of_order() {
791+
// Regression test: list views can be out of order in DuckDB. The child vector length
792+
// must be computed as the maximum end offset, not just the last entry's end offset.
793+
let child_values = vec![1i32, 2, 3, 4];
794+
let len = 2;
795+
796+
let logical_type =
797+
LogicalType::list_type(LogicalType::new(DUCKDB_TYPE::DUCKDB_TYPE_INTEGER)).unwrap();
798+
let mut vector = Vector::with_capacity(logical_type, len);
799+
800+
// Populate with out-of-order list entries:
801+
// - Entry 0: offset=2, length=2 -> elements [3, 4] (end=4)
802+
// - Entry 1: offset=0, length=2 -> elements [1, 2] (end=2)
803+
unsafe {
804+
let entries = vector.as_slice_mut::<duckdb_list_entry>(len);
805+
entries[0] = duckdb_list_entry {
806+
offset: 2,
807+
length: 2,
808+
};
809+
entries[1] = duckdb_list_entry {
810+
offset: 0,
811+
length: 2,
812+
};
813+
let mut child = vector.list_vector_get_child();
814+
let slice = child.as_slice_mut::<i32>(child_values.len());
815+
slice.copy_from_slice(&child_values);
816+
}
817+
818+
// Test conversion - the old bug would compute child length as 0+2=2 instead of
819+
// max(4,2)=4.
820+
let result = flat_vector_to_vortex(&mut vector, len).unwrap();
821+
let vortex_array = result.to_listview();
822+
823+
assert_eq!(vortex_array.len(), len);
824+
assert_eq!(
825+
vortex_array
826+
.list_elements_at(0)
827+
.to_primitive()
828+
.as_slice::<i32>(),
829+
&[3, 4]
830+
);
831+
assert_eq!(
832+
vortex_array
833+
.list_elements_at(1)
834+
.to_primitive()
835+
.as_slice::<i32>(),
836+
&[1, 2]
837+
);
838+
}
839+
840+
#[test]
841+
fn test_list_null_garbage_data() {
842+
// Test that null list entries with garbage offset/size values don't cause issues.
843+
// DuckDB doesn't guarantee valid offset/size for null list views, so we must check
844+
// validity before reading the offset/size values.
845+
let child_values = vec![1i32, 2, 3, 4];
846+
let len = 3;
847+
848+
let logical_type =
849+
LogicalType::list_type(LogicalType::new(DUCKDB_TYPE::DUCKDB_TYPE_INTEGER)).unwrap();
850+
let mut vector = Vector::with_capacity(logical_type, len);
851+
852+
// Entry 0: valid, offset=0, length=2 -> elements [1, 2]
853+
// Entry 1: null with garbage values (offset=9999, length=9999)
854+
// Entry 2: valid, offset=2, length=2 -> elements [3, 4]
855+
unsafe {
856+
let entries = vector.as_slice_mut::<duckdb_list_entry>(len);
857+
entries[0] = duckdb_list_entry {
858+
offset: 0,
859+
length: 2,
860+
};
861+
// Garbage values that would cause a panic if we tried to use them.
862+
entries[1] = duckdb_list_entry {
863+
offset: 9999,
864+
length: 9999,
865+
};
866+
entries[2] = duckdb_list_entry {
867+
offset: 2,
868+
length: 2,
869+
};
870+
let mut child = vector.list_vector_get_child();
871+
let slice = child.as_slice_mut::<i32>(child_values.len());
872+
slice.copy_from_slice(&child_values);
873+
}
874+
875+
// Set entry 1 as null.
876+
let validity_slice = unsafe { vector.ensure_validity_bitslice(len) };
877+
validity_slice.set(1, false);
878+
879+
// Test conversion. The old code would compute child_min_length as 9999+9999=19998, which
880+
// would panic when trying to read that much data from the child vector.
881+
let result = flat_vector_to_vortex(&mut vector, len).unwrap();
882+
let vortex_array = result.to_listview();
883+
884+
assert_eq!(vortex_array.len(), len);
885+
886+
// Valid entries should work correctly.
887+
assert_eq!(
888+
vortex_array
889+
.list_elements_at(0)
890+
.to_primitive()
891+
.as_slice::<i32>(),
892+
&[1, 2]
893+
);
894+
assert_eq!(
895+
vortex_array
896+
.list_elements_at(2)
897+
.to_primitive()
898+
.as_slice::<i32>(),
899+
&[3, 4]
900+
);
901+
902+
// Verify the null entry has sanitized offset/size (offset=2, size=0) rather than garbage.
903+
let offsets = vortex_array.offsets().to_primitive();
904+
let sizes = vortex_array.sizes().to_primitive();
905+
assert_eq!(offsets.as_slice::<i64>()[1], 2); // Previous end (0+2).
906+
assert_eq!(sizes.as_slice::<i64>()[1], 0);
907+
908+
assert_eq!(
909+
vortex_array.validity_mask(),
910+
Mask::from_indices(3, vec![0, 2])
911+
);
912+
}
655913
}

vortex-duckdb/src/duckdb/vector.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,7 @@ pub struct ValidityRef<'a> {
303303
}
304304

305305
impl ValidityRef<'_> {
306+
#[inline]
306307
pub fn is_valid(&self, row: usize) -> bool {
307308
let Some(validity) = self.validity else {
308309
return true;

0 commit comments

Comments
 (0)