Skip to content

Commit 36af150

Browse files
committed
fix incorrect child vector size in duckdb list array import
Signed-off-by: Connor Tsui <[email protected]>
1 parent 4c65aa5 commit 36af150

File tree

2 files changed

+285
-23
lines changed

2 files changed

+285
-23
lines changed

vortex-duckdb/src/convert/vector.rs

Lines changed: 284 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,106 @@ fn vector_as_string_blob(vector: &mut Vector, len: usize, dtype: DType) -> Array
109109
builder.finish()
110110
}
111111

112+
/// Converts a valid [`duckdb_list_entry`] to `(offset, size)`, updating tracking state.
113+
///
114+
/// Updates `child_max_length` with the maximum end offset seen so far, and `previous_end` with this
115+
/// entry's end offset (for use as the offset of subsequent null entries).
116+
///
117+
/// Panics if the offset or size are negative or don't fit in the expected types.
118+
fn convert_valid_list_entry(
119+
entry: &duckdb_list_entry,
120+
child_max_length: &mut usize,
121+
previous_end: &mut i64,
122+
) -> (i64, i64) {
123+
let offset = i64::try_from(entry.offset).vortex_expect("offset must fit i64");
124+
assert!(offset >= 0, "offset must be non-negative");
125+
let size = i64::try_from(entry.length).vortex_expect("length must fit i64");
126+
assert!(size >= 0, "size must be non-negative");
127+
128+
let end = usize::try_from(offset + size)
129+
.vortex_expect("child vector length did not fit into a 32-bit `usize` type");
130+
131+
*child_max_length = (*child_max_length).max(end);
132+
*previous_end = offset + size;
133+
134+
(offset, size)
135+
}
136+
137+
/// Processes DuckDB list entries with validity to produce Vortex-compatible `ListView` offsets and
138+
/// sizes.
139+
///
140+
/// Returns `(offsets, sizes, child_max_length)` where `child_max_length` is the maximum end offset
141+
/// across all valid list entries, used to determine the child vector length coming from DuckDB.
142+
///
143+
/// Null list views in DuckDB may contain garbage offset/size values (which is different from the
144+
/// Arrow specification), so we must check validity before reading them.
145+
///
146+
/// For null entries, we set the offset to the previous list's end and size to 0 so that:
147+
/// 1. We don't accidentally read garbage data from the child vector.
148+
/// 2. The null entries remain in (mostly) sorted offset order, which can potentially simplify
149+
/// downstream operations like converting `ListView` to `List`.
150+
fn process_duckdb_lists(
151+
entries: &[duckdb_list_entry],
152+
validity: &Validity,
153+
) -> (Buffer<i64>, Buffer<i64>, usize) {
154+
let len = entries.len();
155+
let mut offsets = BufferMut::with_capacity(len);
156+
let mut sizes = BufferMut::with_capacity(len);
157+
158+
match validity {
159+
Validity::NonNullable | Validity::AllValid => {
160+
// All entries are valid, so there is no need to check the validity.
161+
let mut child_max_length = 0;
162+
let mut previous_end = 0;
163+
for entry in entries {
164+
let (offset, size) =
165+
convert_valid_list_entry(entry, &mut child_max_length, &mut previous_end);
166+
167+
// SAFETY: We allocated enough capacity above.
168+
unsafe {
169+
offsets.push_unchecked(offset);
170+
sizes.push_unchecked(size);
171+
}
172+
}
173+
(offsets.freeze(), sizes.freeze(), child_max_length)
174+
}
175+
Validity::AllInvalid => {
176+
// All entries are null, so we can just set offset=0 and size=0.
177+
// SAFETY: We allocated enough capacity above.
178+
unsafe {
179+
offsets.push_n_unchecked(0, len);
180+
sizes.push_n_unchecked(0, len);
181+
}
182+
(offsets.freeze(), sizes.freeze(), 0)
183+
}
184+
Validity::Array(_) => {
185+
// We have some number of nulls, so make sure to check validity before updating info.
186+
let mask = validity.to_mask(len);
187+
let child_max_length = mask.iter_bools(|validity_iter| {
188+
let mut child_max_length = 0;
189+
let mut previous_end = 0;
190+
191+
for (entry, is_valid) in entries.iter().zip(validity_iter) {
192+
let (offset, size) = if is_valid {
193+
convert_valid_list_entry(entry, &mut child_max_length, &mut previous_end)
194+
} else {
195+
(previous_end, 0)
196+
};
197+
198+
// SAFETY: We allocated enough capacity above.
199+
unsafe {
200+
offsets.push_unchecked(offset);
201+
sizes.push_unchecked(size);
202+
}
203+
}
204+
205+
child_max_length
206+
});
207+
(offsets.freeze(), sizes.freeze(), child_max_length)
208+
}
209+
}
210+
}
211+
112212
/// Converts flat vector to a vortex array
113213
pub fn flat_vector_to_vortex(vector: &mut Vector, len: usize) -> VortexResult<ArrayRef> {
114214
let type_id = vector.logical_type().as_type_id();
@@ -221,33 +321,18 @@ pub fn flat_vector_to_vortex(vector: &mut Vector, len: usize) -> VortexResult<Ar
221321
.map(|a| a.into_array())
222322
}
223323
DUCKDB_TYPE::DUCKDB_TYPE_LIST => {
224-
let mut offsets = BufferMut::with_capacity(len);
225-
let mut lengths = BufferMut::with_capacity(len);
226-
for duckdb_list_entry { offset, length } in
227-
vector.as_slice_with_len::<duckdb_list_entry>(len)
228-
{
229-
unsafe {
230-
offsets.push_unchecked(
231-
i64::try_from(*offset).vortex_expect("offset must fit i64"),
232-
);
233-
lengths.push_unchecked(
234-
i64::try_from(*length).vortex_expect("length must fit i64"),
235-
);
236-
}
237-
}
238-
let offsets = offsets.freeze();
239-
let lengths = lengths.freeze();
240-
let child_data = flat_vector_to_vortex(
241-
&mut vector.list_vector_get_child(),
242-
usize::try_from(offsets[len - 1] + lengths[len - 1])
243-
.vortex_expect("last offset and length sum must fit in usize "),
244-
)?;
324+
let validity = vector.validity_ref(len).to_validity();
325+
let entries = vector.as_slice_with_len::<duckdb_list_entry>(len);
326+
327+
let (offsets, sizes, child_max_length) = process_duckdb_lists(entries, &validity);
328+
let child_data =
329+
flat_vector_to_vortex(&mut vector.list_vector_get_child(), child_max_length)?;
245330

246331
ListViewArray::try_new(
247332
child_data,
248333
offsets.into_array(),
249-
lengths.into_array(),
250-
vector.validity_ref(len).to_validity(),
334+
sizes.into_array(),
335+
validity,
251336
)
252337
.map(|a| a.into_array())
253338
}
@@ -651,4 +736,180 @@ mod tests {
651736
&[5, 6, 7, 8]
652737
);
653738
}
739+
740+
#[test]
741+
fn test_list_with_trailing_null() {
742+
// Regression test: when the last list entry is null, its offset/length may be 0/0,
743+
// so we can't use the last entry to compute child vector length.
744+
let child_values = vec![1i32, 2, 3, 4];
745+
let len = 2;
746+
747+
let logical_type =
748+
LogicalType::list_type(LogicalType::new(DUCKDB_TYPE::DUCKDB_TYPE_INTEGER))
749+
.vortex_unwrap();
750+
let mut vector = Vector::with_capacity(logical_type, len);
751+
752+
// Entry 0: offset=0, length=4 -> all elements (end=4)
753+
// Entry 1: null, offset=0, length=0 (end=0)
754+
unsafe {
755+
let entries = vector.as_slice_mut::<duckdb_list_entry>(len);
756+
entries[0] = duckdb_list_entry {
757+
offset: 0,
758+
length: child_values.len() as u64,
759+
};
760+
entries[1] = duckdb_list_entry {
761+
offset: 0,
762+
length: 0,
763+
};
764+
let mut child = vector.list_vector_get_child();
765+
let slice = child.as_slice_mut::<i32>(child_values.len());
766+
slice.copy_from_slice(&child_values);
767+
}
768+
769+
// Set the second entry as null.
770+
let validity_slice = unsafe { vector.ensure_validity_bitslice(len) };
771+
validity_slice.set(1, false);
772+
773+
// Test conversion - the old bug would compute child length as 0+0=0 instead of
774+
// max(4,0)=4.
775+
let result = flat_vector_to_vortex(&mut vector, len).unwrap();
776+
let vortex_array = result.to_listview();
777+
778+
assert_eq!(vortex_array.len(), len);
779+
assert_eq!(
780+
vortex_array
781+
.list_elements_at(0)
782+
.to_primitive()
783+
.as_slice::<i32>(),
784+
&[1, 2, 3, 4]
785+
);
786+
assert_eq!(vortex_array.validity_mask(), Mask::from_indices(2, vec![0]));
787+
}
788+
789+
#[test]
790+
fn test_list_out_of_order() {
791+
// Regression test: list views can be out of order in DuckDB. The child vector length
792+
// must be computed as the maximum end offset, not just the last entry's end offset.
793+
let child_values = vec![1i32, 2, 3, 4];
794+
let len = 2;
795+
796+
let logical_type =
797+
LogicalType::list_type(LogicalType::new(DUCKDB_TYPE::DUCKDB_TYPE_INTEGER))
798+
.vortex_unwrap();
799+
let mut vector = Vector::with_capacity(logical_type, len);
800+
801+
// Populate with out-of-order list entries:
802+
// - Entry 0: offset=2, length=2 -> elements [3, 4] (end=4)
803+
// - Entry 1: offset=0, length=2 -> elements [1, 2] (end=2)
804+
unsafe {
805+
let entries = vector.as_slice_mut::<duckdb_list_entry>(len);
806+
entries[0] = duckdb_list_entry {
807+
offset: 2,
808+
length: 2,
809+
};
810+
entries[1] = duckdb_list_entry {
811+
offset: 0,
812+
length: 2,
813+
};
814+
let mut child = vector.list_vector_get_child();
815+
let slice = child.as_slice_mut::<i32>(child_values.len());
816+
slice.copy_from_slice(&child_values);
817+
}
818+
819+
// Test conversion - the old bug would compute child length as 0+2=2 instead of
820+
// max(4,2)=4.
821+
let result = flat_vector_to_vortex(&mut vector, len).unwrap();
822+
let vortex_array = result.to_listview();
823+
824+
assert_eq!(vortex_array.len(), len);
825+
assert_eq!(
826+
vortex_array
827+
.list_elements_at(0)
828+
.to_primitive()
829+
.as_slice::<i32>(),
830+
&[3, 4]
831+
);
832+
assert_eq!(
833+
vortex_array
834+
.list_elements_at(1)
835+
.to_primitive()
836+
.as_slice::<i32>(),
837+
&[1, 2]
838+
);
839+
}
840+
841+
#[test]
842+
fn test_list_null_garbage_data() {
843+
// Test that null list entries with garbage offset/size values don't cause issues.
844+
// DuckDB doesn't guarantee valid offset/size for null list views, so we must check
845+
// validity before reading the offset/size values.
846+
let child_values = vec![1i32, 2, 3, 4];
847+
let len = 3;
848+
849+
let logical_type =
850+
LogicalType::list_type(LogicalType::new(DUCKDB_TYPE::DUCKDB_TYPE_INTEGER))
851+
.vortex_unwrap();
852+
let mut vector = Vector::with_capacity(logical_type, len);
853+
854+
// Entry 0: valid, offset=0, length=2 -> elements [1, 2]
855+
// Entry 1: null with garbage values (offset=9999, length=9999)
856+
// Entry 2: valid, offset=2, length=2 -> elements [3, 4]
857+
unsafe {
858+
let entries = vector.as_slice_mut::<duckdb_list_entry>(len);
859+
entries[0] = duckdb_list_entry {
860+
offset: 0,
861+
length: 2,
862+
};
863+
// Garbage values that would cause a panic if we tried to use them.
864+
entries[1] = duckdb_list_entry {
865+
offset: 9999,
866+
length: 9999,
867+
};
868+
entries[2] = duckdb_list_entry {
869+
offset: 2,
870+
length: 2,
871+
};
872+
let mut child = vector.list_vector_get_child();
873+
let slice = child.as_slice_mut::<i32>(child_values.len());
874+
slice.copy_from_slice(&child_values);
875+
}
876+
877+
// Set entry 1 as null.
878+
let validity_slice = unsafe { vector.ensure_validity_bitslice(len) };
879+
validity_slice.set(1, false);
880+
881+
// Test conversion. The old code would compute child_max_length as 9999+9999=19998, which
882+
// would panic when trying to read that much data from the child vector.
883+
let result = flat_vector_to_vortex(&mut vector, len).unwrap();
884+
let vortex_array = result.to_listview();
885+
886+
assert_eq!(vortex_array.len(), len);
887+
888+
// Valid entries should work correctly.
889+
assert_eq!(
890+
vortex_array
891+
.list_elements_at(0)
892+
.to_primitive()
893+
.as_slice::<i32>(),
894+
&[1, 2]
895+
);
896+
assert_eq!(
897+
vortex_array
898+
.list_elements_at(2)
899+
.to_primitive()
900+
.as_slice::<i32>(),
901+
&[3, 4]
902+
);
903+
904+
// Verify the null entry has sanitized offset/size (offset=2, size=0) rather than garbage.
905+
let offsets = vortex_array.offsets().to_primitive();
906+
let sizes = vortex_array.sizes().to_primitive();
907+
assert_eq!(offsets.as_slice::<i64>()[1], 2); // Previous end (0+2).
908+
assert_eq!(sizes.as_slice::<i64>()[1], 0);
909+
910+
assert_eq!(
911+
vortex_array.validity_mask(),
912+
Mask::from_indices(3, vec![0, 2])
913+
);
914+
}
654915
}

vortex-duckdb/src/duckdb/vector.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,7 @@ pub struct ValidityRef<'a> {
303303
}
304304

305305
impl ValidityRef<'_> {
306+
#[inline]
306307
pub fn is_valid(&self, row: usize) -> bool {
307308
let Some(validity) = self.validity else {
308309
return true;

0 commit comments

Comments
 (0)