Skip to content

Commit 9159495

Browse files
authored
Fix: support widening of ListViewVector offsets and sizes (#5796)
Since the list view "logical" type does not include the sizes and offsets as part of its type, we should allow resizing. Signed-off-by: Connor Tsui <[email protected]>
1 parent 8113798 commit 9159495

File tree

11 files changed

+385
-85
lines changed

11 files changed

+385
-85
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vortex-array/src/arrays/list/vtable/kernel/filter.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ impl ExecuteParentKernel<ListVTable> for ListFilterKernel {
6363
let mut vec = ListViewVectorMut::with_capacity(
6464
array.elements().dtype(),
6565
selection.true_count(),
66+
0,
6667
);
6768
vec.append_nulls(selection.true_count());
6869
return Ok(Some(vec.freeze().into()));

vortex-compute/src/cast/pvector.rs

Lines changed: 2 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,12 @@
22
// SPDX-FileCopyrightText: Copyright the Vortex contributors
33

44
use num_traits::NumCast;
5-
use vortex_buffer::Buffer;
6-
use vortex_buffer::BufferMut;
75
use vortex_dtype::DType;
86
use vortex_dtype::NativePType;
97
use vortex_dtype::match_each_native_ptype;
108
use vortex_error::VortexResult;
119
use vortex_error::vortex_bail;
1210
use vortex_error::vortex_err;
13-
use vortex_mask::AllOr;
14-
use vortex_mask::Mask;
1511
use vortex_vector::Scalar;
1612
use vortex_vector::ScalarOps;
1713
use vortex_vector::Vector;
@@ -20,6 +16,7 @@ use vortex_vector::primitive::PScalar;
2016
use vortex_vector::primitive::PVector;
2117
use vortex_vector::primitive::PrimitiveScalar;
2218
use vortex_vector::primitive::PrimitiveVector;
19+
use vortex_vector::primitive::cast;
2320

2421
use crate::cast::Cast;
2522
use crate::cast::try_cast_scalar_common;
@@ -44,7 +41,7 @@ impl<T: NativePType> Cast for PVector<T> {
4441
// We can possibly convert to the target `PType` and we have compatible nullability.
4542
DType::Primitive(target_ptype, n) if n.is_nullable() || self.validity().all_true() => {
4643
match_each_native_ptype!(*target_ptype, |Dst| {
47-
let result = cast_pvector::<T, Dst>(self)?;
44+
let result = cast::cast_pvector::<T, Dst>(self)?;
4845
Ok(PrimitiveVector::from(result).into())
4946
})
5047
}
@@ -55,48 +52,6 @@ impl<T: NativePType> Cast for PVector<T> {
5552
}
5653
}
5754

58-
/// Cast a [`PVector<F>`] to a [`PVector<T>`] by converting each element.
59-
///
60-
/// Returns an error if any valid element cannot be converted (e.g., overflow).
61-
fn cast_pvector<Src: NativePType, Dst: NativePType>(
62-
src: &PVector<Src>,
63-
) -> VortexResult<PVector<Dst>> {
64-
let elements: &[Src] = src.as_ref();
65-
match src.validity().bit_buffer() {
66-
AllOr::All => {
67-
let mut buffer = BufferMut::with_capacity(elements.len());
68-
for &item in elements {
69-
let converted = <Dst as NumCast>::from(item).ok_or_else(
70-
|| vortex_err!(ComputeError: "Failed to cast {} to {:?}", item, Dst::PTYPE),
71-
)?;
72-
// SAFETY: We pre-allocated the required capacity.
73-
unsafe { buffer.push_unchecked(converted) }
74-
}
75-
Ok(PVector::from(buffer.freeze()))
76-
}
77-
AllOr::None => Ok(PVector::new(
78-
Buffer::zeroed(elements.len()),
79-
Mask::new_false(elements.len()),
80-
)),
81-
AllOr::Some(bit_buffer) => {
82-
let mut buffer = BufferMut::with_capacity(elements.len());
83-
for (&item, valid) in elements.iter().zip(bit_buffer.iter()) {
84-
if valid {
85-
let converted = <Dst as NumCast>::from(item).ok_or_else(
86-
|| vortex_err!(ComputeError: "Failed to cast {} to {:?}", item, Dst::PTYPE),
87-
)?;
88-
// SAFETY: We pre-allocated the required capacity.
89-
unsafe { buffer.push_unchecked(converted) }
90-
} else {
91-
// SAFETY: We pre-allocated the required capacity.
92-
unsafe { buffer.push_unchecked(Dst::default()) }
93-
}
94-
}
95-
Ok(PVector::new(buffer.freeze(), src.validity().clone()))
96-
}
97-
}
98-
}
99-
10055
impl<T: NativePType> Cast for PScalar<T> {
10156
type Output = Scalar;
10257

vortex-dtype/src/ptype.rs

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -767,6 +767,56 @@ impl PType {
767767
}
768768
}
769769
}
770+
771+
/// Returns the minimum unsigned integer [`PType`] that can represent the given value.
772+
#[inline]
773+
pub const fn min_unsigned_ptype_for_value(value: u64) -> Self {
774+
if value <= u8::MAX as u64 {
775+
Self::U8
776+
} else if value <= u16::MAX as u64 {
777+
Self::U16
778+
} else if value <= u32::MAX as u64 {
779+
Self::U32
780+
} else {
781+
Self::U64
782+
}
783+
}
784+
785+
/// Returns the minimum signed integer [`PType`] that can represent the given value.
786+
#[inline]
787+
pub const fn min_signed_ptype_for_value(value: i64) -> Self {
788+
if value >= i8::MIN as i64 && value <= i8::MAX as i64 {
789+
Self::I8
790+
} else if value >= i16::MIN as i64 && value <= i16::MAX as i64 {
791+
Self::I16
792+
} else if value >= i32::MIN as i64 && value <= i32::MAX as i64 {
793+
Self::I32
794+
} else {
795+
Self::I64
796+
}
797+
}
798+
799+
/// Returns the wider of two unsigned integer [`PType`]s based on byte width.
800+
#[inline]
801+
pub const fn max_unsigned_ptype(self, other: Self) -> Self {
802+
debug_assert!(self.is_unsigned_int() && other.is_unsigned_int());
803+
if self.byte_width() >= other.byte_width() {
804+
self
805+
} else {
806+
other
807+
}
808+
}
809+
810+
/// Returns the wider of two signed integer [`PType`]s based on byte width.
811+
#[inline]
812+
pub const fn max_signed_ptype(self, other: Self) -> Self {
813+
debug_assert!(self.is_signed_int() && other.is_signed_int());
814+
if self.byte_width() >= other.byte_width() {
815+
self
816+
} else {
817+
other
818+
}
819+
}
770820
}
771821

772822
impl Display for PType {

vortex-scalar/src/vectors.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ impl Scalar {
7272
let lscalar = self.as_list();
7373
match lscalar.elements() {
7474
None => {
75-
let mut list_view = ListViewVectorMut::with_capacity(elems_dtype, 1);
75+
let mut list_view = ListViewVectorMut::with_capacity(elems_dtype, 1, 0);
7676
list_view.append_nulls(1);
7777
ListViewScalar::new(list_view.freeze()).into()
7878
}

vortex-vector/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,5 +25,6 @@ vortex-dtype = { workspace = true }
2525
vortex-error = { workspace = true }
2626
vortex-mask = { workspace = true }
2727

28+
num-traits = { workspace = true }
2829
paste = { workspace = true }
2930
static_assertions = { workspace = true }

vortex-vector/src/listview/tests.rs

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -412,3 +412,66 @@ fn test_try_into_mut() {
412412
let original = mut_result2.unwrap_err();
413413
assert_eq!(original.len(), 2);
414414
}
415+
416+
#[test]
417+
fn test_extend_upcasts_offset_and_size_types() {
418+
use vortex_dtype::PType;
419+
420+
// Create first list with u8 offsets/sizes and u8::MAX elements
421+
// This forces upcasting when extended with larger types
422+
let elements1 = PVectorMut::from_iter((0..u8::MAX as i32).map(Some));
423+
let offsets1 = PVectorMut::from_iter([0u8]).into();
424+
let sizes1 = PVectorMut::from_iter([u8::MAX]).into();
425+
let validity1 = MaskMut::new_true(1);
426+
427+
let mut list = ListViewVectorMut::new(Box::new(elements1.into()), offsets1, sizes1, validity1);
428+
429+
// Verify initial types
430+
assert_eq!(list.offsets().ptype(), PType::U8);
431+
assert_eq!(list.sizes().ptype(), PType::U8);
432+
433+
// Create second list with u32 offsets and u16 sizes, with u16::MAX elements
434+
let elements2: PrimitiveVector = PVectorMut::from_iter((0..u16::MAX as i32).map(Some))
435+
.freeze()
436+
.into();
437+
let offsets2: PrimitiveVector = PVectorMut::from_iter([0u32]).freeze().into();
438+
let sizes2: PrimitiveVector = PVectorMut::from_iter([u16::MAX]).freeze().into();
439+
let validity2 = Mask::new_true(1);
440+
441+
let list2 = ListViewVector::new(Arc::new(elements2.into()), offsets2, sizes2, validity2);
442+
443+
// Extend - this should upcast offsets to u32 and sizes to u16
444+
list.extend_from_vector(&list2);
445+
446+
// Verify types were upcasted
447+
assert_eq!(list.offsets().ptype(), PType::U32);
448+
assert_eq!(list.sizes().ptype(), PType::U16);
449+
450+
// Verify lengths
451+
assert_eq!(list.len(), 2);
452+
453+
let frozen = list.freeze();
454+
455+
// Check that first list's offset is still 0 and size is u8::MAX
456+
let offsets = frozen.offsets();
457+
let sizes = frozen.sizes();
458+
459+
let (o0, o1) = match offsets {
460+
PrimitiveVector::U32(pvec) => (*pvec.get(0).unwrap(), *pvec.get(1).unwrap()),
461+
_ => panic!("Expected U32 offsets"),
462+
};
463+
let (s0, s1) = match sizes {
464+
PrimitiveVector::U16(pvec) => (*pvec.get(0).unwrap(), *pvec.get(1).unwrap()),
465+
_ => panic!("Expected U16 sizes"),
466+
};
467+
468+
assert_eq!(o0, 0);
469+
assert_eq!(s0, u8::MAX as u16);
470+
// Second list's offset should be adjusted by first list's element count (u8::MAX)
471+
assert_eq!(o1, u8::MAX as u32);
472+
assert_eq!(s1, u16::MAX);
473+
474+
// Verify element count
475+
let total_elements = (u8::MAX as usize) + (u16::MAX as usize);
476+
assert_eq!(frozen.elements().len(), total_elements);
477+
}

vortex-vector/src/listview/vector_mut.rs

Lines changed: 59 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
66
use std::sync::Arc;
77

8+
use num_traits::NumCast;
89
use vortex_dtype::DType;
910
use vortex_dtype::PType;
1011
use vortex_error::VortexExpect;
@@ -184,12 +185,17 @@ impl ListViewVectorMut {
184185
}
185186

186187
/// Creates a new [`ListViewVectorMut`] with the specified capacity.
187-
pub fn with_capacity(element_dtype: &DType, capacity: usize) -> Self {
188+
pub fn with_capacity(element_dtype: &DType, capacity: usize, elements_capacity: usize) -> Self {
189+
let offsets_ptype = PType::min_unsigned_ptype_for_value(elements_capacity as u64);
190+
let sizes_ptype = offsets_ptype;
191+
192+
// SAFETY: Everything is empty and the offsets and sizes `PType` is the same, so all
193+
// invariants are satisfied.
188194
unsafe {
189195
Self::new_unchecked(
190196
Box::new(VectorMut::with_capacity(element_dtype, 0)),
191-
PrimitiveVectorMut::with_capacity(PType::U64, capacity),
192-
PrimitiveVectorMut::with_capacity(PType::U32, capacity),
197+
PrimitiveVectorMut::with_capacity(offsets_ptype, capacity),
198+
PrimitiveVectorMut::with_capacity(sizes_ptype, capacity),
193199
MaskMut::with_capacity(capacity),
194200
)
195201
}
@@ -300,27 +306,23 @@ impl VectorMutOps for ListViewVectorMut {
300306
self.len = self.validity.len();
301307
}
302308

303-
/// This will also panic if we try to extend the `ListViewVector` beyond the maximum offset
304-
/// representable by the type of the `offsets` primitive vector.
305309
fn extend_from_vector(&mut self, other: &ListViewVector) {
306310
// Extend the elements with the other's elements.
307311
let old_elements_len = self.elements.len() as u64;
308312
self.elements.extend_from_vector(&other.elements);
309313
let new_elements_len = self.elements.len() as u64;
310314

311-
// Then extend the sizes with the other's sizes (these do not need any adjustment).
312-
self.sizes.extend_from_vector(&other.sizes);
315+
// Extend sizes with automatic upcasting (does not panic on type mismatch).
316+
self.sizes.extend_from_vector_with_upcast(&other.sizes);
313317

314-
// We need this assertion to ensure that the casts below are infallible.
315-
assert!(
316-
new_elements_len < self.offsets.ptype().max_value_as_u64(),
317-
"the elements length {new_elements_len} is not representable by the offsets type {}",
318-
self.offsets.ptype()
318+
// Extend offsets with adjustment and automatic upcasting based on `new_elements_len`.
319+
adjust_and_extend_offsets(
320+
&mut self.offsets,
321+
&other.offsets,
322+
old_elements_len,
323+
new_elements_len,
319324
);
320325

321-
// Finally, extend the offsets after adding the old `elements` length to each.
322-
adjust_and_extend_offsets(&mut self.offsets, &other.offsets, old_elements_len);
323-
324326
self.validity.append_mask(&other.validity);
325327
self.len += other.len;
326328
debug_assert_eq!(self.len, self.validity.len());
@@ -505,39 +507,60 @@ fn validate_views_bound(
505507
Ok(())
506508
}
507509

508-
// TODO(connor): It would be better to separate everything inside the macros into its own function,
509-
// but that would require adding another macro that sets a type `$type` to be used by the caller.
510-
/// Checks that all views are `<= elements_len`.
510+
/// Adjusts and extends offsets from `new_offsets` into `curr_offsets`, upcasting if needed.
511+
///
512+
/// Each offset from `new_offsets` is adjusted by adding `old_elements_len` before appending.
513+
///
514+
/// If the resulting offsets would exceed the current offset type's capacity, the offset vector is
515+
/// automatically upcasted to a wider type.
511516
#[expect(
512517
clippy::cognitive_complexity,
513518
reason = "complexity from nested match_each_* macros"
514519
)]
515520
fn adjust_and_extend_offsets(
516-
our_offsets: &mut PrimitiveVectorMut,
517-
other: &PrimitiveVector,
521+
curr_offsets: &mut PrimitiveVectorMut,
522+
new_offsets: &PrimitiveVector,
518523
old_elements_len: u64,
524+
new_elements_len: u64,
519525
) {
520-
our_offsets.reserve(other.len());
526+
// Make sure we use the correct width to fit all offsets.
527+
let target_ptype = PType::min_unsigned_ptype_for_value(new_elements_len)
528+
.max_unsigned_ptype(curr_offsets.ptype())
529+
.max_unsigned_ptype(new_offsets.ptype());
530+
531+
if curr_offsets.ptype() != target_ptype {
532+
let old_offsets = std::mem::replace(
533+
curr_offsets,
534+
PrimitiveVectorMut::with_capacity(target_ptype, 0),
535+
);
536+
*curr_offsets = old_offsets.upcast(target_ptype);
537+
}
521538

522-
// Adjust each offset from `other` by adding the current elements length to each of the
539+
curr_offsets.reserve(new_offsets.len());
540+
541+
// Adjust each offset from `new_offsets` by adding the current elements length to each of the
523542
// incoming offsets.
524-
match_each_integer_pvector_mut!(our_offsets, |self_offsets| {
525-
match_each_integer_pvector!(other, |other_offsets| {
526-
let other_offsets_slice = other_offsets.as_ref();
527-
528-
// Append each offset from `other`, adjusted by the elements_offset.
529-
for i in 0..other.len() {
530-
// All offset types are representable via a `u64` since we also ensure offsets
531-
// are always non-negative.
543+
match_each_integer_pvector_mut!(curr_offsets, |curr| {
544+
match_each_integer_pvector!(new_offsets, |new| {
545+
let new_offsets_slice = new.as_ref();
546+
547+
// Append each offset from `new_offsets`, adjusted by the elements_offset.
548+
for i in 0..new.len() {
549+
// All offset types are representable via a `u64` since we also ensure offsets are
550+
// always non-negative.
532551
#[allow(clippy::unnecessary_cast)]
533-
let adjusted_offset = other_offsets_slice[i] as u64 + old_elements_len;
552+
let adjusted_offset = new_offsets_slice[i] as u64 + old_elements_len;
553+
debug_assert!(
554+
adjusted_offset < new_elements_len,
555+
"new list view offset is somehow out of bounds, something has gone wrong"
556+
);
534557

535-
// SAFETY: We just reserved capacity for `other.len()` elements above, and we
536-
// also know the cast is fine because we verified above that the maximum
537-
// possible offset is representable by the offset type.
538-
#[allow(clippy::cast_possible_truncation)]
558+
let converted = NumCast::from(adjusted_offset)
559+
.vortex_expect("offset conversion should succeed after upcast");
560+
561+
// SAFETY: We reserved capacity for `new_offsets.len()` elements above.
539562
unsafe {
540-
self_offsets.push_unchecked(adjusted_offset as _);
563+
curr.push_unchecked(converted);
541564
}
542565
}
543566
});

0 commit comments

Comments
 (0)