vortex-data
diff --git a/‎Cargo.lock‎
Lines changed: 1 addition & 0 deletions b/‎Cargo.lock‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎vortex-array/src/arrays/list/vtable/kernel/filter.rs‎
Lines changed: 1 addition & 0 deletions b/‎vortex-array/src/arrays/list/vtable/kernel/filter.rs‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎vortex-compute/src/cast/pvector.rs‎
Lines changed: 2 additions & 47 deletions b/‎vortex-compute/src/cast/pvector.rs‎
Lines changed: 2 additions & 47 deletions
diff --git a/‎vortex-dtype/src/ptype.rs‎
Lines changed: 50 additions & 0 deletions b/‎vortex-dtype/src/ptype.rs‎
Lines changed: 50 additions & 0 deletions
diff --git a/‎vortex-scalar/src/vectors.rs‎
Lines changed: 1 addition & 1 deletion b/‎vortex-scalar/src/vectors.rs‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎vortex-vector/Cargo.toml‎
Lines changed: 1 addition & 0 deletions b/‎vortex-vector/Cargo.toml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎vortex-vector/src/listview/tests.rs‎
Lines changed: 63 additions & 0 deletions b/‎vortex-vector/src/listview/tests.rs‎
Lines changed: 63 additions & 0 deletions
diff --git a/‎vortex-vector/src/listview/vector_mut.rs‎
Lines changed: 59 additions & 36 deletions b/‎vortex-vector/src/listview/vector_mut.rs‎
Lines changed: 59 additions & 36 deletions
@@ -63,6 +63,7 @@ impl ExecuteParentKernel<ListVTable> for ListFilterKernel {
                 let mut vec = ListViewVectorMut::with_capacity(
                     array.elements().dtype(),
                     selection.true_count(),
+                    0,
                 );
                 vec.append_nulls(selection.true_count());
                 return Ok(Some(vec.freeze().into()));
 
@@ -2,16 +2,12 @@
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
 use num_traits::NumCast;
-use vortex_buffer::Buffer;
-use vortex_buffer::BufferMut;
 use vortex_dtype::DType;
 use vortex_dtype::NativePType;
 use vortex_dtype::match_each_native_ptype;
 use vortex_error::VortexResult;
 use vortex_error::vortex_bail;
 use vortex_error::vortex_err;
-use vortex_mask::AllOr;
-use vortex_mask::Mask;
 use vortex_vector::Scalar;
 use vortex_vector::ScalarOps;
 use vortex_vector::Vector;
@@ -20,6 +16,7 @@ use vortex_vector::primitive::PScalar;
 use vortex_vector::primitive::PVector;
 use vortex_vector::primitive::PrimitiveScalar;
 use vortex_vector::primitive::PrimitiveVector;
+use vortex_vector::primitive::cast;
 
 use crate::cast::Cast;
 use crate::cast::try_cast_scalar_common;
@@ -44,7 +41,7 @@ impl<T: NativePType> Cast for PVector<T> {
             // We can possibly convert to the target `PType` and we have compatible nullability.
             DType::Primitive(target_ptype, n) if n.is_nullable() || self.validity().all_true() => {
                 match_each_native_ptype!(*target_ptype, |Dst| {
-                    let result = cast_pvector::<T, Dst>(self)?;
+                    let result = cast::cast_pvector::<T, Dst>(self)?;
                     Ok(PrimitiveVector::from(result).into())
                 })
             }
@@ -55,48 +52,6 @@ impl<T: NativePType> Cast for PVector<T> {
     }
 }
 
-/// Cast a [`PVector<F>`] to a [`PVector<T>`] by converting each element.
-///
-/// Returns an error if any valid element cannot be converted (e.g., overflow).
-fn cast_pvector<Src: NativePType, Dst: NativePType>(
-    src: &PVector<Src>,
-) -> VortexResult<PVector<Dst>> {
-    let elements: &[Src] = src.as_ref();
-    match src.validity().bit_buffer() {
-        AllOr::All => {
-            let mut buffer = BufferMut::with_capacity(elements.len());
-            for &item in elements {
-                let converted = <Dst as NumCast>::from(item).ok_or_else(
-                    || vortex_err!(ComputeError: "Failed to cast {} to {:?}", item, Dst::PTYPE),
-                )?;
-                // SAFETY: We pre-allocated the required capacity.
-                unsafe { buffer.push_unchecked(converted) }
-            }
-            Ok(PVector::from(buffer.freeze()))
-        }
-        AllOr::None => Ok(PVector::new(
-            Buffer::zeroed(elements.len()),
-            Mask::new_false(elements.len()),
-        )),
-        AllOr::Some(bit_buffer) => {
-            let mut buffer = BufferMut::with_capacity(elements.len());
-            for (&item, valid) in elements.iter().zip(bit_buffer.iter()) {
-                if valid {
-                    let converted = <Dst as NumCast>::from(item).ok_or_else(
-                        || vortex_err!(ComputeError: "Failed to cast {} to {:?}", item, Dst::PTYPE),
-                    )?;
-                    // SAFETY: We pre-allocated the required capacity.
-                    unsafe { buffer.push_unchecked(converted) }
-                } else {
-                    // SAFETY: We pre-allocated the required capacity.
-                    unsafe { buffer.push_unchecked(Dst::default()) }
-                }
-            }
-            Ok(PVector::new(buffer.freeze(), src.validity().clone()))
-        }
-    }
-}
-
 impl<T: NativePType> Cast for PScalar<T> {
     type Output = Scalar;
 
 
@@ -767,6 +767,56 @@ impl PType {
             }
         }
     }
+
+    /// Picks the narrowest unsigned integer [`PType`] whose range contains `value`.
+    #[inline]
+    pub const fn min_unsigned_ptype_for_value(value: u64) -> Self {
+        // Count the significant bits of `value` (zero has none), then bucket that count by
+        // the bit widths of the unsigned integer types.
+        let bits_needed = u64::BITS - value.leading_zeros();
+        match bits_needed {
+            0..=8 => Self::U8,
+            9..=16 => Self::U16,
+            17..=32 => Self::U32,
+            _ => Self::U64,
+        }
+    }
+
+    /// Picks the narrowest signed integer [`PType`] whose range contains `value`.
+    ///
+    /// Both the negative and the positive bound are checked, so `-129` selects `I16`.
+    #[inline]
+    pub const fn min_signed_ptype_for_value(value: i64) -> Self {
+        // Arms run narrowest-first; each literal range is `MIN..=MAX` of that integer type.
+        match value {
+            -128..=127 => Self::I8,
+            -32_768..=32_767 => Self::I16,
+            -2_147_483_648..=2_147_483_647 => Self::I32,
+            _ => Self::I64,
+        }
+    }
+
+    /// Of two unsigned integer [`PType`]s, returns the one with the larger byte width.
+    #[inline]
+    pub const fn max_unsigned_ptype(self, other: Self) -> Self {
+        // Only debug builds reject non-unsigned inputs; release builds compare widths as-is.
+        debug_assert!(self.is_unsigned_int() && other.is_unsigned_int());
+        if other.byte_width() > self.byte_width() {
+            return other;
+        }
+        self // Equal widths keep `self`.
+    }
+
+    /// Of two signed integer [`PType`]s, returns the one with the larger byte width.
+    #[inline]
+    pub const fn max_signed_ptype(self, other: Self) -> Self {
+        // Only debug builds reject non-signed inputs; release builds compare widths as-is.
+        debug_assert!(self.is_signed_int() && other.is_signed_int());
+        if other.byte_width() > self.byte_width() {
+            return other;
+        }
+        self // Equal widths keep `self`.
+    }
 }
 
 impl Display for PType {
 
@@ -72,7 +72,7 @@ impl Scalar {
                 let lscalar = self.as_list();
                 match lscalar.elements() {
                     None => {
-                        let mut list_view = ListViewVectorMut::with_capacity(elems_dtype, 1);
+                        let mut list_view = ListViewVectorMut::with_capacity(elems_dtype, 1, 0);
                         list_view.append_nulls(1);
                         ListViewScalar::new(list_view.freeze()).into()
                     }
 
@@ -25,5 +25,6 @@ vortex-dtype = { workspace = true }
 vortex-error = { workspace = true }
 vortex-mask = { workspace = true }
 
+num-traits = { workspace = true }
 paste = { workspace = true }
 static_assertions = { workspace = true }
@@ -412,3 +412,66 @@ fn test_try_into_mut() {
     let original = mut_result2.unwrap_err();
     assert_eq!(original.len(), 2);
 }
+
+#[test]
+fn test_extend_upcasts_offset_and_size_types() {
+    use vortex_dtype::PType;
+
+    // First list vector: one list spanning `u8::MAX` elements, stored with u8 offsets and sizes.
+    // Extending it below with wider offset/size types is what should trigger the upcast.
+    let elements1 = PVectorMut::from_iter((0..u8::MAX as i32).map(Some));
+    let offsets1 = PVectorMut::from_iter([0u8]).into();
+    let sizes1 = PVectorMut::from_iter([u8::MAX]).into();
+    let validity1 = MaskMut::new_true(1);
+
+    let mut list = ListViewVectorMut::new(Box::new(elements1.into()), offsets1, sizes1, validity1);
+
+    // Sanity check: offsets and sizes both start out as U8.
+    assert_eq!(list.offsets().ptype(), PType::U8);
+    assert_eq!(list.sizes().ptype(), PType::U8);
+
+    // Second list vector: one list spanning `u16::MAX` elements, with u32 offsets and u16 sizes.
+    let elements2: PrimitiveVector = PVectorMut::from_iter((0..u16::MAX as i32).map(Some))
+        .freeze()
+        .into();
+    let offsets2: PrimitiveVector = PVectorMut::from_iter([0u32]).freeze().into();
+    let sizes2: PrimitiveVector = PVectorMut::from_iter([u16::MAX]).freeze().into();
+    let validity2 = Mask::new_true(1);
+
+    let list2 = ListViewVector::new(Arc::new(elements2.into()), offsets2, sizes2, validity2);
+
+    // Extend: the offsets are expected to upcast to u32 and the sizes to u16.
+    list.extend_from_vector(&list2);
+
+    // Verify the types were upcast.
+    assert_eq!(list.offsets().ptype(), PType::U32);
+    assert_eq!(list.sizes().ptype(), PType::U16);
+
+    // Verify the number of lists: one from each input.
+    assert_eq!(list.len(), 2);
+
+    let frozen = list.freeze();
+
+    // Read back the offset and size of both lists from the frozen vector.
+    let offsets = frozen.offsets();
+    let sizes = frozen.sizes();
+
+    let (o0, o1) = match offsets {
+        PrimitiveVector::U32(pvec) => (*pvec.get(0).unwrap(), *pvec.get(1).unwrap()),
+        _ => panic!("Expected U32 offsets"),
+    };
+    let (s0, s1) = match sizes {
+        PrimitiveVector::U16(pvec) => (*pvec.get(0).unwrap(), *pvec.get(1).unwrap()),
+        _ => panic!("Expected U16 sizes"),
+    };
+
+    assert_eq!(o0, 0);
+    assert_eq!(s0, u8::MAX as u16);
+    // The second list's offset is shifted by the first list's element count (u8::MAX).
+    assert_eq!(o1, u8::MAX as u32);
+    assert_eq!(s1, u16::MAX);
+
+    // Verify the combined element count of both inputs.
+    let total_elements = (u8::MAX as usize) + (u16::MAX as usize);
+    assert_eq!(frozen.elements().len(), total_elements);
+}
@@ -5,6 +5,7 @@
 
 use std::sync::Arc;
 
+use num_traits::NumCast;
 use vortex_dtype::DType;
 use vortex_dtype::PType;
 use vortex_error::VortexExpect;
@@ -184,12 +185,17 @@ impl ListViewVectorMut {
     }
 
     /// Creates a new [`ListViewVectorMut`] with the specified capacity.
-    pub fn with_capacity(element_dtype: &DType, capacity: usize) -> Self {
+    pub fn with_capacity(element_dtype: &DType, capacity: usize, elements_capacity: usize) -> Self {
+        let offsets_ptype = PType::min_unsigned_ptype_for_value(elements_capacity as u64);
+        let sizes_ptype = offsets_ptype;
+
+        // SAFETY: Everything is empty and the offsets and sizes `PType` is the same, so all
+        // invariants are satisfied.
         unsafe {
             Self::new_unchecked(
                 Box::new(VectorMut::with_capacity(element_dtype, 0)),
-                PrimitiveVectorMut::with_capacity(PType::U64, capacity),
-                PrimitiveVectorMut::with_capacity(PType::U32, capacity),
+                PrimitiveVectorMut::with_capacity(offsets_ptype, capacity),
+                PrimitiveVectorMut::with_capacity(sizes_ptype, capacity),
                 MaskMut::with_capacity(capacity),
             )
         }
@@ -300,27 +306,23 @@ impl VectorMutOps for ListViewVectorMut {
         self.len = self.validity.len();
     }
 
-    /// This will also panic if we try to extend the `ListViewVector` beyond the maximum offset
-    /// representable by the type of the `offsets` primitive vector.
     fn extend_from_vector(&mut self, other: &ListViewVector) {
         // Extend the elements with the other's elements.
         let old_elements_len = self.elements.len() as u64;
         self.elements.extend_from_vector(&other.elements);
         let new_elements_len = self.elements.len() as u64;
 
-        // Then extend the sizes with the other's sizes (these do not need any adjustment).
-        self.sizes.extend_from_vector(&other.sizes);
+        // Extend sizes with automatic upcasting (does not panic on type mismatch).
+        self.sizes.extend_from_vector_with_upcast(&other.sizes);
 
-        // We need this assertion to ensure that the casts below are infallible.
-        assert!(
-            new_elements_len < self.offsets.ptype().max_value_as_u64(),
-            "the elements length {new_elements_len} is not representable by the offsets type {}",
-            self.offsets.ptype()
+        // Extend offsets with adjustment and automatic upcasting based on `new_elements_len`.
+        adjust_and_extend_offsets(
+            &mut self.offsets,
+            &other.offsets,
+            old_elements_len,
+            new_elements_len,
         );
 
-        // Finally, extend the offsets after adding the old `elements` length to each.
-        adjust_and_extend_offsets(&mut self.offsets, &other.offsets, old_elements_len);
-
         self.validity.append_mask(&other.validity);
         self.len += other.len;
         debug_assert_eq!(self.len, self.validity.len());
@@ -505,39 +507,60 @@ fn validate_views_bound(
     Ok(())
 }
 
-// TODO(connor): It would be better to separate everything inside the macros into its own function,
-// but that would require adding another macro that sets a type `$type` to be used by the caller.
-/// Checks that all views are `<= elements_len`.
+/// Adjusts and extends offsets from `new_offsets` into `curr_offsets`, upcasting if needed.
+///
+/// Each offset from `new_offsets` is adjusted by adding `old_elements_len` before appending.
+///
+/// If the resulting offsets would exceed the current offset type's capacity, the offset vector is
+/// automatically upcasted to a wider type.
 #[expect(
     clippy::cognitive_complexity,
     reason = "complexity from nested match_each_* macros"
 )]
 fn adjust_and_extend_offsets(
-    our_offsets: &mut PrimitiveVectorMut,
-    other: &PrimitiveVector,
+    curr_offsets: &mut PrimitiveVectorMut,
+    new_offsets: &PrimitiveVector,
     old_elements_len: u64,
+    new_elements_len: u64,
 ) {
-    our_offsets.reserve(other.len());
+    // Make sure we use the correct width to fit all offsets.
+    let target_ptype = PType::min_unsigned_ptype_for_value(new_elements_len)
+        .max_unsigned_ptype(curr_offsets.ptype())
+        .max_unsigned_ptype(new_offsets.ptype());
+
+    if curr_offsets.ptype() != target_ptype {
+        let old_offsets = std::mem::replace(
+            curr_offsets,
+            PrimitiveVectorMut::with_capacity(target_ptype, 0),
+        );
+        *curr_offsets = old_offsets.upcast(target_ptype);
+    }
 
-    // Adjust each offset from `other` by adding the current elements length to each of the
+    curr_offsets.reserve(new_offsets.len());
+
+    // Adjust each offset from `new_offsets` by adding the current elements length to each of the
     // incoming offsets.
-    match_each_integer_pvector_mut!(our_offsets, |self_offsets| {
-        match_each_integer_pvector!(other, |other_offsets| {
-            let other_offsets_slice = other_offsets.as_ref();
-
-            // Append each offset from `other`, adjusted by the elements_offset.
-            for i in 0..other.len() {
-                // All offset types are representable via a `u64` since we also ensure offsets
-                // are always non-negative.
+    match_each_integer_pvector_mut!(curr_offsets, |curr| {
+        match_each_integer_pvector!(new_offsets, |new| {
+            let new_offsets_slice = new.as_ref();
+
+            // Append each offset from `new_offsets`, adjusted by the elements_offset.
+            for i in 0..new.len() {
+                // All offset types are representable via a `u64` since we also ensure offsets are
+                // always non-negative.
                 #[allow(clippy::unnecessary_cast)]
-                let adjusted_offset = other_offsets_slice[i] as u64 + old_elements_len;
+                let adjusted_offset = new_offsets_slice[i] as u64 + old_elements_len;
+                debug_assert!(
+                    adjusted_offset < new_elements_len,
+                    "new list view offset is somehow out of bounds, something has gone wrong"
+                );
 
-                // SAFETY: We just reserved capacity for `other.len()` elements above, and we
-                // also know the cast is fine because we verified above that the maximum
-                // possible offset is representable by the offset type.
-                #[allow(clippy::cast_possible_truncation)]
+                let converted = NumCast::from(adjusted_offset)
+                    .vortex_expect("offset conversion should succeed after upcast");
+
+                // SAFETY: We reserved capacity for `new_offsets.len()` elements above.
                 unsafe {
-                    self_offsets.push_unchecked(adjusted_offset as _);
+                    curr.push_unchecked(converted);
                 }
             }
         });
Original file line number	Diff line number	Diff line change
`@@ -72,7 +72,7 @@ impl Scalar {`
`72`	`72`	`let lscalar = self.as_list();`
`73`	`73`	`match lscalar.elements() {`
`74`	`74`	`None => {`
`75`		`- let mut list_view = ListViewVectorMut::with_capacity(elems_dtype, 1);`
	`75`	`+ let mut list_view = ListViewVectorMut::with_capacity(elems_dtype, 1, 0);`
`76`	`76`	`list_view.append_nulls(1);`
`77`	`77`	`ListViewScalar::new(list_view.freeze()).into()`
`78`	`78`	`}`