Wire up filter kernel

gatesn · gatesn · commit 79b571909a22 · 2025-12-16T09:58:11.000Z
Signed-off-by: Nicholas Gates <nick@nickgates.com>
diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/rules.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/rules.rs
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
+use vortex_array::Array;
 use vortex_array::ArrayRef;
 use vortex_array::IntoArray;
 use vortex_array::arrays::FilterArray;
@@ -38,8 +39,7 @@ impl ArrayParentReduceRule<DecimalBytePartsVTable> for DecimalBytePartsFilterPus
             return Ok(None);
         }
 
-        let new_msp =
-            FilterArray::new(child.msp.clone(), parent.filter_mask().clone()).into_array();
+        let new_msp = child.msp.filter(parent.filter_mask().clone())?;
         let new_child =
             DecimalBytePartsArray::try_new(new_msp, *child.decimal_dtype())?.into_array();
         Ok(Some(new_child))
diff --git a/encodings/fastlanes/src/bitpacking/mod.rs b/encodings/fastlanes/src/bitpacking/mod.rs
@@ -8,7 +8,6 @@ pub use array::bitpack_decompress;
 pub use array::unpack_iter;
 
 mod compute;
-mod kernels;
 
 mod vtable;
 pub use vtable::BitPackedVTable;
diff --git a/encodings/fastlanes/src/bitpacking/vtable/kernels/filter.rs b/encodings/fastlanes/src/bitpacking/vtable/kernels/filter.rs
@@ -2,36 +2,37 @@
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
 use std::mem::MaybeUninit;
+use std::sync::Arc;
 
 use fastlanes::BitPacking;
 use vortex_array::ExecutionCtx;
-use vortex_array::IntoArray;
-use vortex_array::VectorExecutor;
 use vortex_array::arrays::FilterArray;
 use vortex_array::arrays::FilterVTable;
 use vortex_array::kernel::ExecuteParentKernel;
+use vortex_array::kernel::ParentKernelSet;
 use vortex_array::matchers::Exact;
-use vortex_array::patches::patch_pvector;
-use vortex_buffer::Buffer;
 use vortex_buffer::BufferMut;
 use vortex_compute::filter::Filter;
 use vortex_dtype::NativePType;
 use vortex_dtype::PType;
 use vortex_dtype::UnsignedPType;
 use vortex_dtype::match_each_integer_ptype;
-use vortex_error::VortexExpect;
 use vortex_error::VortexResult;
 use vortex_mask::Mask;
+use vortex_mask::MaskValues;
 use vortex_vector::Vector;
-use vortex_vector::VectorMut;
 use vortex_vector::VectorMutOps;
 use vortex_vector::primitive::PVector;
+use vortex_vector::primitive::PVectorMut;
 use vortex_vector::primitive::PrimitiveVector;
 
 use crate::BitPackedArray;
 use crate::BitPackedVTable;
-use crate::bitpacking::kernels::UNPACK_CHUNK_THRESHOLD;
-use crate::bitpacking::kernels::chunked_indices;
+use crate::bitpacking::vtable::kernels::UNPACK_CHUNK_THRESHOLD;
+use crate::bitpacking::vtable::kernels::chunked_indices;
+
+pub(crate) const PARENT_KERNELS: ParentKernelSet<BitPackedVTable> =
+    ParentKernelSet::new(&[ParentKernelSet::lift(&BitPackingFilterKernel)]);
 
 /// The threshold over which it is faster to fully unpack the entire [`BitPackedArray`] and then
 /// filter the result than to unpack only specific bitpacked values into the output buffer.
@@ -48,6 +49,7 @@ pub const fn unpack_then_filter_threshold<T>() -> f64 {
     }
 }
 
+/// Kernel to execute filtering directly on a bit-packed array.
 #[derive(Debug)]
 struct BitPackingFilterKernel;
 
@@ -63,50 +65,47 @@ impl ExecuteParentKernel<BitPackedVTable> for BitPackingFilterKernel {
         array: &BitPackedArray,
         parent: &FilterArray,
         _child_idx: usize,
-        ctx: &mut ExecutionCtx,
+        _ctx: &mut ExecutionCtx,
     ) -> VortexResult<Option<Vector>> {
-        let selection = parent.filter_mask();
-
-        let true_count = selection.true_count();
-        if true_count == 0 {
-            // Fast-path for an empty mask.
-            return Ok(Some(VectorMut::with_capacity(array.dtype(), 0).freeze()));
-        } else if true_count == selection.len() {
-            // Fast-path for a full mask.
-            return Ok(Some(array.to_array().execute(ctx)?));
-        }
+        let values = match parent.filter_mask() {
+            Mask::AllTrue(_) | Mask::AllFalse(_) => {
+                // No optimization for full or empty mask
+                return Ok(None);
+            }
+            Mask::Values(values) => values,
+        };
 
         match_each_integer_ptype!(array.ptype(), |I| {
             // If the density is high enough, then we would rather decompress the whole array and then apply
             // a filter over decompressing values one by one.
-            if selection.density() > unpack_then_filter_threshold::<I>() {
+            if values.density() > unpack_then_filter_threshold::<I>() {
                 return Ok(None);
             }
         });
 
         let primitive_vector: PrimitiveVector = match array.ptype() {
-            PType::U8 => filter_primitive::<u8>(array, selection)?.into(),
-            PType::U16 => filter_primitive::<u16>(array, selection)?.into(),
-            PType::U32 => filter_primitive::<u32>(array, selection)?.into(),
-            PType::U64 => filter_primitive::<u64>(array, selection)?.into(),
+            PType::U8 => filter_primitive::<u8>(array, values)?.into(),
+            PType::U16 => filter_primitive::<u16>(array, values)?.into(),
+            PType::U32 => filter_primitive::<u32>(array, values)?.into(),
+            PType::U64 => filter_primitive::<u64>(array, values)?.into(),
 
             // Since the fastlanes crate only supports unsigned integers, and since we know that all
             // numbers are going to be non-negative, we can safely "cast" to unsigned and back.
             PType::I8 => {
-                let pvector = filter_primitive::<u8>(array, selection)?;
-                pvector.cast_into::<i8>().into()
+                let pvector = filter_primitive::<u8>(array, values)?;
+                unsafe { pvector.transmute::<i8>() }.into()
             }
             PType::I16 => {
-                let pvector = filter_primitive::<u16>(array, selection)?;
-                pvector.cast_into::<i16>().into()
+                let pvector = filter_primitive::<u16>(array, values)?;
+                unsafe { pvector.transmute::<i16>() }.into()
             }
             PType::I32 => {
-                let pvector = filter_primitive::<u32>(array, selection)?;
-                pvector.cast_into::<i32>().into()
+                let pvector = filter_primitive::<u32>(array, values)?;
+                unsafe { pvector.transmute::<i32>() }.into()
             }
             PType::I64 => {
-                let pvector = filter_primitive::<u64>(array, selection)?;
-                pvector.cast_into::<i64>().into()
+                let pvector = filter_primitive::<u64>(array, values)?;
+                unsafe { pvector.transmute::<i64>() }.into()
             }
             other => {
                 unreachable!("Unsupported ptype {other} for bitpacking, we also checked this above")
@@ -128,42 +127,39 @@ impl ExecuteParentKernel<BitPackedVTable> for BitPackingFilterKernel {
 /// elements is relatively slow.
 fn filter_primitive<U: UnsignedPType + BitPacking>(
     array: &BitPackedArray,
-    selection: &Mask,
+    selection: &Arc<MaskValues>,
 ) -> VortexResult<PVector<U>> {
-    let values = filter_with_indices(
-        array,
-        selection
-            .values()
-            .vortex_expect("AllTrue and AllFalse handled by filter fn")
-            .indices(),
-    );
-    let validity = array.validity_mask().filter(selection);
+    let values = filter_with_indices(array, selection.indices());
+    let validity = array
+        .validity_mask()
+        .filter(&Mask::Values(selection.clone()))
+        .into_mut();
 
     debug_assert_eq!(
         values.len(),
         validity.len(),
         "`filter_with_indices` was somehow incorrect"
     );
 
-    let mut pvector = unsafe { PVector::new_unchecked(values, validity) };
+    let mut pvector = unsafe { PVectorMut::new_unchecked(values, validity) };
 
     // TODO(connor): We want a `PatchesArray` or patching compute functions instead of this.
     let patches = array
         .patches()
-        .map(|patches| patches.filter(selection))
+        .map(|patches| patches.filter(&Mask::Values(selection.clone())))
         .transpose()?
         .flatten();
     if let Some(patches) = patches {
-        pvector = patch_pvector(pvector, &patches);
+        pvector = patches.apply_to_pvector(pvector);
     }
 
-    Ok(pvector)
+    Ok(pvector.freeze())
 }
 
 fn filter_with_indices<T: NativePType + BitPacking>(
     array: &BitPackedArray,
     indices: &[usize],
-) -> Buffer<T> {
+) -> BufferMut<T> {
     let offset = array.offset() as usize;
     let bit_width = array.bit_width() as usize;
     let mut values = BufferMut::with_capacity(indices.len());
@@ -209,5 +205,5 @@ fn filter_with_indices<T: NativePType + BitPacking>(
         }
     });
 
-    values.freeze()
+    values
 }
diff --git a/encodings/fastlanes/src/bitpacking/vtable/kernels/mod.rs b/encodings/fastlanes/src/bitpacking/vtable/kernels/mod.rs
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
-mod filter;
+pub(crate) mod filter;
 
 /// Assuming the buffer is already allocated (which will happen at most once), then unpacking all
 /// 1024 elements takes ~8.8x as long as unpacking a single element on an M2 Macbook Air.
diff --git a/encodings/fastlanes/src/bitpacking/vtable/mod.rs b/encodings/fastlanes/src/bitpacking/vtable/mod.rs
@@ -30,10 +30,12 @@ use vortex_vector::VectorMutOps;
 
 use crate::BitPackedArray;
 use crate::bitpack_decompress::unpack_to_primitive_vector;
+use crate::bitpacking::vtable::kernels::filter::PARENT_KERNELS;
 
 mod array;
 mod canonical;
 mod encode;
+mod kernels;
 mod operations;
 mod validity;
 mod visitor;
@@ -246,6 +248,15 @@ impl VTable for BitPackedVTable {
     fn execute(array: &Self::Array, _ctx: &mut ExecutionCtx) -> VortexResult<Vector> {
         Ok(unpack_to_primitive_vector(array).freeze().into())
     }
+
+    fn execute_parent(
+        array: &Self::Array,
+        parent: &ArrayRef,
+        child_idx: usize,
+        ctx: &mut ExecutionCtx,
+    ) -> VortexResult<Option<Vector>> {
+        PARENT_KERNELS.execute(array, parent, child_idx, ctx)
+    }
 }
 
 #[derive(Debug)]
diff --git a/encodings/fastlanes/src/for/vtable/rules.rs b/encodings/fastlanes/src/for/vtable/rules.rs
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
+use vortex_array::Array;
 use vortex_array::ArrayRef;
 use vortex_array::IntoArray;
 use vortex_array::arrays::FilterArray;
@@ -34,8 +35,7 @@ impl ArrayParentReduceRule<FoRVTable> for FoRFilterPushDownRule {
     ) -> VortexResult<Option<ArrayRef>> {
         let new_array = unsafe {
             FoRArray::new_unchecked(
-                FilterArray::new(child.encoded().clone(), parent.filter_mask().clone())
-                    .into_array(),
+                child.encoded.filter(parent.filter_mask().clone())?,
                 child.reference.clone(),
             )
         };
diff --git a/vortex-array/src/patches.rs b/vortex-array/src/patches.rs
@@ -30,8 +30,7 @@ use vortex_mask::MaskMut;
 use vortex_scalar::PValue;
 use vortex_scalar::Scalar;
 use vortex_utils::aliases::hash_map::HashMap;
-use vortex_vector::VectorOps;
-use vortex_vector::primitive::PVector;
+use vortex_vector::primitive::PVectorMut;
 
 use crate::Array;
 use crate::ArrayRef;
@@ -824,6 +823,21 @@ impl Patches {
         }))
     }
 
+    /// Applies patches to a [`PVectorMut<T>`], returning the patched vector.
+    ///
+    /// This function modifies the elements buffer in-place at the positions specified by the patch
+    /// indices. It also updates the validity mask to reflect the nullability of patch values.
+    pub fn apply_to_pvector<T: NativePType>(&self, pvector: PVectorMut<T>) -> PVectorMut<T> {
+        let (mut elements, mut validity) = pvector.into_parts();
+
+        // SAFETY: We maintain the invariant that elements and validity have the same length, and all
+        // patch indices are valid after offset adjustment (guaranteed by `Patches`).
+        unsafe { self.apply_to_buffer(elements.as_mut_slice(), &mut validity) };
+
+        // SAFETY: We have not modified the length of elements or validity.
+        unsafe { PVectorMut::new_unchecked(elements, validity) }
+    }
+
     /// Apply patches to a mutable buffer and validity mask.
     ///
     /// This method applies the patch values to the given buffer at the positions specified by the
@@ -885,20 +899,6 @@ impl Patches {
     }
 }
 
-/// Applies patches to a [`PVector<T>`], returning the patched vector.
-///
-/// This function modifies the elements buffer in-place at the positions specified by the patch
-/// indices. It also updates the validity mask to reflect the nullability of patch values.
-pub fn patch_pvector<T: NativePType>(pvector: PVector<T>, patches: &Patches) -> PVector<T> {
-    let (mut elements, mut validity) = pvector.into_mut().into_parts();
-
-    // SAFETY: We maintain the invariant that elements and validity have the same length, and all
-    // patch indices are valid after offset adjustment (guaranteed by `Patches`).
-    unsafe { patches.apply_to_buffer(elements.as_mut_slice(), &mut validity) };
-
-    PVector::new(elements.freeze(), validity.freeze())
-}
-
 /// Helper function to apply patches to a buffer.
 ///
 /// # Safety
diff --git a/vortex-buffer/src/buffer.rs b/vortex-buffer/src/buffer.rs
@@ -466,13 +466,19 @@ impl<T> Buffer<T> {
     }
 }
 
-impl<T: Copy> Buffer<T> {
-    /// Cast a `Buffer<T>` into a `Buffer<U>`.
+impl<T> Buffer<T> {
+    /// Transmute a `Buffer<T>` into a `Buffer<U>`.
+    ///
+    /// # Safety
+    ///
+    /// The caller must ensure that all possible bit representations of type `T` are valid when
+    /// interpreted as type `U`.
+    /// See [`std::mem::transmute`] for more details.
     ///
     /// # Panics
     ///
     /// Panics if the type `U` does not have the same size and alignment as `T`.
-    pub fn cast_into<U>(self) -> Buffer<U> {
+    pub unsafe fn transmute<U>(self) -> Buffer<U> {
         assert_eq!(size_of::<T>(), size_of::<U>(), "Buffer type size mismatch");
         assert_eq!(
             align_of::<T>(),
diff --git a/vortex-compute/src/take/slice/avx2.rs b/vortex-compute/src/take/slice/avx2.rs
@@ -72,7 +72,7 @@ pub unsafe fn take_avx2<V: NativePType, I: UnsignedPType>(
             let values = unsafe { std::mem::transmute::<&[V], &[$cast]>(buffer) };
 
             let result = exec_take::<$cast, $indices, AVX2Gather>(values, indices);
-            result.cast_into::<V>()
+            result.transmute::<V>()
         }};
     }
 
diff --git a/vortex-compute/src/take/slice/portable.rs b/vortex-compute/src/take/slice/portable.rs
@@ -40,7 +40,7 @@ pub fn take_portable<T: NativePType, I: UnsignedPType>(buffer: &[T], indices: &[
         // make.
         let u16_slice: &[u16] =
             unsafe { std::slice::from_raw_parts(buffer.as_ptr() as *const u16, buffer.len()) };
-        return take_with_indices(u16_slice, indices).cast_into::<T>();
+        return take_with_indices(u16_slice, indices).transmute::<T>();
     }
 
     match_each_native_simd_ptype!(T::PTYPE, |TC| {
diff --git a/vortex-mask/src/lib.rs b/vortex-mask/src/lib.rs
@@ -642,6 +642,12 @@ impl MaskValues {
         self.buffer.is_empty()
     }
 
+    /// Returns the density of the mask.
+    #[inline]
+    pub fn density(&self) -> f64 {
+        self.density
+    }
+
     /// Returns the true count of the mask.
     #[inline]
     pub fn true_count(&self) -> usize {
diff --git a/vortex-vector/src/primitive/generic.rs b/vortex-vector/src/primitive/generic.rs

Original file line number	Diff line number	Diff line change
`@@ -72,7 +72,7 @@ pub unsafe fn take_avx2<V: NativePType, I: UnsignedPType>(`
`72`	`72`	`let values = unsafe { std::mem::transmute::<&[V], &[$cast]>(buffer) };`
`73`	`73`
`74`	`74`	`let result = exec_take::<$cast, $indices, AVX2Gather>(values, indices);`
`75`		`- result.cast_into::<V>()`
	`75`	`+ result.transmute::<V>()`
`76`	`76`	`}};`
`77`	`77`	`}`
`78`	`78`
Original file line number	Diff line number	Diff line change
`@@ -40,7 +40,7 @@ pub fn take_portable<T: NativePType, I: UnsignedPType>(buffer: &[T], indices: &[`
`40`	`40`	`// make.`
`41`	`41`	`let u16_slice: &[u16] =`
`42`	`42`	`unsafe { std::slice::from_raw_parts(buffer.as_ptr() as *const u16, buffer.len()) };`
`43`		`- return take_with_indices(u16_slice, indices).cast_into::<T>();`
	`43`	`+ return take_with_indices(u16_slice, indices).transmute::<T>();`
`44`	`44`	`}`
`45`	`45`
`46`	`46`	`match_each_native_simd_ptype!(T::PTYPE, \|TC\| {`