Commit 222cd71

add fused bitpack filter kernel

Signed-off-by: Connor Tsui <[email protected]>
1 parent 47d6ba7 commit 222cd71

File tree

8 files changed: +308 -23 lines changed
encodings/fastlanes/src/bitpacking/compute/mod.rs

Lines changed: 1 addition & 0 deletions

@@ -7,6 +7,7 @@ mod filter;
 mod is_constant;
 mod take;
 
+// TODO(connor): This is duplicated in `encodings/fastlanes/src/bitpacking/kernels/mod.rs`.
 fn chunked_indices<F: FnMut(usize, &[usize])>(
     mut indices: impl Iterator<Item = usize>,
     offset: usize,

encodings/fastlanes/src/bitpacking/compute/take.rs

Lines changed: 1 addition & 0 deletions

@@ -31,6 +31,7 @@ use crate::BitPackedArray;
 use crate::BitPackedVTable;
 use crate::bitpack_decompress;
 
+// TODO(connor): This is duplicated in `encodings/fastlanes/src/bitpacking/kernels/mod.rs`.
 /// assuming the buffer is already allocated (which will happen at most once) then unpacking
 /// all 1024 elements takes ~8.8x as long as unpacking a single element on an M2 Macbook Air.
 /// see https://github.com/vortex-data/vortex/pull/190#issue-2223752833

encodings/fastlanes/src/bitpacking/kernels/filter.rs (new file)

Lines changed: 213 additions & 0 deletions

@@ -0,0 +1,213 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

use std::mem::MaybeUninit;

use fastlanes::BitPacking;
use vortex_array::ExecutionCtx;
use vortex_array::IntoArray;
use vortex_array::VectorExecutor;
use vortex_array::arrays::FilterArray;
use vortex_array::arrays::FilterVTable;
use vortex_array::kernel::ExecuteParentKernel;
use vortex_array::matchers::Exact;
use vortex_array::patches::patch_pvector;
use vortex_buffer::Buffer;
use vortex_buffer::BufferMut;
use vortex_compute::filter::Filter;
use vortex_dtype::NativePType;
use vortex_dtype::PType;
use vortex_dtype::UnsignedPType;
use vortex_dtype::match_each_integer_ptype;
use vortex_error::VortexExpect;
use vortex_error::VortexResult;
use vortex_mask::Mask;
use vortex_vector::Vector;
use vortex_vector::VectorMut;
use vortex_vector::VectorMutOps;
use vortex_vector::primitive::PVector;
use vortex_vector::primitive::PrimitiveVector;

use crate::BitPackedArray;
use crate::BitPackedVTable;
use crate::bitpacking::kernels::UNPACK_CHUNK_THRESHOLD;
use crate::bitpacking::kernels::chunked_indices;

/// The selection-density threshold above which it is faster to fully unpack the entire
/// [`BitPackedArray`] and then filter the result than to unpack only the selected bitpacked
/// values into the output buffer.
pub const fn unpack_then_filter_threshold<T>() -> f64 {
    // TODO(connor): Where did these numbers come from? Add a public link after validating them.
    // These numbers probably don't work for in-place filtering either.
    match size_of::<T>() {
        1 => 0.03,
        2 => 0.03,
        4 => 0.075,
        _ => 0.09,
        // >8 bytes may have a higher threshold. These numbers are derived from a GCP c2-standard-4
        // with a "Cascade Lake" CPU.
    }
}
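
// For intuition, with the numbers above: a bit-packed `u32` array has a threshold of 0.075,
// so a selection mask with density 0.05 (5% of rows selected) stays on the sparse per-index
// path below, while one with density 0.30 falls back to fully unpacking the array and then
// filtering the result:
//
//     assert!(0.05 < unpack_then_filter_threshold::<u32>()); // sparse per-index path
//     assert!(0.30 > unpack_then_filter_threshold::<u32>()); // full unpack, then filter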

#[derive(Debug)]
struct BitPackingFilterKernel;

impl ExecuteParentKernel<BitPackedVTable> for BitPackingFilterKernel {
    type Parent = Exact<FilterVTable>;

    fn parent(&self) -> Self::Parent {
        Exact::from(&FilterVTable)
    }

    fn execute_parent(
        &self,
        array: &BitPackedArray,
        parent: &FilterArray,
        _child_idx: usize,
        ctx: &mut ExecutionCtx,
    ) -> VortexResult<Option<Vector>> {
        let selection = parent.filter_mask();

        let true_count = selection.true_count();
        if true_count == 0 {
            // Fast path for an empty mask.
            return Ok(Some(VectorMut::with_capacity(array.dtype(), 0).freeze()));
        } else if true_count == selection.len() {
            // Fast path for a full mask.
            return Ok(Some(array.to_array().execute(ctx)?));
        }

        match_each_integer_ptype!(array.ptype(), |I| {
            // If the density is high enough, we would rather decompress the whole array and
            // filter the result than decompress values one by one, so defer to the fallback.
            if selection.density() > unpack_then_filter_threshold::<I>() {
                return Ok(None);
            }
        });

        let primitive_vector: PrimitiveVector = match array.ptype() {
            PType::U8 => filter_primitive::<u8>(array, selection)?.into(),
            PType::U16 => filter_primitive::<u16>(array, selection)?.into(),
            PType::U32 => filter_primitive::<u32>(array, selection)?.into(),
            PType::U64 => filter_primitive::<u64>(array, selection)?.into(),

            // Since the fastlanes crate only supports unsigned integers, and since we know that
            // all numbers are going to be non-negative, we can safely "cast" to unsigned and back.
            PType::I8 => {
                let pvector = filter_primitive::<u8>(array, selection)?;
                pvector.cast_into::<i8>().into()
            }
            PType::I16 => {
                let pvector = filter_primitive::<u16>(array, selection)?;
                pvector.cast_into::<i16>().into()
            }
            PType::I32 => {
                let pvector = filter_primitive::<u32>(array, selection)?;
                pvector.cast_into::<i32>().into()
            }
            PType::I64 => {
                let pvector = filter_primitive::<u64>(array, selection)?;
                pvector.cast_into::<i64>().into()
            }
            other => {
                unreachable!("Unsupported ptype {other} for bitpacking; this was checked above")
            }
        };

        Ok(Some(primitive_vector.into()))
    }
}

/// Specialized filter kernel for primitive bit-packed arrays.
///
/// Because the FastLanes bit-packing kernels are only implemented for unsigned types, the
/// provided `U` must be the unsigned variant of the array's type at the same bit width.
/// For example, if the array is bit-packed `i16`, this function should be called with `U = u16`.
///
/// This function fully decompresses whole chunks for all but the most selective masks because
/// FastLanes decompression is so fast and the bookkeeping necessary to decompress individual
/// elements is relatively slow.
fn filter_primitive<U: UnsignedPType + BitPacking>(
    array: &BitPackedArray,
    selection: &Mask,
) -> VortexResult<PVector<U>> {
    let values = filter_with_indices(
        array,
        selection
            .values()
            .vortex_expect("AllTrue and AllFalse handled by filter fn")
            .indices(),
    );
    let validity = array.validity_mask().filter(selection);

    debug_assert_eq!(
        values.len(),
        validity.len(),
        "`filter_with_indices` was somehow incorrect"
    );

    let mut pvector = unsafe { PVector::new_unchecked(values, validity) };

    // TODO(connor): We want a `PatchesArray` or patching compute functions instead of this.
    let patches = array
        .patches()
        .map(|patches| patches.filter(selection))
        .transpose()?
        .flatten();
    if let Some(patches) = patches {
        pvector = patch_pvector(pvector, &patches);
    }

    Ok(pvector)
}

fn filter_with_indices<T: NativePType + BitPacking>(
    array: &BitPackedArray,
    indices: &[usize],
) -> Buffer<T> {
    let offset = array.offset() as usize;
    let bit_width = array.bit_width() as usize;
    let mut values = BufferMut::with_capacity(indices.len());

    // Some reusable memory to store per-chunk unpacked values.
    let mut unpacked = [const { MaybeUninit::<T>::uninit() }; 1024];
    let packed_bytes = array.packed_slice::<T>();

    // Group the indices by the FastLanes chunk they belong to.
    let chunk_size = 128 * bit_width / size_of::<T>();
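    // Arithmetic note: each FastLanes chunk packs 1024 elements into `1024 * bit_width` bits,
    // i.e. `128 * bit_width` bytes, so dividing by `size_of::<T>()` yields the packed chunk
    // length in `T` words. For example, `u32` packed at 3 bits per value occupies
    // `128 * 3 / 4 = 96` words per 1024-element chunk.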

    chunked_indices(indices, offset, |chunk_idx, indices_within_chunk| {
        let packed = &packed_bytes[chunk_idx * chunk_size..][..chunk_size];

        if indices_within_chunk.len() == 1024 {
            // Unpack the entire chunk.
            unsafe {
                let values_len = values.len();
                values.set_len(values_len + 1024);
                BitPacking::unchecked_unpack(
                    bit_width,
                    packed,
                    &mut values.as_mut_slice()[values_len..],
                );
            }
        } else if indices_within_chunk.len() > UNPACK_CHUNK_THRESHOLD {
            // Unpack into a temporary chunk and then copy the values.
            unsafe {
                let dst: &mut [MaybeUninit<T>] = &mut unpacked;
                let dst: &mut [T] = std::mem::transmute(dst);
                BitPacking::unchecked_unpack(bit_width, packed, dst);
            }
            values.extend_trusted(
                indices_within_chunk
                    .iter()
                    .map(|&idx| unsafe { unpacked.get_unchecked(idx).assume_init() }),
            );
        } else {
            // Otherwise, unpack each element individually.
            values.extend_trusted(indices_within_chunk.iter().map(|&idx| unsafe {
                BitPacking::unchecked_unpack_single(bit_width, packed, idx)
            }));
        }
    });

    values.freeze()
}
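
The closure above makes a three-tier decision per 1024-element chunk. A minimal standalone sketch of the same strategy, with the FastLanes intrinsics replaced by hypothetical identity stand-ins (`unpack_chunk` and `unpack_single` are illustrations, not the real `BitPacking` API):

const CHUNK: usize = 1024;
const UNPACK_CHUNK_THRESHOLD: usize = 8;

// Stand-in for `BitPacking::unchecked_unpack`: a real codec would decode `packed` here.
fn unpack_chunk(packed: &[u32], out: &mut [u32]) {
    out.copy_from_slice(packed);
}

// Stand-in for `BitPacking::unchecked_unpack_single`.
fn unpack_single(packed: &[u32], idx: usize) -> u32 {
    packed[idx]
}

fn filter_chunk(packed: &[u32], indices: &[usize], values: &mut Vec<u32>) {
    if indices.len() == CHUNK {
        // Fully selected chunk: decode straight into the output.
        let start = values.len();
        values.resize(start + CHUNK, 0);
        unpack_chunk(packed, &mut values[start..]);
    } else if indices.len() > UNPACK_CHUNK_THRESHOLD {
        // Moderately selected chunk: decode into scratch space, then gather.
        let mut scratch = [0u32; CHUNK];
        unpack_chunk(packed, &mut scratch);
        values.extend(indices.iter().map(|&i| scratch[i]));
    } else {
        // Sparse chunk: decode each selected element individually.
        values.extend(indices.iter().map(|&i| unpack_single(packed, i)));
    }
}

fn main() {
    let packed: Vec<u32> = (0..CHUNK as u32).collect();
    let mut values = Vec::new();
    filter_chunk(&packed, &[3, 700, 901], &mut values); // three indices: sparse path
    assert_eq!(values, vec![3, 700, 901]);
}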

encodings/fastlanes/src/bitpacking/kernels/mod.rs (new file)

Lines changed: 38 additions & 0 deletions

@@ -0,0 +1,38 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

mod filter;

/// Assuming the buffer is already allocated (which will happen at most once), unpacking all
/// 1024 elements takes ~8.8x as long as unpacking a single element on an M2 MacBook Air.
///
/// See https://github.com/vortex-data/vortex/pull/190#issue-2223752833
const UNPACK_CHUNK_THRESHOLD: usize = 8;

fn chunked_indices<F: FnMut(usize, &[usize])>(indices: &[usize], offset: usize, mut chunk_fn: F) {
    if indices.is_empty() {
        return;
    }

    let mut indices_within_chunk: Vec<usize> = Vec::with_capacity(1024);

    let first_idx = indices[0];
    let mut current_chunk_idx = (first_idx + offset) / 1024;
    indices_within_chunk.push((first_idx + offset) % 1024);

    for idx in &indices[1..] {
        let new_chunk_idx = (idx + offset) / 1024;

        if new_chunk_idx != current_chunk_idx {
            chunk_fn(current_chunk_idx, &indices_within_chunk);
            indices_within_chunk.clear();
        }

        current_chunk_idx = new_chunk_idx;
        indices_within_chunk.push((idx + offset) % 1024);
    }

    if !indices_within_chunk.is_empty() {
        chunk_fn(current_chunk_idx, &indices_within_chunk);
    }
}
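
Paired with the definition above, a quick usage sketch shows how indices get rebased into per-chunk positions once the array's offset is applied:

fn main() {
    // With offset 8, indices 1023 and 2000 land in the second 1024-element chunk.
    let indices = [0, 5, 1000, 1023, 2000];

    let mut groups: Vec<(usize, Vec<usize>)> = Vec::new();
    chunked_indices(&indices, 8, |chunk_idx, within| {
        groups.push((chunk_idx, within.to_vec()));
    });

    // (0+8)/1024 = 0, ..., (1023+8)/1024 = 1, (2000+8)/1024 = 1, positions taken mod 1024.
    assert_eq!(groups, vec![(0, vec![8, 13, 1008]), (1, vec![7, 984])]);
}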

encodings/fastlanes/src/bitpacking/mod.rs

Lines changed: 1 addition & 0 deletions

@@ -8,6 +8,7 @@ pub use array::bitpack_decompress;
 pub use array::unpack_iter;
 
 mod compute;
+mod kernels;
 
 mod vtable;
 pub use vtable::BitPackedVTable;

vortex-array/src/patches.rs

Lines changed: 16 additions & 0 deletions

@@ -30,6 +30,8 @@ use vortex_mask::MaskMut;
 use vortex_scalar::PValue;
 use vortex_scalar::Scalar;
 use vortex_utils::aliases::hash_map::HashMap;
+use vortex_vector::VectorOps;
+use vortex_vector::primitive::PVector;
 
 use crate::Array;
 use crate::ArrayRef;

@@ -883,6 +885,20 @@
     }
 }
 
+/// Applies patches to a [`PVector<T>`], returning the patched vector.
+///
+/// This function modifies the elements buffer in place at the positions specified by the patch
+/// indices. It also updates the validity mask to reflect the nullability of patch values.
+pub fn patch_pvector<T: NativePType>(pvector: PVector<T>, patches: &Patches) -> PVector<T> {
+    let (mut elements, mut validity) = pvector.into_mut().into_parts();
+
+    // SAFETY: We maintain the invariant that elements and validity have the same length, and all
+    // patch indices are valid after offset adjustment (guaranteed by `Patches`).
+    unsafe { patches.apply_to_buffer(elements.as_mut_slice(), &mut validity) };
+
+    PVector::new(elements.freeze(), validity.freeze())
+}
+
 /// Helper function to apply patches to a buffer.
 ///
 /// # Safety
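
The patching semantics are easiest to see on plain slices. A minimal sketch, assuming patches boil down to (index, optional value) pairs; `Patch` here is a hypothetical stand-in for the real `Patches` type, and the validity mask is modeled as a `Vec<bool>`:

struct Patch<T> {
    index: usize,
    value: Option<T>, // `None` patches in a null
}

fn apply_patches<T: Copy>(elements: &mut [T], validity: &mut [bool], patches: &[Patch<T>]) {
    for patch in patches {
        match patch.value {
            Some(v) => {
                // A present patch value overwrites the element and marks it valid.
                elements[patch.index] = v;
                validity[patch.index] = true;
            }
            // A null patch value only flips the validity bit.
            None => validity[patch.index] = false,
        }
    }
}

fn main() {
    let mut elements = vec![0u32; 4];
    let mut validity = vec![true; 4];
    let patches = [
        Patch { index: 1, value: Some(7) },
        Patch { index: 3, value: None },
    ];
    apply_patches(&mut elements, &mut validity, &patches);
    assert_eq!(elements, vec![0, 7, 0, 0]);
    assert_eq!(validity, vec![true, true, true, false]);
}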

vortex-buffer/src/buffer.rs

Lines changed: 23 additions & 21 deletions

@@ -409,27 +409,6 @@ impl<T> Buffer<T> {
         }
     }
 
-    /// Cast a `Buffer<T>` into a `Buffer<U>`.
-    ///
-    /// # Panics
-    ///
-    /// Panics if the type `U` does not have the same size and alignment as `T`.
-    pub fn cast_into<U>(self) -> Buffer<U> {
-        assert_eq!(size_of::<T>(), size_of::<U>(), "Buffer type size mismatch");
-        assert_eq!(
-            align_of::<T>(),
-            align_of::<U>(),
-            "Buffer type alignment mismatch"
-        );
-
-        Buffer {
-            bytes: self.bytes,
-            length: self.length,
-            alignment: self.alignment,
-            _marker: PhantomData,
-        }
-    }
-
     /// Try to convert self into `BufferMut<T>` if there is only a single strong reference.
     pub fn try_into_mut(self) -> Result<BufferMut<T>, Self> {
         self.bytes

@@ -487,6 +466,29 @@
     }
 }
 
+impl<T: Copy> Buffer<T> {
+    /// Cast a `Buffer<T>` into a `Buffer<U>`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the type `U` does not have the same size and alignment as `T`.
+    pub fn cast_into<U>(self) -> Buffer<U> {
+        assert_eq!(size_of::<T>(), size_of::<U>(), "Buffer type size mismatch");
+        assert_eq!(
+            align_of::<T>(),
+            align_of::<U>(),
+            "Buffer type alignment mismatch"
+        );
+
+        Buffer {
+            bytes: self.bytes,
+            length: self.length,
+            alignment: self.alignment,
+            _marker: PhantomData,
+        }
+    }
+}
+
 /// An iterator over Buffer elements.
 ///
 /// This is an analog to the `std::slice::Iter` type.
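
The net change here is that `cast_into` moves behind a `T: Copy` bound (it was previously available for any `T`). The same-size reinterpretation it performs can be sketched on a plain `Vec`; `cast_vec` below is a standalone analog, not the vortex-buffer API, and a fully general version would additionally require that every bit pattern of `T` is a valid `U`:

use std::mem::{ManuallyDrop, align_of, size_of};

fn cast_vec<T: Copy, U>(v: Vec<T>) -> Vec<U> {
    assert_eq!(size_of::<T>(), size_of::<U>(), "type size mismatch");
    assert_eq!(align_of::<T>(), align_of::<U>(), "type alignment mismatch");

    let mut v = ManuallyDrop::new(v);
    let (ptr, len, cap) = (v.as_mut_ptr(), v.len(), v.capacity());
    // SAFETY: size and alignment match (checked above), so the allocation layout is
    // unchanged; this is sound for the u32 -> i32 use below because every 32-bit
    // pattern is a valid i32.
    unsafe { Vec::from_raw_parts(ptr.cast::<U>(), len, cap) }
}

fn main() {
    let unsigned: Vec<u32> = vec![1, 2, u32::MAX];
    let signed: Vec<i32> = cast_vec(unsigned);
    assert_eq!(signed, vec![1, 2, -1]); // u32::MAX reinterprets as -1
}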
