// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

use fastlanes::{BitPacking, FastLanes};
use static_assertions::const_assert_eq;
use vortex_array::pipeline::bit_view::BitView;
use vortex_array::pipeline::{BindContext, KernelCtx, N, PipelinedNode};
use vortex_array::pipeline::{Kernel, PipelineInputs};
use vortex_buffer::Buffer;
use vortex_dtype::{PTypeDowncastExt, PhysicalPType, match_each_integer_ptype};
use vortex_error::VortexResult;
use vortex_vector::primitive::PVectorMut;
use vortex_vector::{VectorMut, VectorMutOps};

use crate::BitPackedArray;

/// The number of elements in a FastLanes vector.
const FL_VECTOR_SIZE: usize = 1024;

// Bitpacking uses FastLanes decompression, which expects a multiple of 1024 elements.
const_assert_eq!(N, FL_VECTOR_SIZE);
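
// Because `N == FL_VECTOR_SIZE`, each `step` call below decodes exactly one FastLanes
// chunk: chunk `k` starts at word offset `k * packed_stride` into the packed buffer.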

impl PipelinedNode for BitPackedArray {
    fn inputs(&self) -> PipelineInputs {
        PipelineInputs::Source
    }

    fn bind(&self, _ctx: &dyn BindContext) -> VortexResult<Box<dyn Kernel>> {
        debug_assert!(self.bit_width > 0);

        if self.patches.is_some() {
            unimplemented!(
                "Patches are not yet handled for bitpacked arrays; they will become a parent patch array"
            );
        }

        match_each_integer_ptype!(self.ptype(), |T| {
            let packed_bit_width = self.bit_width as usize;
            let packed_buffer = Buffer::<<T as PhysicalPType>::Physical>::from_byte_buffer(
                self.packed.clone().into_byte_buffer(),
            );

            // See the documentation for `AlignedBitPackedKernel` for more info on why we need this.
            let packed_stride =
                self.bit_width as usize * <<T as PhysicalPType>::Physical as FastLanes>::LANES;

            if self.offset != 0 {
                // TODO(ngates): the unaligned kernel needs fixing for the non-masked API
                unimplemented!(
                    "Unaligned `BitPackedArray` as a `PipelineSource` is not yet implemented"
                )
            }

            Ok(Box::new(AlignedBitPackedKernel::<T>::new(
                packed_bit_width,
                packed_stride,
                packed_buffer,
            )) as Box<dyn Kernel>)
        })
    }
}

pub struct AlignedBitPackedKernel<BP: PhysicalPType<Physical: BitPacking>> {
    /// The bit width of each bitpacked value.
    ///
    /// This is guaranteed to be less than or equal to the (unpacked) bit-width of `BP`.
    packed_bit_width: usize,

    /// The stride of the bitpacked values: the number of packed words per chunk, where one
    /// chunk fully unpacks into exactly 1024 elements. This is equal to `bit_width * T::LANES`,
    /// i.e. `1024 * bit_width` divided by the bit width of `T`.
    ///
    /// We store this here so that we do not have to recalculate it on every [`step()`].
    ///
    /// For example, if the `bit_width` is 10 and the physical type is `u16` (which fills
    /// `1024 / 16 = 64` lanes), the `packed_stride` will be `10 * 64 = 640`. This ensures we pass
    /// a slice with the correct length to [`BitPacking::unchecked_unpack`].
    ///
    /// [`step()`]: Kernel::step
    /// [`BitPacking::unchecked_unpack`]: BitPacking::unchecked_unpack
    packed_stride: usize,

    /// The buffer containing the bitpacked values.
    packed_buffer: Buffer<BP::Physical>,

    /// The total number of bitpacked chunks we have unpacked.
    num_chunks_unpacked: usize,
}
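
// A minimal check of the stride arithmetic documented above. This is a test-only sketch:
// the `u16` / 10-bit numbers come straight from the doc example on `packed_stride`, and
// nothing else in this file depends on them.
#[cfg(test)]
mod stride_math_tests {
    use fastlanes::FastLanes;

    use super::FL_VECTOR_SIZE;

    #[test]
    fn packed_stride_example() {
        let bit_width = 10;
        // A `u16` FastLanes vector has `1024 / 16 = 64` lanes...
        assert_eq!(<u16 as FastLanes>::LANES, 64);
        // ...so 1024 values packed at 10 bits occupy `10 * 64 = 640` `u16` words.
        assert_eq!(bit_width * <u16 as FastLanes>::LANES, 640);
        // Equivalently: `1024 * bit_width` divided by the bit width of the type.
        assert_eq!(FL_VECTOR_SIZE * bit_width / <u16 as FastLanes>::T, 640);
    }
}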

impl<BP: PhysicalPType<Physical: BitPacking>> AlignedBitPackedKernel<BP> {
    pub fn new(
        packed_bit_width: usize,
        packed_stride: usize,
        packed_buffer: Buffer<BP::Physical>,
    ) -> Self {
        // `FastLanes::T` is the bit width of the physical type (e.g. 16 for `u16`).
        assert_eq!(
            packed_stride,
            FL_VECTOR_SIZE * packed_bit_width / BP::Physical::T
        );
        assert!(packed_bit_width <= BP::Physical::T);

        Self {
            packed_bit_width,
            packed_stride,
            packed_buffer,
            num_chunks_unpacked: 0,
        }
    }
}

impl<BP: PhysicalPType<Physical: BitPacking>> Kernel for AlignedBitPackedKernel<BP> {
    fn step(
        &mut self,
        _ctx: &KernelCtx,
        selection: &BitView,
        out: &mut VectorMut,
    ) -> VortexResult<()> {
        let output_vector: &mut PVectorMut<BP::Physical> = out.as_primitive_mut().downcast();
        debug_assert!(output_vector.is_empty());

        let packed_offset = self.num_chunks_unpacked * self.packed_stride;
        let not_yet_unpacked_values = &self.packed_buffer.as_slice()[packed_offset..];

        let true_count = selection.true_count();

        // If the true count is very small, it is cheaper to unpack individual elements directly
        // into the output vector than to decode the full chunk.
        if true_count < 7 {
            output_vector.reserve(true_count);
            debug_assert!(true_count <= output_vector.capacity());

            selection.iter_ones(|idx| {
                // SAFETY:
                // - `new()` asserts that `packed_bit_width` is a valid bit width for
                //   `BP::Physical`.
                // - `not_yet_unpacked_values` starts at the current chunk, which spans
                //   `self.packed_stride` words, and `idx` is always less than 1024.
                let unpacked_value = unsafe {
                    BitPacking::unchecked_unpack_single(
                        self.packed_bit_width,
                        not_yet_unpacked_values,
                        idx,
                    )
                };

                // SAFETY: We reserved enough capacity above to push `true_count` values.
                unsafe { output_vector.push_unchecked(unpacked_value) };
            });
        } else {
            // Otherwise, it is faster to fully unpack the entire 1024-element chunk with SIMD /
            // FastLanes and let other nodes in the pipeline decide if they want to perform the
            // selection filter themselves.
            output_vector.reserve(N);
            debug_assert!(N <= output_vector.capacity());

            // SAFETY: We have just reserved enough capacity to set the length, and
            // `unchecked_unpack` below initializes all `N` elements without reading the
            // uninitialized memory.
            unsafe { output_vector.set_len(N) };

            let next_packed_chunk = &not_yet_unpacked_values[..self.packed_stride];
            debug_assert_eq!(
                next_packed_chunk.len(),
                FL_VECTOR_SIZE * self.packed_bit_width / BP::Physical::T
            );

            // SAFETY:
            // - The documentation for `packed_bit_width` explains that the size is valid.
            // - `next_packed_chunk` has length `self.packed_stride`, which its documentation
            //   explains is exactly one packed chunk.
            // - The output slice has length exactly `FL_VECTOR_SIZE` (1024).
            unsafe {
                BitPacking::unchecked_unpack(
                    self.packed_bit_width,
                    next_packed_chunk,
                    &mut output_vector.as_mut()[..FL_VECTOR_SIZE],
                );
            }
        }

        self.num_chunks_unpacked += 1;

        Ok(())
    }
}
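
// A standalone sketch of the FastLanes round trip that `step` relies on, using only the
// `fastlanes` crate (test-only; the 10-bit width and the sample values are arbitrary).
// It exercises both the bulk `unchecked_unpack` path and the `unchecked_unpack_single`
// path used for sparse selections.
#[cfg(test)]
mod round_trip_tests {
    use fastlanes::{BitPacking, FastLanes};

    #[test]
    fn unpack_matches_packed_input() {
        const WIDTH: usize = 10;
        let values: Vec<u16> = (0u32..1024).map(|i| (i % (1 << WIDTH)) as u16).collect();

        // One packed chunk occupies `WIDTH * LANES` words (the `packed_stride`).
        let mut packed = vec![0u16; WIDTH * <u16 as FastLanes>::LANES];
        // SAFETY: `WIDTH <= 16`, `values` holds exactly 1024 elements, and `packed`
        // holds exactly one packed chunk.
        unsafe { BitPacking::unchecked_pack(WIDTH, &values, &mut packed) };

        // Bulk path: unpack the whole 1024-element chunk at once.
        let mut unpacked = vec![0u16; 1024];
        // SAFETY: same invariants as above, mirrored for unpacking.
        unsafe { BitPacking::unchecked_unpack(WIDTH, &packed, &mut unpacked) };
        assert_eq!(values, unpacked);

        // Sparse path: unpack a single element without decoding the rest.
        // SAFETY: the index is less than 1024 and `packed` is one full chunk.
        let one = unsafe { <u16 as BitPacking>::unchecked_unpack_single(WIDTH, &packed, 42) };
        assert_eq!(one, values[42]);
    }
}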