vortex-data
diff --git a/‎encodings/decimal-byte-parts/src/decimal_byte_parts/rules.rs‎
Lines changed: 2 additions & 2 deletions b/‎encodings/decimal-byte-parts/src/decimal_byte_parts/rules.rs‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎encodings/fastlanes/src/bitpacking/compute/mod.rs‎
Lines changed: 1 addition & 0 deletions b/‎encodings/fastlanes/src/bitpacking/compute/mod.rs‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎encodings/fastlanes/src/bitpacking/compute/take.rs‎
Lines changed: 1 addition & 0 deletions b/‎encodings/fastlanes/src/bitpacking/compute/take.rs‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎encodings/fastlanes/src/bitpacking/vtable/kernels/filter.rs‎
Lines changed: 206 additions & 0 deletions b/‎encodings/fastlanes/src/bitpacking/vtable/kernels/filter.rs‎
Lines changed: 206 additions & 0 deletions
diff --git a/‎encodings/fastlanes/src/bitpacking/vtable/kernels/mod.rs‎
Lines changed: 38 additions & 0 deletions b/‎encodings/fastlanes/src/bitpacking/vtable/kernels/mod.rs‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎encodings/fastlanes/src/bitpacking/vtable/mod.rs‎
Lines changed: 11 additions & 0 deletions b/‎encodings/fastlanes/src/bitpacking/vtable/mod.rs‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎encodings/fastlanes/src/for/vtable/rules.rs‎
Lines changed: 2 additions & 2 deletions b/‎encodings/fastlanes/src/for/vtable/rules.rs‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎encodings/zstd/src/array.rs‎
Lines changed: 6 additions & 1 deletion b/‎encodings/zstd/src/array.rs‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎encodings/zstd/src/test.rs‎
Lines changed: 14 additions & 0 deletions b/‎encodings/zstd/src/test.rs‎
Lines changed: 14 additions & 0 deletions
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
+use vortex_array::Array;
 use vortex_array::ArrayRef;
 use vortex_array::IntoArray;
 use vortex_array::arrays::FilterArray;
@@ -38,8 +39,7 @@ impl ArrayParentReduceRule<DecimalBytePartsVTable> for DecimalBytePartsFilterPus
             return Ok(None);
         }
 
-        let new_msp =
-            FilterArray::new(child.msp.clone(), parent.filter_mask().clone()).into_array();
+        let new_msp = child.msp.filter(parent.filter_mask().clone())?;
         let new_child =
             DecimalBytePartsArray::try_new(new_msp, *child.decimal_dtype())?.into_array();
         Ok(Some(new_child))
 
@@ -7,6 +7,7 @@ mod filter;
 mod is_constant;
 mod take;
 
+// TODO(connor): This is duplicated in `encodings/fastlanes/src/bitpacking/vtable/kernels/mod.rs`.
 fn chunked_indices<F: FnMut(usize, &[usize])>(
     mut indices: impl Iterator<Item = usize>,
     offset: usize,
 
@@ -31,6 +31,7 @@ use crate::BitPackedArray;
 use crate::BitPackedVTable;
 use crate::bitpack_decompress;
 
+// TODO(connor): This is duplicated in `encodings/fastlanes/src/bitpacking/vtable/kernels/mod.rs`.
 /// assuming the buffer is already allocated (which will happen at most once) then unpacking
 /// all 1024 elements takes ~8.8x as long as unpacking a single element on an M2 Macbook Air.
 /// see https://github.com/vortex-data/vortex/pull/190#issue-2223752833
 
@@ -0,0 +1,206 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use std::mem::MaybeUninit;
+use std::sync::Arc;
+
+use fastlanes::BitPacking;
+use vortex_array::ExecutionCtx;
+use vortex_array::arrays::FilterArray;
+use vortex_array::arrays::FilterVTable;
+use vortex_array::kernel::ExecuteParentKernel;
+use vortex_array::kernel::ParentKernelSet;
+use vortex_array::matchers::Exact;
+use vortex_buffer::BufferMut;
+use vortex_compute::filter::Filter;
+use vortex_dtype::NativePType;
+use vortex_dtype::PType;
+use vortex_dtype::UnsignedPType;
+use vortex_dtype::match_each_integer_ptype;
+use vortex_error::VortexResult;
+use vortex_mask::Mask;
+use vortex_mask::MaskValues;
+use vortex_vector::Vector;
+use vortex_vector::VectorMutOps;
+use vortex_vector::primitive::PVectorMut;
+use vortex_vector::primitive::PrimitiveVectorMut;
+
+use crate::BitPackedArray;
+use crate::BitPackedVTable;
+use crate::bitpacking::vtable::kernels::UNPACK_CHUNK_THRESHOLD;
+use crate::bitpacking::vtable::kernels::chunked_indices;
+
+pub(crate) const PARENT_KERNELS: ParentKernelSet<BitPackedVTable> =
+    ParentKernelSet::new(&[ParentKernelSet::lift(&BitPackingFilterKernel)]);
+
+/// Returns the mask density above which it is faster to fully unpack a [`BitPackedArray`] and
+/// then filter it than to unpack only selected values; the cutoff depends on `size_of::<T>()`.
+pub const fn unpack_then_filter_threshold<T>() -> f64 {
+    // TODO(connor): Where did these numbers come from? Add a public link after validating them.
+    // These numbers probably don't work for in-place filtering either.
+    match size_of::<T>() {
+        1 => 0.03,
+        2 => 0.03,
+        4 => 0.075,
+        _ => 0.09,
+        // >8 bytes may have a higher threshold. These numbers are derived from a GCP c2-standard-4
+        // with a "Cascade Lake" CPU.
+    }
+}
+
+/// Parent kernel that executes a [`FilterArray`] directly over its [`BitPackedArray`] child.
+#[derive(Debug)]
+struct BitPackingFilterKernel;
+
+impl ExecuteParentKernel<BitPackedVTable> for BitPackingFilterKernel {
+    type Parent = Exact<FilterVTable>;
+
+    fn parent(&self) -> Self::Parent {
+        Exact::from(&FilterVTable)
+    }
+
+    fn execute_parent(
+        &self,
+        array: &BitPackedArray,
+        parent: &FilterArray,
+        _child_idx: usize,
+        _ctx: &mut ExecutionCtx,
+    ) -> VortexResult<Option<Vector>> {
+        let values = match parent.filter_mask() {
+            Mask::AllTrue(_) | Mask::AllFalse(_) => {
+                // Nothing to gain for a full or empty mask, so decline to handle it here.
+                return Ok(None);
+            }
+            Mask::Values(values) => values,
+        };
+
+        match_each_integer_ptype!(array.ptype(), |I| {
+            // If the mask is dense enough, we would rather decompress the whole array and then
+            // filter it than decompress the selected values one by one, so bail out here.
+            if values.density() > unpack_then_filter_threshold::<I>() {
+                return Ok(None);
+            }
+        });
+
+        let mut primitive_vector: PrimitiveVectorMut = match array.ptype() {
+            PType::U8 => filter_primitive_without_patches::<u8>(array, values)?.into(),
+            PType::U16 => filter_primitive_without_patches::<u16>(array, values)?.into(),
+            PType::U32 => filter_primitive_without_patches::<u32>(array, values)?.into(),
+            PType::U64 => filter_primitive_without_patches::<u64>(array, values)?.into(),
+
+            // Since the fastlanes crate only supports unsigned integers, and since we know that all
+            // numbers are going to be non-negative, we can safely "cast" to unsigned and back.
+            PType::I8 => {
+                let pvector = filter_primitive_without_patches::<u8>(array, values)?;
+                unsafe { pvector.transmute::<i8>() }.into()
+            }
+            PType::I16 => {
+                let pvector = filter_primitive_without_patches::<u16>(array, values)?;
+                unsafe { pvector.transmute::<i16>() }.into()
+            }
+            PType::I32 => {
+                let pvector = filter_primitive_without_patches::<u32>(array, values)?;
+                unsafe { pvector.transmute::<i32>() }.into()
+            }
+            PType::I64 => {
+                let pvector = filter_primitive_without_patches::<u64>(array, values)?;
+                unsafe { pvector.transmute::<i64>() }.into()
+            }
+            other => {
+                unreachable!("Unsupported ptype {other} for bitpacking, we also checked this above")
+            }
+        };
+
+        // TODO(connor): We want a `PatchesArray` or patching compute functions instead of this.
+        let patches = array
+            .patches()
+            .map(|patches| patches.filter(&Mask::Values(values.clone())))
+            .transpose()?
+            .flatten();
+        if let Some(patches) = patches {
+            primitive_vector = patches.apply_to_primitive_vector(primitive_vector);
+        }
+
+        Ok(Some(primitive_vector.freeze().into()))
+    }
+}
+
+/// Specialized filter kernel for primitive bit-packed arrays; patches are not applied here.
+///
+/// Because the FastLanes bit-packing kernels are only implemented for unsigned types, the provided
+/// `U` should be promoted to the unsigned variant for any target bit width.
+/// For example, if the array is bit-packed `i16`, this function should be called with `U = u16`.
+///
+/// The caller only takes this path for highly selective masks: FastLanes decompression is so
+/// fast, and the bookkeeping needed to decompress individual elements so slow by comparison,
+/// that for denser masks it prefers to fully unpack the array and filter the result.
+fn filter_primitive_without_patches<U: UnsignedPType + BitPacking>(
+    array: &BitPackedArray,
+    selection: &Arc<MaskValues>,
+) -> VortexResult<PVectorMut<U>> {
+    let values = filter_with_indices(array, selection.indices());
+    let validity = array
+        .validity_mask()
+        .filter(&Mask::Values(selection.clone()))
+        .into_mut();
+
+    debug_assert_eq!(
+        values.len(),
+        validity.len(),
+        "`filter_with_indices` was somehow incorrect"
+    );
+    // SAFETY: presumably requires equal lengths, as the debug assertion above checks.
+    Ok(unsafe { PVectorMut::new_unchecked(values, validity) })
+}
+
+fn filter_with_indices<T: NativePType + BitPacking>(
+    array: &BitPackedArray,
+    indices: &[usize],
+) -> BufferMut<T> {
+    let offset = array.offset() as usize;
+    let bit_width = array.bit_width() as usize;
+    let mut values = BufferMut::with_capacity(indices.len());
+
+    // Re-usable scratch space that holds the unpacked values of a single chunk.
+    let mut unpacked = [const { MaybeUninit::<T>::uninit() }; 1024];
+    let packed_bytes = array.packed_slice::<T>();
+
+    // Length of one packed 1024-value chunk in `T` elements; indices are grouped by chunk below.
+    let chunk_size = 128 * bit_width / size_of::<T>();
+
+    chunked_indices(indices, offset, |chunk_idx, indices_within_chunk| {
+        let packed = &packed_bytes[chunk_idx * chunk_size..][..chunk_size];
+
+        if indices_within_chunk.len() == 1024 {
+            // Every value in the chunk is selected: unpack it straight into the output.
+            unsafe {
+                let values_len = values.len();
+                values.set_len(values_len + 1024);
+                BitPacking::unchecked_unpack(
+                    bit_width,
+                    packed,
+                    &mut values.as_mut_slice()[values_len..],
+                );
+            }
+        } else if indices_within_chunk.len() > UNPACK_CHUNK_THRESHOLD {
+            // Unpack the whole chunk into the scratch buffer, then copy out the selected values.
+            unsafe {
+                let dst: &mut [MaybeUninit<T>] = &mut unpacked;
+                let dst: &mut [T] = std::mem::transmute(dst);
+                BitPacking::unchecked_unpack(bit_width, packed, dst);
+            }
+            values.extend_trusted(
+                indices_within_chunk
+                    .iter()
+                    .map(|&idx| unsafe { unpacked.get_unchecked(idx).assume_init() }),
+            );
+        } else {
+            // Only a few values are selected: unpack each one individually.
+            values.extend_trusted(indices_within_chunk.iter().map(|&idx| unsafe {
+                BitPacking::unchecked_unpack_single(bit_width, packed, idx)
+            }));
+        }
+    });
+
+    values
+}
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+pub(crate) mod filter;
+
+/// Assuming the buffer is already allocated (which will happen at most once), unpacking all 1024
+/// elements takes ~8.8x as long as unpacking a single element on an M2 MacBook Air.
+///
+/// See <https://github.com/vortex-data/vortex/pull/190#issue-2223752833>.
+const UNPACK_CHUNK_THRESHOLD: usize = 8;
+
+fn chunked_indices<F: FnMut(usize, &[usize])>(indices: &[usize], offset: usize, mut chunk_fn: F) {
+    if indices.is_empty() {
+        return;
+    }
+    // Scratch list of in-chunk positions (index + offset, modulo 1024) for the current chunk.
+    let mut indices_within_chunk: Vec<usize> = Vec::with_capacity(1024);
+
+    let first_idx = indices[0];
+    let mut current_chunk_idx = (first_idx + offset) / 1024;
+    indices_within_chunk.push((first_idx + offset) % 1024);
+
+    for idx in &indices[1..] {
+        let new_chunk_idx = (idx + offset) / 1024;
+        // On entering a different chunk, hand the finished group to `chunk_fn` and start anew.
+        if new_chunk_idx != current_chunk_idx {
+            chunk_fn(current_chunk_idx, &indices_within_chunk);
+            indices_within_chunk.clear();
+        }
+
+        current_chunk_idx = new_chunk_idx;
+        indices_within_chunk.push((idx + offset) % 1024);
+    }
+    // Hand over the trailing group.
+    if !indices_within_chunk.is_empty() {
+        chunk_fn(current_chunk_idx, &indices_within_chunk);
+    }
+}
@@ -30,10 +30,12 @@ use vortex_vector::VectorMutOps;
 
 use crate::BitPackedArray;
 use crate::bitpack_decompress::unpack_to_primitive_vector;
+use crate::bitpacking::vtable::kernels::filter::PARENT_KERNELS;
 
 mod array;
 mod canonical;
 mod encode;
+mod kernels;
 mod operations;
 mod validity;
 mod visitor;
@@ -246,6 +248,15 @@ impl VTable for BitPackedVTable {
     fn execute(array: &Self::Array, _ctx: &mut ExecutionCtx) -> VortexResult<Vector> {
         Ok(unpack_to_primitive_vector(array).freeze().into())
     }
+
+    fn execute_parent(
+        array: &Self::Array,
+        parent: &ArrayRef,
+        child_idx: usize,
+        ctx: &mut ExecutionCtx,
+    ) -> VortexResult<Option<Vector>> {
+        PARENT_KERNELS.execute(array, parent, child_idx, ctx)
+    }
 }
 
 #[derive(Debug)]
 
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
+use vortex_array::Array;
 use vortex_array::ArrayRef;
 use vortex_array::IntoArray;
 use vortex_array::arrays::FilterArray;
@@ -34,8 +35,7 @@ impl ArrayParentReduceRule<FoRVTable> for FoRFilterPushDownRule {
     ) -> VortexResult<Option<ArrayRef>> {
         let new_array = unsafe {
             FoRArray::new_unchecked(
-                FilterArray::new(child.encoded().clone(), parent.filter_mask().clone())
-                    .into_array(),
+                child.encoded.filter(parent.filter_mask().clone())?,
                 child.reference.clone(),
             )
         };
 
@@ -330,6 +330,7 @@ impl ZstdArray {
                 .get(i + 1)
                 .copied()
                 .unwrap_or(value_bytes.len());
+
             let uncompressed = &value_bytes.slice(frame_byte_starts[i]..frame_byte_end);
             let compressed = compressor
                 .compress(uncompressed)
@@ -366,8 +367,12 @@ impl ZstdArray {
         };
 
         let value_bytes = values.byte_buffer();
+        // Align frames to buffer alignment. This is necessary for overaligned buffers.
+        let alignment = *value_bytes.alignment();
+        let step_width = (values_per_frame * byte_width).div_ceil(alignment) * alignment;
+
         let frame_byte_starts = (0..n_values * byte_width)
-            .step_by(values_per_frame * byte_width)
+            .step_by(step_width)
             .collect::<Vec<_>>();
         let Frames {
             dictionary,
 
@@ -9,6 +9,7 @@ use vortex_array::arrays::VarBinViewArray;
 use vortex_array::assert_arrays_eq;
 use vortex_array::validity::Validity;
 use vortex_array::vtable::ValidityHelper;
+use vortex_buffer::Alignment;
 use vortex_buffer::Buffer;
 use vortex_dtype::DType;
 use vortex_dtype::Nullability;
@@ -202,3 +203,16 @@ fn test_sliced_array_children() {
     let sliced = compressed.slice(0..4);
     sliced.children();
 }
+
+/// Tests that ZSTD-compressing a primitive array succeeds when its buffer is over-aligned, which
+/// requires every frame to start at an offset that matches the buffer alignment.
+#[test]
+fn test_zstd_frame_start_buffer_alignment() {
+    let data = vec![0u8; 2];
+    let aligned_buffer = Buffer::copy_from_aligned(&data, Alignment::new(8));
+    // The u8 array now has an 8-byte alignment.
+    let array = PrimitiveArray::new(aligned_buffer, Validity::NonNullable);
+    let compressed = ZstdArray::from_primitive(&array, 0, 1);
+
+    assert!(compressed.is_ok());
+}