Commit c731f04

add AVX512 support for filtering in place
Signed-off-by: Connor Tsui <[email protected]>
1 parent 3df9296 commit c731f04

11 files changed, +796 -69 lines changed

Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default.

vortex-compute/Cargo.toml

Lines changed: 6 additions & 0 deletions
@@ -37,6 +37,8 @@ arrow = ["dep:arrow-array", "dep:arrow-buffer", "dep:arrow-schema"]
 
 [dev-dependencies]
 divan = { workspace = true }
+itertools = { workspace = true }
+rand = { workspace = true }
 
 [[bench]]
 name = "filter_buffer_mut"
@@ -45,3 +47,7 @@ harness = false
 [[bench]]
 name = "expand_buffer"
 harness = false
+
+[[bench]]
+name = "avx512"
+harness = false
vortex-compute/benches/avx512.rs

Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

#![expect(clippy::cast_possible_truncation)]

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
use filter_in_place::filter_in_place_avx512;
use itertools::Itertools;
use rand::Rng;
use vortex_compute::filter::slice::in_place::filter_in_place_scalar;

fn main() {
    divan::main();
}

// Create a random mask where each bit has `probability` chance of being set.
fn create_random_mask(size: usize, probability: f64) -> Vec<u8> {
    let mut rng = rand::rng();
    let num_bytes = size.div_ceil(8);
    let mut mask = Vec::with_capacity(num_bytes);

    for _ in 0..num_bytes {
        let mut byte = 0u8;
        for bit in 0..8 {
            if rng.random::<f64>() < probability {
                byte |= 1 << bit;
            }
        }
        mask.push(byte);
    }

    mask
}

// Benchmark different data sizes.
const SIZES: &[usize] = &[1 << 10, 1 << 14, 1 << 17];

// Different probability values to benchmark.
const PROBABILITIES: &[f64] = &[0.0, 0.1, 0.25, 0.5, 0.75, 0.9, 1.0];

#[divan::bench_group]
mod filter_scalar_i32 {
    use super::*;

    #[divan::bench(sample_size = 64, args = SIZES.iter().copied().cartesian_product(PROBABILITIES.iter().copied()))]
    fn random_probability(bencher: divan::Bencher, (size, probability): (usize, f64)) {
        let mask = create_random_mask(size, probability);
        bencher
            .with_inputs(|| (0..size as i32).collect::<Vec<_>>())
            .bench_values(|mut data| filter_in_place_scalar(&mut data, &mask))
    }
}

#[divan::bench_group]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
mod filter_avx512_i32 {
    use super::*;

    #[divan::bench(sample_size = 64, args = SIZES.iter().copied().cartesian_product(PROBABILITIES.iter().copied()))]
    fn random_probability(bencher: divan::Bencher, (size, probability): (usize, f64)) {
        let mask = create_random_mask(size, probability);
        bencher
            .with_inputs(|| (0..size as i32).collect::<Vec<_>>())
            .bench_values(|mut data| unsafe { filter_in_place_avx512(&mut data, &mask) })
    }
}

// Throughput benchmark - measure GB/s
#[divan::bench_group]
mod throughput {
    use super::*;

    const LARGE_SIZE: usize = 1024 * 1024; // 4 MB

    #[divan::bench(sample_size = 16, args = PROBABILITIES)]
    fn scalar_throughput(bencher: divan::Bencher, probability: f64) {
        let mask = create_random_mask(LARGE_SIZE, probability);
        bencher
            .counter(divan::counter::BytesCount::new(LARGE_SIZE * 4))
            .with_inputs(|| (0..LARGE_SIZE as i32).collect::<Vec<_>>())
            .bench_values(|mut data| filter_in_place_scalar(&mut data, &mask))
    }

    #[divan::bench(sample_size = 16, args = PROBABILITIES)]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn avx512_throughput(bencher: divan::Bencher, probability: f64) {
        let mask = create_random_mask(LARGE_SIZE, probability);
        bencher
            .counter(divan::counter::BytesCount::new(LARGE_SIZE * 4))
            .with_inputs(|| (0..LARGE_SIZE as i32).collect::<Vec<_>>())
            .bench_values(|mut data| unsafe { filter_in_place_avx512(&mut data, &mask) })
    }
}
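
The scalar baseline filter_in_place_scalar is referenced here but is not part of this diff. As a rough mental model only (a hypothetical sketch, not the crate's actual implementation), an in-place scalar filter walks the mask bit by bit and compacts the selected elements to the front, returning the number kept:

// Hypothetical sketch of a scalar in-place filter (not the crate's actual code).
fn scalar_filter_sketch<T: Copy>(data: &mut [T], mask: &[u8]) -> usize {
    assert_eq!(mask.len(), data.len().div_ceil(8));
    let mut write_pos = 0;
    for read_idx in 0..data.len() {
        // The LSB of each mask byte corresponds to the lowest-indexed element it covers.
        if (mask[read_idx / 8] >> (read_idx % 8)) & 1 == 1 {
            data[write_pos] = data[read_idx];
            write_pos += 1;
        }
    }
    write_pos
}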

vortex-compute/src/filter/mod.rs

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 mod bitbuffer;
 mod buffer;
 mod mask;
-mod slice_mut;
+pub mod slice;
 mod vector;
 
 /// Function for filtering based on a selection mask.

Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
use std::arch::x86_64::*;

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
use crate::filter::slice::SimdCompress;
use crate::filter::slice::in_place::filter_in_place_scalar;

/// Filter a mutable slice of elements in-place depending on the given mask.
///
/// The mask is represented as a slice of bytes (LSB is the first element).
///
/// Returns the true count of the mask.
///
/// This function automatically dispatches to the most efficient implementation based on the
/// CPU features detected at runtime.
///
/// # Panics
///
/// Panics if `mask.len() != data.len().div_ceil(8)`.
#[inline]
pub fn filter_in_place<T: SimdCompress>(data: &mut [T], mask: &[u8]) -> usize {
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        let use_simd = if T::WIDTH >= 32 {
            // 32-bit and 64-bit types only need AVX-512F.
            is_x86_feature_detected!("avx512f")
        } else {
            // 8-bit and 16-bit types need both AVX-512F and AVX-512VBMI2.
            is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vbmi2")
        };

        if use_simd {
            return unsafe { filter_in_place_avx512(data, mask) };
        }
    }

    // Fall back to the scalar implementation on non-x86 targets or when AVX-512 is not available.
    filter_in_place_scalar(data, mask)
}

/// Filter a mutable slice of elements in-place depending on the given mask.
///
/// The mask is represented as a slice of bytes (LSB is the first element).
///
/// Returns the true count of the mask.
///
/// This function uses AVX-512 SIMD instructions for high-performance filtering.
///
/// # Panics
///
/// Panics if `mask.len() != data.len().div_ceil(8)`.
///
/// # Safety
///
/// The caller must ensure the required AVX-512 instruction sets are available:
/// AVX-512F for 32-bit and 64-bit element types, and additionally AVX-512VBMI2
/// for 8-bit and 16-bit element types.
#[inline]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx512f,avx512vbmi2,popcnt")]
pub unsafe fn filter_in_place_avx512<T: SimdCompress>(data: &mut [T], mask: &[u8]) -> usize {
    assert_eq!(
        mask.len(),
        data.len().div_ceil(8),
        "Mask length must be data.len().div_ceil(8)"
    );

    let data_len = data.len();
    let mut write_pos = 0;

    // Pre-calculate loop bounds to eliminate branch misprediction in the hot loop.
    let full_chunks = data_len / T::ELEMENTS_PER_VECTOR;
    let remainder = data_len % T::ELEMENTS_PER_VECTOR;

    // Process full chunks with no branches in the loop.
    for chunk_idx in 0..full_chunks {
        let read_pos = chunk_idx * T::ELEMENTS_PER_VECTOR;
        let mask_byte_offset = chunk_idx * T::MASK_BYTES;

        // Read the mask for this chunk.
        // SAFETY: `mask_byte_offset + T::MASK_BYTES <= mask.len()` for all full chunks.
        let mask_value = unsafe { T::read_mask(mask.as_ptr(), mask_byte_offset) };

        // Load elements into the SIMD register.
        // SAFETY: `read_pos + T::ELEMENTS_PER_VECTOR <= data.len()` for all full chunks.
        let vector = unsafe { _mm512_loadu_si512(data.as_ptr().add(read_pos) as *const __m512i) };

        // Move all elements whose bit is set in the mask value toward the front of the vector.
        let filtered = unsafe { T::compress_vector(mask_value, vector) };

        // Write the filtered result vector back to memory.
        // SAFETY: `write_pos <= read_pos`, so `write_pos + T::ELEMENTS_PER_VECTOR <= data.len()`
        // and the full 512-bit store stays in bounds.
        unsafe { _mm512_storeu_si512(data.as_mut_ptr().add(write_pos) as *mut __m512i, filtered) };

        // Uses the hardware `popcnt` instruction if available.
        let count = T::count_ones(mask_value);
        write_pos += count;
    }

    // Handle the final partial chunk with simple scalar processing.
    let read_pos = full_chunks * T::ELEMENTS_PER_VECTOR;
    for i in 0..remainder {
        let read_idx = read_pos + i;
        let bit_idx = read_idx % 8;
        let byte_idx = read_idx / 8;

        if (mask[byte_idx] >> bit_idx) & 1 == 1 {
            data[write_pos] = data[read_idx];
            write_pos += 1;
        }
    }

    write_pos
}
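
To make the mask convention concrete, here is a small usage sketch (assuming a type such as i32 implements SimdCompress, as the i32 benchmarks suggest; the import of filter_in_place is omitted since the exact module path may differ):

// Usage sketch, not part of the commit: one selection bit per element, LSB first.
let mut data: Vec<i32> = (0..10).collect();
//   byte 0 = 0b0000_0101 -> keep elements 0 and 2
//   byte 1 = 0b0000_0010 -> keep element 9
let mask = [0b0000_0101u8, 0b0000_0010u8];

let kept = filter_in_place(&mut data, &mask);
data.truncate(kept);
assert_eq!(data, vec![0, 2, 9]);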
