
Commit 8bdaf7a

add AVX512 support for filtering in place
Signed-off-by: Connor Tsui <[email protected]>
1 parent cff218e commit 8bdaf7a

4 files changed: +607 −0 lines changed
Lines changed: 309 additions & 0 deletions
@@ -0,0 +1,309 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

#![cfg(any(target_arch = "x86", target_arch = "x86_64"))]

use std::arch::x86_64::*;

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
use crate::filter::avx512::SimdCompress;

/// Filter a mutable slice of elements in-place depending on the given mask.
///
/// The mask is represented as a slice of bytes (LSB is the first element).
///
/// Returns the true count of the mask.
///
/// This function automatically dispatches to the most efficient implementation based on the
/// CPU features detected at runtime.
///
/// # Panics
///
/// Panics if `mask.len() != data.len().div_ceil(8)`.
#[inline]
pub fn filter_in_place<T: SimdCompress>(data: &mut [T], mask: &[u8]) -> usize {
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        let use_simd = if T::WIDTH >= 32 {
            // 32-bit and 64-bit types only need AVX-512F.
            is_x86_feature_detected!("avx512f")
        } else {
            // 8-bit and 16-bit types need both AVX-512F and AVX-512VBMI2.
            is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vbmi2")
        };

        if use_simd {
            return unsafe { filter_in_place_avx512(data, mask) };
        }
    }

    // Fall back to the scalar implementation for non-x86 targets or when AVX-512 is not available.
    filter_in_place_scalar(data, mask)
}

/// Filter a mutable slice of elements in-place depending on the given mask.
///
/// The mask is represented as a slice of bytes (LSB is the first element).
///
/// Returns the true count of the mask.
///
/// This scalar implementation walks the slice with separate read and write indices, copying
/// kept values toward the front of the slice.
///
/// # Panics
///
/// Panics if `mask.len() != data.len().div_ceil(8)`.
#[inline]
pub fn filter_in_place_scalar<T: Copy>(data: &mut [T], mask: &[u8]) -> usize {
    assert_eq!(
        mask.len(),
        data.len().div_ceil(8),
        "Mask length must be data.len().div_ceil(8)"
    );

    let mut write_pos = 0;
    let data_len = data.len();

    for read_pos in 0..data_len {
        let byte_idx = read_pos / 8;
        let bit_idx = read_pos % 8;

        if (mask[byte_idx] >> bit_idx) & 1 == 1 {
            data[write_pos] = data[read_pos];
            write_pos += 1;
        }
    }

    write_pos
}

/// Filter a mutable slice of elements in-place depending on the given mask.
///
/// The mask is represented as a slice of bytes (LSB is the first element).
///
/// Returns the true count of the mask.
///
/// This function uses AVX-512 SIMD instructions for high-performance filtering.
///
/// # Panics
///
/// Panics if `mask.len() != data.len().div_ceil(8)`.
///
/// # Safety
///
/// The required AVX-512 instruction sets must be available on the running CPU:
/// 32-bit and 64-bit element types require AVX-512F, while 8-bit and 16-bit element types
/// additionally require AVX-512VBMI2.
#[inline]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx512f,avx512vbmi2,popcnt")]
pub unsafe fn filter_in_place_avx512<T: SimdCompress>(data: &mut [T], mask: &[u8]) -> usize {
    assert_eq!(
        mask.len(),
        data.len().div_ceil(8),
        "Mask length must be data.len().div_ceil(8)"
    );

    let data_len = data.len();
    let mut write_pos = 0;

    // Pre-calculate loop bounds to eliminate branch misprediction in the hot loop.
    let full_chunks = data_len / T::ELEMENTS_PER_VECTOR;
    let remainder = data_len % T::ELEMENTS_PER_VECTOR;

    // Process full chunks with no branches in the loop.
    for chunk_idx in 0..full_chunks {
        let read_pos = chunk_idx * T::ELEMENTS_PER_VECTOR;
        let mask_byte_offset = chunk_idx * T::MASK_BYTES;

        // Read the mask for this chunk.
        // SAFETY: `mask_byte_offset + T::MASK_BYTES <= mask.len()` for all full chunks.
        let mask_value = unsafe { T::read_mask(mask.as_ptr(), mask_byte_offset) };

        // Load elements into the SIMD register.
        // SAFETY: `read_pos + T::ELEMENTS_PER_VECTOR <= data.len()` for all full chunks.
        let vector = unsafe { _mm512_loadu_si512(data.as_ptr().add(read_pos) as *const __m512i) };

        // Move all elements whose bit is set in the mask value toward the front of the vector.
        let filtered = unsafe { T::compress_vector(mask_value, vector) };

        // Write the filtered result vector back to memory.
        // SAFETY: the full-vector store stays in bounds because `write_pos <= read_pos` and
        // `read_pos + T::ELEMENTS_PER_VECTOR <= data.len()` for all full chunks.
        unsafe { _mm512_storeu_si512(data.as_mut_ptr().add(write_pos) as *mut __m512i, filtered) };

        // Uses the hardware `popcnt` instruction if available.
        let count = T::count_ones(mask_value);
        write_pos += count;
    }

    // Handle the final partial chunk with simple scalar processing.
    let read_pos = full_chunks * T::ELEMENTS_PER_VECTOR;
    for i in 0..remainder {
        let read_idx = read_pos + i;
        let bit_idx = read_idx % 8;
        let byte_idx = read_idx / 8;

        if (mask[byte_idx] >> bit_idx) & 1 == 1 {
            data[write_pos] = data[read_idx];
            write_pos += 1;
        }
    }

    write_pos
}

#[cfg(test)]
mod tests {
    use super::*;

    fn create_mask(bits: &[bool]) -> Vec<u8> {
        let mut mask = vec![0u8; bits.len().div_ceil(8)];
        for (i, &bit) in bits.iter().enumerate() {
            if bit {
                mask[i / 8] |= 1 << (i % 8);
            }
        }
        mask
    }

    fn test_implementation<F>(filter_fn: F)
    where
        F: Fn(&mut [i32], &[u8]) -> usize,
    {
        // Test 1: Small array - all elements pass
        let mut data = vec![0, 1, 2, 3, 4, 5, 6, 7];
        let mask = vec![0xFF]; // All 1s
        let count = filter_fn(&mut data, &mask);
        assert_eq!(count, 8);
        assert_eq!(&data[..8], &[0, 1, 2, 3, 4, 5, 6, 7]);

        // Test 2: Small array - no elements pass
        let mut data = vec![0, 1, 2, 3, 4, 5, 6, 7];
        let mask = vec![0x00]; // All 0s
        let count = filter_fn(&mut data, &mask);
        assert_eq!(count, 0);

        // Test 3: Small array - every other element
        let mut data = vec![0, 1, 2, 3, 4, 5, 6, 7];
        let mask = vec![0x55]; // 01010101
        let count = filter_fn(&mut data, &mask);
        assert_eq!(count, 4);
        assert_eq!(&data[..4], &[0, 2, 4, 6]);

        // Test 4: 16 elements - all pass
        let mut data: Vec<i32> = (0..16).collect();
        let mask = vec![0xFF, 0xFF];
        let count = filter_fn(&mut data, &mask);
        assert_eq!(count, 16);
        assert_eq!(
            &data[..16],
            &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
        );

        // Test 5: 16 elements - alternating pattern
        let mut data: Vec<i32> = (0..16).collect();
        let mask = vec![0xAA, 0xAA]; // 10101010 10101010
        let count = filter_fn(&mut data, &mask);
        assert_eq!(count, 8);
        assert_eq!(&data[..8], &[1, 3, 5, 7, 9, 11, 13, 15]);

        // Test 6: Larger array (32 elements)
        let mut data: Vec<i32> = (0..32).collect();
        let mask = vec![0xFF, 0x00, 0xFF, 0x00]; // First and third bytes
        let count = filter_fn(&mut data, &mask);
        assert_eq!(count, 16);
        assert_eq!(
            &data[..16],
            &[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23]
        );

        // Test 7: Non-aligned size (23 elements)
        let mut data: Vec<i32> = (0..23).collect();
        let mask = create_mask(&[
            true, false, true, false, true, false, true, false, // byte 0
            false, true, false, true, false, true, false, true, // byte 1
            true, true, false, false, true, true, false, // byte 2 (partial)
        ]);
        let count = filter_fn(&mut data, &mask);
        assert_eq!(count, 12);
        assert_eq!(&data[..12], &[0, 2, 4, 6, 9, 11, 13, 15, 16, 17, 20, 21]);
    }

    #[test]
    fn test_scalar() {
        test_implementation(filter_in_place_scalar::<i32>);
    }

    #[test]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    fn test_avx512() {
        test_implementation(|data, mask| unsafe { filter_in_place_avx512::<i32>(data, mask) });
    }

    #[test]
    fn test_runtime_dispatch() {
        test_implementation(filter_in_place::<i32>);
    }

    #[test]
    fn test_all_implementations_match() {
        // Test that all available implementations produce the same results.

        // Test various sizes and patterns.
        let test_cases = vec![
            (8, vec![0xAA]),                    // 8 elements, alternating
            (16, vec![0xFF, 0xFF]),             // 16 elements, all pass
            (16, vec![0x00, 0x00]),             // 16 elements, none pass
            (32, vec![0x55, 0x55, 0x55, 0x55]), // 32 elements, alternating
            (24, vec![0xFF, 0x00, 0xFF]),       // 24 elements, mixed
            (100, vec![0xFF; 13]),              // 100 elements (needs 13 bytes)
        ];

        for (size, mask) in test_cases {
            let mut data_scalar: Vec<i32> = (0..size).collect();

            let count_scalar = filter_in_place_scalar::<i32>(&mut data_scalar, &mask);

            // Test AVX-512 on x86/x86_64.
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            {
                let mut data_avx512: Vec<i32> = (0..size).collect();
                let count_avx512 =
                    unsafe { filter_in_place_avx512::<i32>(&mut data_avx512, &mask) };
                assert_eq!(
                    count_scalar, count_avx512,
                    "Count mismatch for size {}",
                    size
                );
                assert_eq!(
                    &data_scalar[..count_scalar],
                    &data_avx512[..count_avx512],
                    "Data mismatch for size {}",
                    size
                );
            }
        }
    }

    #[expect(clippy::cast_possible_truncation)]
    #[test]
    fn test_large_arrays() {
        // Test with very large arrays to ensure chunking works correctly.
        let sizes: Vec<usize> = vec![1024, 1000, 2048, 4096, 10000];

        for size in sizes {
            let mut data: Vec<i32> = (0..size as i32).collect();
            // Create an alternating mask.
            let mut mask = vec![0u8; size.div_ceil(8)];
            mask.fill(0x55); // 01010101

            let count = filter_in_place::<i32>(&mut data, &mask);
            assert_eq!(count, size / 2);

            // Verify the first few kept elements.
            (0..10.min(count)).for_each(|i| {
                assert_eq!(data[i], (i * 2) as i32);
            });
        }
    }
}
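
For reference, here is a minimal caller-side sketch of the new entry point. It is illustrative only and not part of this commit; `filter_in_place`, the `i32` support, and the LSB-first byte mask layout come from the diff above, while the surrounding function and the concrete values are made up.

// Illustrative usage, not part of this commit. Assumes this code lives inside
// the crate, so the re-export from `crate::filter::avx512` is in scope.
use crate::filter::avx512::filter_in_place;

fn keep_even_indices() {
    let mut data: Vec<i32> = (0..10).collect();
    // 10 elements need ceil(10 / 8) = 2 mask bytes; the LSB of each byte maps to
    // the first element that byte covers.
    let mask = [0b0101_0101u8, 0b0000_0001];
    let kept = filter_in_place(&mut data, &mask);
    // Elements at even indices survive and are packed to the front of the slice.
    assert_eq!(kept, 5);
    assert_eq!(&data[..kept], &[0, 2, 4, 6, 8]);
}

The call dispatches to the AVX-512 path at runtime when the CPU supports it and otherwise falls back to `filter_in_place_scalar`. With `i32` elements each AVX-512 chunk covers 512 / 32 = 16 values and consumes 16 / 8 = 2 mask bytes, so a 100-element input is handled as 6 full vector chunks plus a 4-element scalar tail.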
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

// TODO(connor): Refactor this module and add filter not in place + up front scalar fallback.

#![expect(unused)] // TODO(connor): Remove
#![cfg(any(target_arch = "x86", target_arch = "x86_64"))]

mod in_place;
pub use in_place::*;

mod simd_compress;
pub use simd_compress::SimdCompress;
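
The `simd_compress` module added by this commit is not shown on this page, so the actual trait definition is not visible here. Purely as a reading aid, below is a rough sketch of what `SimdCompress` could look like for `i32`, inferred from how `in_place.rs` uses it; the associated `Mask` type, the constant values, and the choice of `_mm512_mask_compress_epi32` are assumptions rather than the committed code.

// Hypothetical reconstruction for illustration only; the real trait lives in the
// simd_compress module of this commit and may differ in shape and detail.
use std::arch::x86_64::*;

pub trait SimdCompress: Copy {
    /// Element width in bits (8, 16, 32, or 64).
    const WIDTH: usize;
    /// Elements per 512-bit vector (512 / WIDTH).
    const ELEMENTS_PER_VECTOR: usize;
    /// Mask bytes consumed per vector (ELEMENTS_PER_VECTOR / 8).
    const MASK_BYTES: usize;
    /// Hardware mask type for this element width (an assumed associated type).
    type Mask: Copy;

    /// Read `MASK_BYTES` bytes of the bitmask starting at `offset`.
    unsafe fn read_mask(mask: *const u8, offset: usize) -> Self::Mask;
    /// Pack the lanes selected by `mask` toward the front of the vector.
    unsafe fn compress_vector(mask: Self::Mask, vector: __m512i) -> __m512i;
    /// Number of set bits in the mask.
    fn count_ones(mask: Self::Mask) -> usize;
}

impl SimdCompress for i32 {
    const WIDTH: usize = 32;
    const ELEMENTS_PER_VECTOR: usize = 16; // 512 / 32
    const MASK_BYTES: usize = 2; // 16 / 8
    type Mask = __mmask16;

    unsafe fn read_mask(mask: *const u8, offset: usize) -> __mmask16 {
        // Two little-endian mask bytes cover the 16 lanes of one vector.
        // SAFETY: the caller guarantees `offset + 2` bytes are in bounds of the mask buffer.
        unsafe { u16::from_le_bytes([*mask.add(offset), *mask.add(offset + 1)]) }
    }

    unsafe fn compress_vector(mask: __mmask16, vector: __m512i) -> __m512i {
        // Selected 32-bit lanes are packed toward lane 0; unselected lanes come from the zero source.
        // SAFETY: the caller guarantees AVX-512F is available.
        unsafe { _mm512_mask_compress_epi32(_mm512_setzero_si512(), mask, vector) }
    }

    fn count_ones(mask: __mmask16) -> usize {
        mask.count_ones() as usize
    }
}

For 8-bit and 16-bit elements the analogous compress intrinsics (`_mm512_mask_compress_epi8` and `_mm512_mask_compress_epi16`) belong to AVX-512VBMI2, which is why the dispatcher in `in_place.rs` additionally checks that feature for narrow types.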
