Skip to content

Commit c5162cc

Browse files
committed
add AVX512 support for filtering out of place
Signed-off-by: Connor Tsui <[email protected]>
1 parent 9229453 commit c5162cc

File tree

5 files changed

+384
-22
lines changed

5 files changed

+384
-22
lines changed

vortex-compute/benches/avx512.rs

Lines changed: 31 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@ use rand::Rng;
88
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
99
use vortex_compute::filter::slice::in_place::avx512::filter_in_place_avx512;
1010
use vortex_compute::filter::slice::in_place::filter_in_place_scalar;
11+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
12+
use vortex_compute::filter::slice::out::avx512::filter_into_avx512;
13+
use vortex_compute::filter::slice::out::filter_into_scalar;
1114

1215
fn main() {
1316
divan::main();
@@ -32,46 +35,53 @@ fn create_random_mask(size: usize, probability: f64) -> Vec<u8> {
3235
mask
3336
}
3437

35-
// Benchmark different data sizes.
36-
const SIZES: &[usize] = &[1 << 10, 1 << 14, 1 << 17];
38+
/// Benchmark different data sizes.
39+
const SIZES: &[usize] = &[1 << 10, 1 << 11, 1 << 14, 1 << 17];
3740

38-
// Different probability values to benchmark.
41+
/// Different probability values to benchmark.
3942
const PROBABILITIES: &[f64] = &[0.0, 0.1, 0.25, 0.5, 0.75, 0.9, 1.0];
4043

41-
#[divan::bench(sample_size = 64, args = SIZES.iter().copied().cartesian_product(PROBABILITIES.iter().copied()))]
42-
fn random_probability_scalar(bencher: divan::Bencher, (size, probability): (usize, f64)) {
44+
/// The number of samples per benchmark.
45+
const SAMPLE_SIZE: u32 = 64;
46+
47+
#[divan::bench(sample_size = SAMPLE_SIZE, args = SIZES.iter().copied().cartesian_product(PROBABILITIES.iter().copied()))]
48+
fn in_place_scalar(bencher: divan::Bencher, (size, probability): (usize, f64)) {
4349
let mask = create_random_mask(size, probability);
4450
bencher
4551
.with_inputs(|| (0..size as i32).collect::<Vec<_>>())
4652
.bench_values(|mut data| filter_in_place_scalar(&mut data, &mask))
4753
}
4854

4955
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
50-
#[divan::bench(sample_size = 64, args = SIZES.iter().copied().cartesian_product(PROBABILITIES.iter().copied()))]
51-
fn random_probability_avx512(bencher: divan::Bencher, (size, probability): (usize, f64)) {
56+
#[divan::bench(sample_size = SAMPLE_SIZE, args = SIZES.iter().copied().cartesian_product(PROBABILITIES.iter().copied()))]
57+
fn in_place_avx512(bencher: divan::Bencher, (size, probability): (usize, f64)) {
5258
let mask = create_random_mask(size, probability);
5359
bencher
5460
.with_inputs(|| (0..size as i32).collect::<Vec<_>>())
5561
.bench_values(|mut data| unsafe { filter_in_place_avx512(&mut data, &mask) })
5662
}
5763

58-
const LARGE_SIZE: usize = 1024 * 1024; // 4 MB
59-
60-
#[divan::bench(sample_size = 16, args = PROBABILITIES)]
61-
fn scalar_throughput(bencher: divan::Bencher, probability: f64) {
62-
let mask = create_random_mask(LARGE_SIZE, probability);
64+
#[divan::bench(sample_size = SAMPLE_SIZE, args = SIZES.iter().copied().cartesian_product(PROBABILITIES.iter().copied()))]
65+
fn out_scalar(bencher: divan::Bencher, (size, probability): (usize, f64)) {
66+
let mask = create_random_mask(size, probability);
6367
bencher
64-
.counter(divan::counter::BytesCount::new(LARGE_SIZE * 4))
65-
.with_inputs(|| (0..LARGE_SIZE as i32).collect::<Vec<_>>())
66-
.bench_values(|mut data| filter_in_place_scalar(&mut data, &mask))
68+
.with_inputs(|| {
69+
let src = (0..size as i32).collect::<Vec<_>>();
70+
let dest = vec![0i32; size];
71+
(src, dest)
72+
})
73+
.bench_values(|(src, mut dest)| filter_into_scalar(&src, &mut dest, &mask))
6774
}
6875

69-
#[divan::bench(sample_size = 16, args = PROBABILITIES)]
7076
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
71-
fn avx512_throughput(bencher: divan::Bencher, probability: f64) {
72-
let mask = create_random_mask(LARGE_SIZE, probability);
77+
#[divan::bench(sample_size = SAMPLE_SIZE, args = SIZES.iter().copied().cartesian_product(PROBABILITIES.iter().copied()))]
78+
fn out_avx512(bencher: divan::Bencher, (size, probability): (usize, f64)) {
79+
let mask = create_random_mask(size, probability);
7380
bencher
74-
.counter(divan::counter::BytesCount::new(LARGE_SIZE * 4))
75-
.with_inputs(|| (0..LARGE_SIZE as i32).collect::<Vec<_>>())
76-
.bench_values(|mut data| unsafe { filter_in_place_avx512(&mut data, &mask) })
81+
.with_inputs(|| {
82+
let src = (0..size as i32).collect::<Vec<_>>();
83+
let dest = vec![0i32; size];
84+
(src, dest)
85+
})
86+
.bench_values(|(src, mut dest)| unsafe { filter_into_avx512(&src, &mut dest, &mask) })
7787
}

vortex-compute/src/filter/slice/in_place/avx512.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ use crate::filter::slice::in_place::filter_in_place_scalar;
1414
///
1515
/// The mask is represented as a slice of bytes (LSB is the first element).
1616
///
17-
/// Returns the true count of the mask.
17+
/// Returns the true count of the mask (number of elements remaining).
1818
///
1919
/// This function automatically dispatches to the most efficient implementation based on the
2020
/// available CPU features at compile time.

vortex-compute/src/filter/slice/in_place/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
77
pub mod avx512;
88

9+
// TODO(connor): This is super inefficient.
910
/// Filter a mutable slice of elements in-place depending on the given mask.
1011
///
1112
/// The mask is represented as a slice of bytes (LSB is the first element).
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
//! Implementations of a specialized out-of-place filter for buffers using AVX512.
5+
6+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
7+
use std::arch::x86_64::*;
8+
9+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
10+
use crate::filter::slice::SimdCompress;
11+
use crate::filter::slice::out::filter_into_scalar;
12+
13+
/// Filter elements from a source slice into a destination slice based on the given mask.
14+
///
15+
/// The mask is represented as a slice of bytes (LSB is the first element).
16+
///
17+
/// Returns the true count of the mask (number of elements written to destination).
18+
///
19+
/// This function automatically dispatches to the most efficient implementation based on the
20+
/// available CPU features at compile time.
21+
///
22+
/// # Panics
23+
///
24+
/// Panics if:
25+
///
26+
/// - `mask.len() != src.len().div_ceil(8)`
27+
/// - `dest.len() < src.len()`
28+
#[inline]
29+
pub fn filter_into<T: SimdCompress>(src: &[T], dest: &mut [T], mask: &[u8]) -> usize {
30+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
31+
{
32+
let use_simd = if T::WIDTH >= 32 {
33+
// 32-bit and 64-bit types only need AVX-512F.
34+
is_x86_feature_detected!("avx512f")
35+
} else {
36+
// 8-bit and 16-bit types need both AVX-512F and AVX-512VBMI2.
37+
is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vbmi2")
38+
};
39+
40+
if use_simd {
41+
return unsafe { filter_into_avx512(src, dest, mask) };
42+
}
43+
}
44+
45+
// Fall back to scalar implementation for non-x86 or when SIMD not available.
46+
filter_into_scalar(src, dest, mask)
47+
}
/// Filter elements from a source slice into a destination slice based on the given mask.
///
/// The mask is represented as a slice of bytes (LSB is the first element).
///
/// Returns the true count of the mask (number of elements written to destination).
///
/// This function uses AVX-512 SIMD instructions for high-performance filtering. Note that
/// `dest` positions at and beyond the returned count may be overwritten with garbage lanes
/// from the final full-vector store; only the first `return`ed elements are meaningful.
///
/// # Panics
///
/// Panics if:
///
/// - `mask.len() != src.len().div_ceil(8)`
/// - `dest.len() < src.len()`
///
/// # Safety
///
/// This function requires the appropriate SIMD instruction set to be available.
/// For AVX-512F types, the CPU must support AVX-512F.
/// For AVX-512VBMI2 types, the CPU must support AVX-512VBMI2.
/// Because this function is compiled with `target_feature(enable = "avx512f,avx512vbmi2,popcnt")`,
/// the caller must ensure ALL of those features (including `popcnt`) are available, independent
/// of the element width.
#[inline]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx512f,avx512vbmi2,popcnt")]
pub unsafe fn filter_into_avx512<T: SimdCompress>(src: &[T], dest: &mut [T], mask: &[u8]) -> usize {
    // Precondition: exactly one mask bit per source element, packed LSB-first.
    assert_eq!(
        mask.len(),
        src.len().div_ceil(8),
        "Mask length must be src.len().div_ceil(8)"
    );
    // Precondition: `dest` can hold every element even when the mask is all-ones; this
    // bound is also what makes the full-width vector stores below in-bounds.
    assert!(
        dest.len() >= src.len(),
        "Destination buffer must be at least as large as source"
    );

    let src_len = src.len();
    // Next free index in `dest`; also the running true count of the mask bits consumed so far.
    let mut write_pos = 0;

    // Pre-calculate loop bounds to eliminate branch misprediction in the hot loop.
    let full_chunks = src_len / T::ELEMENTS_PER_VECTOR;
    let remainder = src_len % T::ELEMENTS_PER_VECTOR;

    // Process full chunks with no branches in the loop.
    for chunk_idx in 0..full_chunks {
        let read_pos = chunk_idx * T::ELEMENTS_PER_VECTOR;
        let mask_byte_offset = chunk_idx * T::MASK_BYTES;

        // Read the mask for this chunk.
        // SAFETY: `mask_byte_offset + T::MASK_BYTES <= mask.len()` for all full chunks
        // (assumes `T::MASK_BYTES * 8 == T::ELEMENTS_PER_VECTOR` — TODO(review): confirm
        // this invariant is documented on `SimdCompress`).
        let mask_value = unsafe { T::read_mask(mask.as_ptr(), mask_byte_offset) };

        // Load elements from source into the SIMD register.
        // SAFETY: `read_pos + T::ELEMENTS_PER_VECTOR <= src.len()` for all full chunks.
        let vector = unsafe { _mm512_loadu_si512(src.as_ptr().add(read_pos) as *const __m512i) };

        // Moves all elements that have their bit set to 1 in the mask value to the left.
        let filtered = unsafe { T::compress_vector(mask_value, vector) };

        // Write the filtered result vector to the destination buffer. This store always
        // writes the FULL vector width (`T::ELEMENTS_PER_VECTOR` lanes), not just the
        // selected elements; the unselected tail lanes are overwritten by later chunks or
        // left as garbage past the returned count.
        // SAFETY: at most one element is written per element read, so at this point
        // `write_pos <= chunk_idx * T::ELEMENTS_PER_VECTOR`, and therefore
        // `write_pos + T::ELEMENTS_PER_VECTOR <= (chunk_idx + 1) * T::ELEMENTS_PER_VECTOR
        //  <= src.len() <= dest.len()` (checked by the assert above).
        unsafe { _mm512_storeu_si512(dest.as_mut_ptr().add(write_pos) as *mut __m512i, filtered) };

        // Uses the hardware `popcnt` instruction if available.
        let count = T::count_ones(mask_value);
        write_pos += count;
    }

    // Handle the final partial chunk with simple scalar processing.
    let read_pos = full_chunks * T::ELEMENTS_PER_VECTOR;
    for i in 0..remainder {
        let read_idx = read_pos + i;
        // LSB-first bit addressing: element `read_idx` lives at bit `read_idx % 8` of
        // byte `read_idx / 8`.
        let bit_idx = read_idx % 8;
        let byte_idx = read_idx / 8;

        if (mask[byte_idx] >> bit_idx) & 1 == 1 {
            dest[write_pos] = src[read_idx];
            write_pos += 1;
        }
    }

    // Total number of selected elements written to `dest`.
    write_pos
}

0 commit comments

Comments
 (0)