 
 use std::arch::x86_64::__m256i;
 use std::arch::x86_64::_mm_loadu_si128;
+use std::arch::x86_64::_mm_movemask_epi8;
 use std::arch::x86_64::_mm_setzero_si128;
 use std::arch::x86_64::_mm_shuffle_epi32;
 use std::arch::x86_64::_mm_storeu_si128;
@@ -26,6 +27,7 @@ use std::arch::x86_64::_mm256_loadu_si256;
 use std::arch::x86_64::_mm256_mask_i32gather_epi32;
 use std::arch::x86_64::_mm256_mask_i64gather_epi32;
 use std::arch::x86_64::_mm256_mask_i64gather_epi64;
+use std::arch::x86_64::_mm256_movemask_epi8;
 use std::arch::x86_64::_mm256_set1_epi32;
 use std::arch::x86_64::_mm256_set1_epi64x;
 use std::arch::x86_64::_mm256_setzero_si256;
@@ -102,56 +104,75 @@ pub(crate) trait GatherFn<Idx, Values> {
     /// Gather values from `src` into the `dst` using the `indices`, optionally using
     /// SIMD instructions.
     ///
+    /// Returns `true` if all indices in this batch were valid (less than `max_idx`), `false`
+    /// otherwise. Invalid indices are masked out during the gather (substituting zeros).
+    ///
     /// # Safety
     ///
     /// This function can read up to `STRIDE` elements through `indices`, and read/write up to
     /// `WIDTH` elements through `src` and `dst` respectively.
-    unsafe fn gather(indices: *const Idx, max_idx: Idx, src: *const Values, dst: *mut Values);
+    unsafe fn gather(
+        indices: *const Idx,
+        max_idx: Idx,
+        src: *const Values,
+        dst: *mut Values,
+    ) -> bool;
 }
 
 /// AVX2 version of GatherFn defined for 32- and 64-bit value types.
 enum AVX2Gather {}
 
 macro_rules! impl_gather {
-    ($idx:ty, $({ $value:ty => load: $load:ident, extend: $extend:ident, splat: $splat:ident, zero_vec: $zero_vec:ident, mask_indices: $mask_indices:ident, mask_cvt: |$mask_var:ident| $mask_cvt:block, gather: $masked_gather:ident, store: $store:ident, WIDTH = $WIDTH:literal, STRIDE = $STRIDE:literal }),+) => {
+    ($idx:ty, $({ $value:ty => load: $load:ident, extend: $extend:ident, splat: $splat:ident, zero_vec: $zero_vec:ident, mask_indices: $mask_indices:ident, mask_cvt: |$mask_var:ident| $mask_cvt:block, movemask: $movemask:ident, all_valid_mask: $all_valid_mask:expr, gather: $masked_gather:ident, store: $store:ident, WIDTH = $WIDTH:literal, STRIDE = $STRIDE:literal }),+) => {
         $(
-            impl_gather!(single; $idx, $value, load: $load, extend: $extend, splat: $splat, zero_vec: $zero_vec, mask_indices: $mask_indices, mask_cvt: |$mask_var| $mask_cvt, gather: $masked_gather, store: $store, WIDTH = $WIDTH, STRIDE = $STRIDE);
+            impl_gather!(single; $idx, $value, load: $load, extend: $extend, splat: $splat, zero_vec: $zero_vec, mask_indices: $mask_indices, mask_cvt: |$mask_var| $mask_cvt, movemask: $movemask, all_valid_mask: $all_valid_mask, gather: $masked_gather, store: $store, WIDTH = $WIDTH, STRIDE = $STRIDE);
         )*
     };
-    (single; $idx:ty, $value:ty, load: $load:ident, extend: $extend:ident, splat: $splat:ident, zero_vec: $zero_vec:ident, mask_indices: $mask_indices:ident, mask_cvt: |$mask_var:ident| $mask_cvt:block, gather: $masked_gather:ident, store: $store:ident, WIDTH = $WIDTH:literal, STRIDE = $STRIDE:literal) => {
+    (single; $idx:ty, $value:ty, load: $load:ident, extend: $extend:ident, splat: $splat:ident, zero_vec: $zero_vec:ident, mask_indices: $mask_indices:ident, mask_cvt: |$mask_var:ident| $mask_cvt:block, movemask: $movemask:ident, all_valid_mask: $all_valid_mask:expr, gather: $masked_gather:ident, store: $store:ident, WIDTH = $WIDTH:literal, STRIDE = $STRIDE:literal) => {
         impl GatherFn<$idx, $value> for AVX2Gather {
             const WIDTH: usize = $WIDTH;
             const STRIDE: usize = $STRIDE;
 
             #[allow(unused_unsafe, clippy::cast_possible_truncation)]
             #[inline(always)]
-            unsafe fn gather(indices: *const $idx, max_idx: $idx, src: *const $value, dst: *mut $value) {
+            unsafe fn gather(
+                indices: *const $idx,
+                max_idx: $idx,
+                src: *const $value,
+                dst: *mut $value
+            ) -> bool {
                 const {
                     assert!($WIDTH <= $STRIDE, "dst cannot advance by more than the stride");
                 }
 
                 const SCALE: i32 = std::mem::size_of::<$value>() as i32;
 
                 let indices_vec = unsafe { $load(indices.cast()) };
-                // Extend indices to fill vector register
+                // Extend indices to fill vector register.
                 let indices_vec = unsafe { $extend(indices_vec) };
 
-                // create a vec of the max idx
+                // Create a vec of the max idx.
                 let max_idx_vec = unsafe { $splat(max_idx as _) };
-                // create a mask for valid indices (where the max_idx > provided index).
-                let invalid_mask = unsafe { $mask_indices(max_idx_vec, indices_vec) };
-                let invalid_mask = {
-                    let $mask_var = invalid_mask;
+                // Create a mask for valid indices (where the max_idx > provided index).
+                let valid_mask = unsafe { $mask_indices(max_idx_vec, indices_vec) };
+                let valid_mask = {
+                    let $mask_var = valid_mask;
                     $mask_cvt
                 };
                 let zero_vec = unsafe { $zero_vec() };
 
                 // Gather the values into new vector register, for masked positions
                 // it substitutes zero instead of accessing the src.
-                let values_vec = unsafe { $masked_gather::<SCALE>(zero_vec, src.cast(), indices_vec, invalid_mask) };
+                let values_vec = unsafe {
+                    $masked_gather::<SCALE>(zero_vec, src.cast(), indices_vec, valid_mask)
+                };
 
                 // Write the vec out to dst.
                 unsafe { $store(dst.cast(), values_vec) };
+
+                // Return true if all indices were valid (all mask bits set).
+                let mask_bits = unsafe { $movemask(valid_mask) };
+                mask_bits == $all_valid_mask
             }
         }
     };
@@ -167,6 +188,8 @@ impl_gather!(u8,
         zero_vec: _mm256_setzero_si256,
         mask_indices: _mm256_cmpgt_epi32,
         mask_cvt: |x| { x },
+        movemask: _mm256_movemask_epi8,
+        all_valid_mask: -1_i32,
         gather: _mm256_mask_i32gather_epi32,
         store: _mm256_storeu_si256,
         WIDTH = 8, STRIDE = 16
@@ -179,6 +202,8 @@ impl_gather!(u8,
         zero_vec: _mm256_setzero_si256,
         mask_indices: _mm256_cmpgt_epi64,
         mask_cvt: |x| { x },
+        movemask: _mm256_movemask_epi8,
+        all_valid_mask: -1_i32,
         gather: _mm256_mask_i64gather_epi64,
         store: _mm256_storeu_si256,
         WIDTH = 4, STRIDE = 16
@@ -195,6 +220,8 @@ impl_gather!(u16,
         zero_vec: _mm256_setzero_si256,
         mask_indices: _mm256_cmpgt_epi32,
         mask_cvt: |x| { x },
+        movemask: _mm256_movemask_epi8,
+        all_valid_mask: -1_i32,
         gather: _mm256_mask_i32gather_epi32,
         store: _mm256_storeu_si256,
         WIDTH = 8, STRIDE = 8
@@ -207,6 +234,8 @@ impl_gather!(u16,
         zero_vec: _mm256_setzero_si256,
         mask_indices: _mm256_cmpgt_epi64,
         mask_cvt: |x| { x },
+        movemask: _mm256_movemask_epi8,
+        all_valid_mask: -1_i32,
         gather: _mm256_mask_i64gather_epi64,
         store: _mm256_storeu_si256,
         WIDTH = 4, STRIDE = 8
@@ -223,6 +252,8 @@ impl_gather!(u32,
         zero_vec: _mm256_setzero_si256,
         mask_indices: _mm256_cmpgt_epi32,
         mask_cvt: |x| { x },
+        movemask: _mm256_movemask_epi8,
+        all_valid_mask: -1_i32,
         gather: _mm256_mask_i32gather_epi32,
         store: _mm256_storeu_si256,
         WIDTH = 8, STRIDE = 8
@@ -235,6 +266,8 @@ impl_gather!(u32,
         zero_vec: _mm256_setzero_si256,
         mask_indices: _mm256_cmpgt_epi64,
         mask_cvt: |x| { x },
+        movemask: _mm256_movemask_epi8,
+        all_valid_mask: -1_i32,
         gather: _mm256_mask_i64gather_epi64,
         store: _mm256_storeu_si256,
         WIDTH = 4, STRIDE = 4
@@ -259,6 +292,8 @@ impl_gather!(u64,
                 _mm_unpacklo_epi64(lo_packed, hi_packed)
             }
         },
+        movemask: _mm_movemask_epi8,
+        all_valid_mask: 0xFFFF_i32,
         gather: _mm256_mask_i64gather_epi32,
         store: _mm_storeu_si128,
         WIDTH = 4, STRIDE = 4
@@ -271,6 +306,8 @@ impl_gather!(u64,
         zero_vec: _mm256_setzero_si256,
         mask_indices: _mm256_cmpgt_epi64,
         mask_cvt: |x| { x },
+        movemask: _mm256_movemask_epi8,
+        all_valid_mask: -1_i32,
         gather: _mm256_mask_i64gather_epi64,
         store: _mm256_storeu_si256,
         WIDTH = 4, STRIDE = 4
@@ -292,25 +329,32 @@ where
     let buf_uninit = buffer.spare_capacity_mut();
 
     let mut offset = 0;
+    let mut all_valid = true;
+
     // Loop terminates STRIDE elements before end of the indices array because the GatherFn
     // might read up to STRIDE src elements at a time, even though it only advances WIDTH elements
     // in the dst.
     while offset + Gather::STRIDE < indices_len {
         // SAFETY: gather_simd preconditions satisfied:
         // 1. `(indices + offset)..(indices + offset + STRIDE)` is in-bounds for indices allocation
         // 2. `buffer` has same len as indices so `buffer + offset + STRIDE` is always valid.
-        unsafe {
+        let batch_valid = unsafe {
             Gather::gather(
                 indices.as_ptr().add(offset),
                 max_index,
                 values.as_ptr(),
                 buf_uninit.as_mut_ptr().add(offset).cast(),
             )
         };
+        all_valid &= batch_valid;
         offset += Gather::WIDTH;
     }
 
-    // Remainder
+    // Check the accumulated validity after the hot loop. Any batch that reported `false`
+    // means there was an out-of-bounds index.
+    assert!(all_valid, "index out of bounds in AVX2 take");
+
+    // Fall back to scalar iteration for the remainder.
     while offset < indices_len {
         buf_uninit[offset].write(values[indices[offset].as_()]);
         offset += 1;
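
A minimal scalar sketch of the contract the new boolean return encodes, for readers skimming the diff: out-of-range lanes are zero-filled just as the masked AVX2 gather does, and the return value reports whether every index in the batch was in bounds. The function name and the concrete `u32` types below are hypothetical, chosen only for illustration; they are not part of this change.

fn gather_scalar_model(indices: &[u32], max_idx: u32, src: &[u32], dst: &mut [u32]) -> bool {
    let mut all_valid = true;
    for (slot, &idx) in dst.iter_mut().zip(indices) {
        if idx < max_idx {
            *slot = src[idx as usize];
        } else {
            // Same substitution the masked SIMD gather makes for invalid lanes.
            *slot = 0;
            all_valid = false;
        }
    }
    all_valid
}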