Commit 01ccc6e

add avx2 take impl back and bound by Copy
Signed-off-by: Connor Tsui <[email protected]>
1 parent 4b3a5c3 commit 01ccc6e
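
At its core, the commit relaxes the trait bound on the AVX2 entry point from `NativePType` to `Copy` and switches dispatch from matching on `PType` pairs to matching on `size_of::<V>()`. Condensed to one line each, the before and after signatures from the diff below:

-pub unsafe fn take_avx2<V: NativePType, I: UnsignedPType>(buffer: &[V], indices: &[I]) -> Buffer<V>
+pub unsafe fn take_avx2<V: Copy, I: UnsignedPType>(buffer: &[V], indices: &[I]) -> Buffer<V>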

File tree

2 files changed (+47, -177 lines)

vortex-compute/src/take/slice/avx2.rs

Lines changed: 47 additions & 171 deletions
@@ -31,87 +31,62 @@ use std::arch::x86_64::_mm256_set1_epi64x;
 use std::arch::x86_64::_mm256_setzero_si256;
 use std::arch::x86_64::_mm256_storeu_si256;
 use std::convert::identity;
+use std::mem::size_of;

 use vortex_buffer::Alignment;
 use vortex_buffer::Buffer;
 use vortex_buffer::BufferMut;
-use vortex_dtype::NativePType;
-use vortex_dtype::PType;
 use vortex_dtype::UnsignedPType;
+use vortex_dtype::match_each_unsigned_integer_ptype;

 use crate::take::slice::take_scalar;

 /// Takes the specified indices into a new [`Buffer`] using AVX2 SIMD.
 ///
-/// This returns None if the AVX2 feature is not detected at runtime, signalling to the caller
-/// that it should fall back to the scalar implementation.
-///
-/// If AVX2 is available, this returns a PrimitiveArray containing the result of the take operation
-/// accelerated using AVX2 instructions.
+/// This function handles the type matching required to satisfy AVX2 gather instruction requirements
+/// by casting to unsigned integers of the same size. Falls back to scalar implementation for
+/// unsupported type sizes.
 ///
 /// # Panics
 ///
-/// This function panics if any of the provided `indices` are out of bounds for `values`
+/// This function panics if any of the provided `indices` are out of bounds for `values`.
 ///
 /// # Safety
 ///
 /// The caller must ensure the `avx2` feature is enabled.
-#[allow(dead_code, unused_variables, reason = "TODO(connor): Implement this")]
 #[target_feature(enable = "avx2")]
 #[inline]
-pub unsafe fn take_avx2<V: NativePType, I: UnsignedPType>(
-    buffer: &[V],
-    indices: &[I],
-) -> Buffer<V> {
-    macro_rules! dispatch_avx2 {
-        ($indices:ty, $values:ty) => {
-            { let result = dispatch_avx2!($indices, $values, cast: $values); result }
-        };
-        ($indices:ty, $values:ty, cast: $cast:ty) => {{
-            let indices = unsafe { std::mem::transmute::<&[I], &[$indices]>(indices) };
-            let values = unsafe { std::mem::transmute::<&[V], &[$cast]>(buffer) };
-
-            let result = exec_take::<$cast, $indices, AVX2Gather>(values, indices);
-            result.cast_into::<V>()
-        }};
-    }
-
-    match (I::PTYPE, V::PTYPE) {
-        // Int value types. Only 32 and 64 bit types are supported.
-        (PType::U8, PType::I32) => dispatch_avx2!(u8, i32),
-        (PType::U8, PType::U32) => dispatch_avx2!(u8, u32),
-        (PType::U8, PType::I64) => dispatch_avx2!(u8, i64),
-        (PType::U8, PType::U64) => dispatch_avx2!(u8, u64),
-        (PType::U16, PType::I32) => dispatch_avx2!(u16, i32),
-        (PType::U16, PType::U32) => dispatch_avx2!(u16, u32),
-        (PType::U16, PType::I64) => dispatch_avx2!(u16, i64),
-        (PType::U16, PType::U64) => dispatch_avx2!(u16, u64),
-        (PType::U32, PType::I32) => dispatch_avx2!(u32, i32),
-        (PType::U32, PType::U32) => dispatch_avx2!(u32, u32),
-        (PType::U32, PType::I64) => dispatch_avx2!(u32, i64),
-        (PType::U32, PType::U64) => dispatch_avx2!(u32, u64),
-
-        // Float value types, treat them as if they were corresponding int types.
-        (PType::U8, PType::F32) => dispatch_avx2!(u8, f32, cast: u32),
-        (PType::U16, PType::F32) => dispatch_avx2!(u16, f32, cast: u32),
-        (PType::U32, PType::F32) => dispatch_avx2!(u32, f32, cast: u32),
-        (PType::U64, PType::F32) => dispatch_avx2!(u64, f32, cast: u32),
-
-        (PType::U8, PType::F64) => dispatch_avx2!(u8, f64, cast: u64),
-        (PType::U16, PType::F64) => dispatch_avx2!(u16, f64, cast: u64),
-        (PType::U32, PType::F64) => dispatch_avx2!(u32, f64, cast: u64),
-        (PType::U64, PType::F64) => dispatch_avx2!(u64, f64, cast: u64),
-
-        // Scalar fallback for unsupported value types.
-        _ => {
-            tracing::trace!(
-                "take AVX2 kernel missing for indices {} values {}, falling back to scalar",
-                I::PTYPE,
-                V::PTYPE
-            );
-
-            take_scalar(buffer, indices)
+pub unsafe fn take_avx2<V: Copy, I: UnsignedPType>(buffer: &[V], indices: &[I]) -> Buffer<V> {
+    // AVX2 gather operations only care about bit patterns, not semantic type. We cast to unsigned
+    // integers which have the required gather implementations and then cast back.
+    //
+    // SAFETY: The pointer casts below are safe because:
+    // - `V` and the target type have the same size (matched by `size_of::<V>()`)
+    // - The alignment of unsigned integers is always <= their size, and `buffer` came from a valid
+    //   `&[V]` which guarantees proper alignment for types of the same size.
+    match size_of::<V>() {
+        4 => {
+            let values: &[u32] =
+                unsafe { std::slice::from_raw_parts(buffer.as_ptr().cast::<u32>(), buffer.len()) };
+            match_each_unsigned_integer_ptype!(I::PTYPE, |IC| {
+                let indices: &[IC] = unsafe {
+                    std::slice::from_raw_parts(indices.as_ptr().cast::<IC>(), indices.len())
+                };
+                exec_take::<u32, IC, AVX2Gather>(values, indices).cast_into::<V>()
+            })
+        }
+        8 => {
+            let values: &[u64] =
+                unsafe { std::slice::from_raw_parts(buffer.as_ptr().cast::<u64>(), buffer.len()) };
+            match_each_unsigned_integer_ptype!(I::PTYPE, |IC| {
+                let indices: &[IC] = unsafe {
+                    std::slice::from_raw_parts(indices.as_ptr().cast::<IC>(), indices.len())
+                };
+                exec_take::<u64, IC, AVX2Gather>(values, indices).cast_into::<V>()
+            })
         }
+        // Fall back to scalar implementation for unsupported type sizes (1, 2 byte types).
+        _ => take_scalar(buffer, indices),
     }
 }

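The heart of the new implementation is easier to see without the SIMD machinery: a gather only moves bit patterns, so any 4-byte `Copy` type can be gathered as `u32` (and any 8-byte type as `u64`). Below is a minimal, self-contained Rust sketch of that cast-and-gather shape. `gather_u32` and `take_by_bits` are hypothetical stand-ins for `exec_take` and `take_avx2`, and the sketch assumes a primitive-like `V` (i32, u32, f32) whose alignment matches `u32`'s:

use std::mem::size_of;

// Stand-in for the SIMD gather kernel (`exec_take` in the diff): takes `indices`
// into `values`, operating purely on u32 bit patterns.
fn gather_u32(values: &[u32], indices: &[u32]) -> Vec<u32> {
    indices.iter().map(|&i| values[i as usize]).collect()
}

// Takes indices into a slice of any 4-byte `Copy` type by viewing it as `&[u32]`.
fn take_by_bits<V: Copy>(values: &[V], indices: &[u32]) -> Vec<V> {
    // Assumption: V is primitive-like, so its alignment equals its 4-byte size;
    // that is what makes the `&[V]` -> `&[u32]` view below valid.
    assert_eq!(size_of::<V>(), 4);
    // SAFETY: same length, same element size, alignment per the assumption above.
    let bits: &[u32] =
        unsafe { std::slice::from_raw_parts(values.as_ptr().cast::<u32>(), values.len()) };
    gather_u32(bits, indices)
        .into_iter()
        // SAFETY: each element is the bit pattern of some element of `values`.
        .map(|b| unsafe { std::mem::transmute_copy::<u32, V>(&b) })
        .collect()
}

fn main() {
    let values = [1.5f32, 2.5, 3.5, 4.5];
    let indices = [3u32, 0, 2];
    assert_eq!(take_by_bits(&values, &indices), vec![4.5f32, 1.5, 3.5]);
}

Like `take_avx2`, the sketch panics if an index is out of bounds (via the slice index in `gather_u32`).
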
@@ -182,9 +157,9 @@ macro_rules! impl_gather {
     };
 }

-// kernels for u8 indices
+// Kernels for u8 indices.
 impl_gather!(u8,
-    // 32-bit values, loaded 8 at a time
+    // 32-bit values, loaded 8 at a time.
     { u32 =>
         load: _mm_loadu_si128,
         extend: _mm256_cvtepu8_epi32,
@@ -196,19 +171,7 @@ impl_gather!(u8,
         store: _mm256_storeu_si256,
         WIDTH = 8, STRIDE = 16
     },
-    { i32 =>
-        load: _mm_loadu_si128,
-        extend: _mm256_cvtepu8_epi32,
-        splat: _mm256_set1_epi32,
-        zero_vec: _mm256_setzero_si256,
-        mask_indices: _mm256_cmpgt_epi32,
-        mask_cvt: |x| { x },
-        gather: _mm256_mask_i32gather_epi32,
-        store: _mm256_storeu_si256,
-        WIDTH = 8, STRIDE = 16
-    },
-
-    // 64-bit values, loaded 4 at a time
+    // 64-bit values, loaded 4 at a time.
     { u64 =>
         load: _mm_loadu_si128,
         extend: _mm256_cvtepu8_epi64,
@@ -219,23 +182,12 @@ impl_gather!(u8,
         gather: _mm256_mask_i64gather_epi64,
         store: _mm256_storeu_si256,
         WIDTH = 4, STRIDE = 16
-    },
-    { i64 =>
-        load: _mm_loadu_si128,
-        extend: _mm256_cvtepu8_epi64,
-        splat: _mm256_set1_epi64x,
-        zero_vec: _mm256_setzero_si256,
-        mask_indices: _mm256_cmpgt_epi64,
-        mask_cvt: |x| { x },
-        gather: _mm256_mask_i64gather_epi64,
-        store: _mm256_storeu_si256,
-        WIDTH = 4, STRIDE = 16
     }
 );

-// kernels for u16 indices
+// Kernels for u16 indices.
 impl_gather!(u16,
-    // 32-bit values. 8x indices loaded at a time and 8x values written at a time
+    // 32-bit values. 8x indices loaded at a time and 8x values written at a time.
     { u32 =>
         load: _mm_loadu_si128,
         extend: _mm256_cvtepu16_epi32,
@@ -247,18 +199,6 @@ impl_gather!(u16,
         store: _mm256_storeu_si256,
         WIDTH = 8, STRIDE = 8
     },
-    { i32 =>
-        load: _mm_loadu_si128,
-        extend: _mm256_cvtepu16_epi32,
-        splat: _mm256_set1_epi32,
-        zero_vec: _mm256_setzero_si256,
-        mask_indices: _mm256_cmpgt_epi32,
-        mask_cvt: |x| { x },
-        gather: _mm256_mask_i32gather_epi32,
-        store: _mm256_storeu_si256,
-        WIDTH = 8, STRIDE = 8
-    },
-
     // 64-bit values. 8x indices loaded at a time and 4x values loaded at a time.
     { u64 =>
         load: _mm_loadu_si128,
@@ -270,23 +210,12 @@ impl_gather!(u16,
         gather: _mm256_mask_i64gather_epi64,
         store: _mm256_storeu_si256,
         WIDTH = 4, STRIDE = 8
-    },
-    { i64 =>
-        load: _mm_loadu_si128,
-        extend: _mm256_cvtepu16_epi64,
-        splat: _mm256_set1_epi64x,
-        zero_vec: _mm256_setzero_si256,
-        mask_indices: _mm256_cmpgt_epi64,
-        mask_cvt: |x| { x },
-        gather: _mm256_mask_i64gather_epi64,
-        store: _mm256_storeu_si256,
-        WIDTH = 4, STRIDE = 8
     }
 );

-// kernels for u32 indices
+// Kernels for u32 indices.
 impl_gather!(u32,
-    // 32-bit values. 8x indices loaded at a time and 8x values written
+    // 32-bit values. 8x indices loaded at a time and 8x values written.
     { u32 =>
         load: _mm256_loadu_si256,
         extend: identity,
@@ -298,19 +227,7 @@ impl_gather!(u32,
         store: _mm256_storeu_si256,
         WIDTH = 8, STRIDE = 8
     },
-    { i32 =>
-        load: _mm256_loadu_si256,
-        extend: identity,
-        splat: _mm256_set1_epi32,
-        zero_vec: _mm256_setzero_si256,
-        mask_indices: _mm256_cmpgt_epi32,
-        mask_cvt: |x| { x },
-        gather: _mm256_mask_i32gather_epi32,
-        store: _mm256_storeu_si256,
-        WIDTH = 8, STRIDE = 8
-    },
-
-    // 64-bit values
+    // 64-bit values.
     { u64 =>
         load: _mm_loadu_si128,
         extend: _mm256_cvtepu32_epi64,
@@ -321,22 +238,12 @@ impl_gather!(u32,
         gather: _mm256_mask_i64gather_epi64,
         store: _mm256_storeu_si256,
         WIDTH = 4, STRIDE = 4
-    },
-    { i64 =>
-        load: _mm_loadu_si128,
-        extend: _mm256_cvtepu32_epi64,
-        splat: _mm256_set1_epi64x,
-        zero_vec: _mm256_setzero_si256,
-        mask_indices: _mm256_cmpgt_epi64,
-        mask_cvt: |x| { x },
-        gather: _mm256_mask_i64gather_epi64,
-        store: _mm256_storeu_si256,
-        WIDTH = 4, STRIDE = 4
     }
 );

-// kernels for u64 indices
+// Kernels for u64 indices.
 impl_gather!(u64,
+    // 32-bit values.
     { u32 =>
         load: _mm256_loadu_si256,
         extend: identity,
@@ -356,27 +263,7 @@ impl_gather!(u64,
         store: _mm_storeu_si128,
         WIDTH = 4, STRIDE = 4
     },
-    { i32 =>
-        load: _mm256_loadu_si256,
-        extend: identity,
-        splat: _mm256_set1_epi64x,
-        zero_vec: _mm_setzero_si128,
-        mask_indices: _mm256_cmpgt_epi64,
-        mask_cvt: |m| {
-            unsafe {
-                let lo_bits = _mm256_extracti128_si256::<0>(m); // lower half
-                let hi_bits = _mm256_extracti128_si256::<1>(m); // upper half
-                let lo_packed = _mm_shuffle_epi32::<0b01_01_01_01>(lo_bits);
-                let hi_packed = _mm_shuffle_epi32::<0b01_01_01_01>(hi_bits);
-                _mm_unpacklo_epi64(lo_packed, hi_packed)
-            }
-        },
-        gather: _mm256_mask_i64gather_epi32,
-        store: _mm_storeu_si128,
-        WIDTH = 4, STRIDE = 4
-    },
-
-    // 64-bit values
+    // 64-bit values.
     { u64 =>
         load: _mm256_loadu_si256,
         extend: identity,
@@ -387,17 +274,6 @@ impl_gather!(u64,
         gather: _mm256_mask_i64gather_epi64,
         store: _mm256_storeu_si256,
         WIDTH = 4, STRIDE = 4
-    },
-    { i64 =>
-        load: _mm256_loadu_si256,
-        extend: identity,
-        splat: _mm256_set1_epi64x,
-        zero_vec: _mm256_setzero_si256,
-        mask_indices: _mm256_cmpgt_epi64,
-        mask_cvt: |x| { x },
-        gather: _mm256_mask_i64gather_epi64,
-        store: _mm256_storeu_si256,
-        WIDTH = 4, STRIDE = 4
     }
 );

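Each `impl_gather!` invocation above expands to a kernel of the same shape: `load` STRIDE raw indices per block, `extend` WIDTH of them to the gather's lane width, build a tail mask with `splat`/`mask_indices`/`mask_cvt`, then issue one hardware `gather` and `store` the lanes. A minimal, self-contained sketch of a single step of the (u8 indices, u32 values) kernel, with a hypothetical function name, using the unmasked gather for brevity where the real kernels use the `_mm256_mask_*gather*` variants to guard the tail:

/// One gather step: fetch `values[indices[i]]` for the low 8 index bytes.
///
/// SAFETY: the caller must verify AVX2 support (e.g. with
/// `is_x86_feature_detected!("avx2")`) and that the low 8 index bytes are all
/// in bounds for `values`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn gather8_u8_into_u32(values: &[u32], indices: &[u8; 16]) -> [u32; 8] {
    use std::arch::x86_64::*;

    unsafe {
        // load: read 16 index bytes into a 128-bit register (STRIDE = 16);
        // one step consumes the low 8 of them (WIDTH = 8).
        let idx_bytes = _mm_loadu_si128(indices.as_ptr().cast());
        // extend: zero-extend the low 8 bytes to eight 32-bit index lanes.
        let idx_lanes = _mm256_cvtepu8_epi32(idx_bytes);
        // gather: fetch values[i] for all 8 lanes in a single instruction,
        // scaling each index by 4 bytes.
        let gathered = _mm256_i32gather_epi32::<4>(values.as_ptr().cast::<i32>(), idx_lanes);
        // store: write the 8 gathered lanes to the output.
        let mut out = [0u32; 8];
        _mm256_storeu_si256(out.as_mut_ptr().cast(), gathered);
        out
    }
}

A caller would guard this with the same runtime check the dispatch in mod.rs below performs.
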
vortex-compute/src/take/slice/mod.rs

Lines changed: 0 additions & 6 deletions
@@ -21,10 +21,6 @@ impl<T: Copy, I: UnsignedPType> Take<[I]> for &[T] {
             return portable::take_portable(self, indices);
         }

-        // TODO(connor): Make the SIMD implementations bound by `Copy` instead of `NativePType`.
-
-        /*
-
         #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
         {
             if is_x86_feature_detected!("avx2") {
@@ -33,8 +29,6 @@ impl<T: Copy, I: UnsignedPType> Take<[I]> for &[T] {
             }
         }

-        */
-
         #[allow(unreachable_code, reason = "`vortex_nightly` path returns early")]
         take_scalar(self, indices)
     }
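
The block restored here is the standard runtime-dispatch pattern: detect AVX2 once at the call site, take the SIMD path when it is present, and otherwise fall through to the scalar loop. A minimal sketch of that shape with hypothetical stand-in implementations (the real trait method dispatches to `take_avx2` from avx2.rs and the crate's own `take_scalar`):

fn take_scalar<T: Copy>(values: &[T], indices: &[u32]) -> Vec<T> {
    indices.iter().map(|&i| values[i as usize]).collect()
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn take_avx2<T: Copy>(values: &[T], indices: &[u32]) -> Vec<T> {
    // Stand-in body; the real kernel uses the AVX2 gathers shown above.
    take_scalar(values, indices)
}

fn take<T: Copy>(values: &[T], indices: &[u32]) -> Vec<T> {
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        if is_x86_feature_detected!("avx2") {
            // SAFETY: the runtime check above proves the `avx2` feature is available.
            return unsafe { take_avx2(values, indices) };
        }
    }

    take_scalar(values, indices)
}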

0 commit comments