touch ups

a10y · a10y · commit adfc12687b5d · 2025-06-19T19:47:43.000-04:00
Signed-off-by: Andrew Duffy <andrew@a10y.dev>
diff --git a/vortex-array/src/arrays/primitive/compute/take/x86.rs b/vortex-array/src/arrays/primitive/compute/take/x86.rs
@@ -5,7 +5,7 @@ use std::arch::x86_64::*;
 
 use num_traits::AsPrimitive;
 use vortex_buffer::{Alignment, Buffer, BufferMut};
-use vortex_dtype::{NativePType, Nullability};
+use vortex_dtype::{NativePType, Nullability, PType};
 
 use crate::arrays::primitive::PrimitiveArray;
 
@@ -20,7 +20,13 @@ pub fn is_avx2_available() -> bool {
     false
 }
 
-/// AVX2-optimized take operation dispatch
+/// AVX2-optimized take operation dispatch.
+///
+/// This returns None if the AVX2 feature is not detected at runtime, signalling to the caller
+/// that it should fall back to the scalar implementation.
+///
+/// If AVX2 is available, this returns a PrimitiveArray containing the result of the take operation
+/// accelerated using AVX2 instructions.
 #[cfg(target_arch = "x86_64")]
 pub fn take_primitive_avx2<I, V>(
     indices: &[I],
@@ -36,42 +42,42 @@ where
     }
 
     // Dispatch to type-specific implementations
-    match (std::any::TypeId::of::<I>(), std::any::TypeId::of::<V>()) {
+    match (I::PTYPE, V::PTYPE) {
         // u32 indices, i32 values
-        (i, v) if i == std::any::TypeId::of::<u32>() && v == std::any::TypeId::of::<i32>() => {
+        (PType::U32, PType::I32) => {
             let indices = unsafe { std::mem::transmute::<&[I], &[u32]>(indices) };
             let values = unsafe { std::mem::transmute::<&[V], &[i32]>(values) };
-            let result = unsafe { take_i32_u32_avx2(indices, values) };
+            let result = unsafe { take_u32_i32_avx2(indices, values) };
             Some(PrimitiveArray::new(
                 unsafe { std::mem::transmute::<Buffer<i32>, Buffer<V>>(result) },
                 nullability.into(),
             ))
         }
         // u32 indices, f32 values
-        (i, v) if i == std::any::TypeId::of::<u32>() && v == std::any::TypeId::of::<f32>() => {
+        (PType::U32, PType::F32) => {
             let indices = unsafe { std::mem::transmute::<&[I], &[u32]>(indices) };
             let values = unsafe { std::mem::transmute::<&[V], &[f32]>(values) };
-            let result = unsafe { take_f32_u32_avx2(indices, values) };
+            let result = unsafe { take_u32_f32_avx2(indices, values) };
             Some(PrimitiveArray::new(
                 unsafe { std::mem::transmute::<Buffer<f32>, Buffer<V>>(result) },
                 nullability.into(),
             ))
         }
         // u64 indices, i64 values
-        (i, v) if i == std::any::TypeId::of::<u64>() && v == std::any::TypeId::of::<i64>() => {
+        (PType::U64, PType::I64) => {
             let indices = unsafe { std::mem::transmute::<&[I], &[u64]>(indices) };
             let values = unsafe { std::mem::transmute::<&[V], &[i64]>(values) };
-            let result = unsafe { take_i64_u64_avx2(indices, values) };
+            let result = unsafe { take_u64_i64_avx2(indices, values) };
             Some(PrimitiveArray::new(
                 unsafe { std::mem::transmute::<Buffer<i64>, Buffer<V>>(result) },
                 nullability.into(),
             ))
         }
         // u64 indices, f64 values
-        (i, v) if i == std::any::TypeId::of::<u64>() && v == std::any::TypeId::of::<f64>() => {
+        (PType::U64, PType::F64) => {
             let indices = unsafe { std::mem::transmute::<&[I], &[u64]>(indices) };
             let values = unsafe { std::mem::transmute::<&[V], &[f64]>(values) };
-            let result = unsafe { take_f64_u64_avx2(indices, values) };
+            let result = unsafe { take_u64_f64_avx2(indices, values) };
             Some(PrimitiveArray::new(
                 unsafe { std::mem::transmute::<Buffer<f64>, Buffer<V>>(result) },
                 nullability.into(),
@@ -95,26 +101,30 @@ where
 }
 
 /// AVX2 implementation for i32 values with u32 indices
+///
+/// # Safety
+///
+/// Caller must ensure that all of the indices point to valid elements in the values array.
+/// Failure to do so will result in potentially accessing out of bounds memory.
 #[cfg(target_arch = "x86_64")]
 #[target_feature(enable = "avx2")]
-unsafe fn take_i32_u32_avx2(indices: &[u32], values: &[i32]) -> Buffer<i32> {
+unsafe fn take_u32_i32_avx2(indices: &[u32], values: &[i32]) -> Buffer<i32> {
     const SIMD_WIDTH: usize = 8; // 256 bits / 32 bits per element
     let indices_len = indices.len();
 
     let mut buffer =
         BufferMut::<i32>::with_capacity_aligned(indices_len, Alignment::of::<__m256i>());
 
-    let output_ptr = buffer.spare_capacity_mut().as_mut_ptr() as *mut i32;
+    let output_ptr: *mut i32 = buffer.spare_capacity_mut().as_mut_ptr().cast();
     let values_ptr = values.as_ptr();
 
     // Process chunks of 8 elements
     let chunks = indices_len / SIMD_WIDTH;
     for chunk_idx in 0..chunks {
         let offset = chunk_idx * SIMD_WIDTH;
 
-        // Load 8 u32 indices
-        let indices_vec =
-            unsafe { _mm256_loadu_si256(indices.as_ptr().add(offset) as *const __m256i) };
+        // Load the next 8 indices into a vector
+        let indices_vec = unsafe { _mm256_loadu_si256(indices.as_ptr().add(offset).cast()) };
 
         // Gather 8 i32 values using the indices
         // Scale of 4 because i32 is 4 bytes
@@ -137,14 +147,14 @@ unsafe fn take_i32_u32_avx2(indices: &[u32], values: &[i32]) -> Buffer<i32> {
 /// AVX2 implementation for f32 values with u32 indices
 #[cfg(target_arch = "x86_64")]
 #[target_feature(enable = "avx2")]
-unsafe fn take_f32_u32_avx2(indices: &[u32], values: &[f32]) -> Buffer<f32> {
+unsafe fn take_u32_f32_avx2(indices: &[u32], values: &[f32]) -> Buffer<f32> {
     const SIMD_WIDTH: usize = 8; // 256 bits / 32 bits per element
     let indices_len = indices.len();
 
     let mut buffer =
         BufferMut::<f32>::with_capacity_aligned(indices_len, Alignment::of::<__m256>());
 
-    let output_ptr = buffer.spare_capacity_mut().as_mut_ptr() as *mut f32;
+    let output_ptr: *mut f32 = buffer.spare_capacity_mut().as_mut_ptr().cast();
     let values_ptr = values.as_ptr();
 
     // Process chunks of 8 elements
@@ -177,7 +187,7 @@ unsafe fn take_f32_u32_avx2(indices: &[u32], values: &[f32]) -> Buffer<f32> {
 #[cfg(target_arch = "x86_64")]
 #[target_feature(enable = "avx2")]
 #[allow(clippy::cast_possible_truncation)]
-unsafe fn take_i64_u64_avx2(indices: &[u64], values: &[i64]) -> Buffer<i64> {
+unsafe fn take_u64_i64_avx2(indices: &[u64], values: &[i64]) -> Buffer<i64> {
     const SIMD_WIDTH: usize = 4; // 256 bits / 64 bits per element
     let indices_len = indices.len();
 
@@ -218,7 +228,7 @@ unsafe fn take_i64_u64_avx2(indices: &[u64], values: &[i64]) -> Buffer<i64> {
 #[cfg(target_arch = "x86_64")]
 #[target_feature(enable = "avx2")]
 #[allow(clippy::cast_possible_truncation)]
-unsafe fn take_f64_u64_avx2(indices: &[u64], values: &[f64]) -> Buffer<f64> {
+unsafe fn take_u64_f64_avx2(indices: &[u64], values: &[f64]) -> Buffer<f64> {
     const SIMD_WIDTH: usize = 4; // 256 bits / 64 bits per element
     let indices_len = indices.len();
 
@@ -257,7 +267,6 @@ unsafe fn take_f64_u64_avx2(indices: &[u64], values: &[f64]) -> Buffer<f64> {
 
 #[cfg(test)]
 mod tests {
-
     use super::*;
 
     #[test]