Commit c0d44ef

Fix: portable SIMD take implementation (#5654)
It turns out we were not running `RUSTFLAGS="--cfg vortex_nightly" cargo +nightly nextest run` in our CI. I'll add that to our actions in a follow-up PR. This commit also adds some non-trivial tests.

Signed-off-by: Connor Tsui <[email protected]>
1 parent bf654c4 commit c0d44ef

File tree: 6 files changed (+168 -31 lines)


Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default.

vortex-array/src/arrays/primitive/compute/take/portable.rs

Lines changed: 2 additions & 20 deletions
@@ -33,10 +33,6 @@ use crate::validity::Validity;
 
 pub(super) struct TakeKernelPortableSimd;
 
-// SIMD types larger than the SIMD register size are beneficial for
-// performance as this leads to better instruction level parallelism.
-const SIMD_WIDTH: usize = 64;
-
 impl TakeImpl for TakeKernelPortableSimd {
     fn take(
         &self,
@@ -47,7 +43,7 @@ impl TakeImpl for TakeKernelPortableSimd {
         if array.ptype() == PType::F16 {
             // Special handling for f16 to treat as opaque u16
             let decoded = match_each_unsigned_integer_ptype!(unsigned_indices.ptype(), |C| {
-                portable::take_portable_simd::<u16, C, SIMD_WIDTH>(
+                portable::take_portable_simd::<u16, C, { portable::SIMD_WIDTH }>(
                     array.reinterpret_cast(PType::U16).as_slice(),
                     unsigned_indices.as_slice(),
                 )
@@ -58,7 +54,7 @@ impl TakeImpl for TakeKernelPortableSimd {
         } else {
             match_each_unsigned_integer_ptype!(unsigned_indices.ptype(), |C| {
                 match_each_native_simd_ptype!(array.ptype(), |V| {
-                    let decoded = portable::take_portable_simd::<V, C, SIMD_WIDTH>(
+                    let decoded = portable::take_portable_simd::<V, C, { portable::SIMD_WIDTH }>(
                         array.as_slice(),
                         unsigned_indices.as_slice(),
                     );
@@ -68,17 +64,3 @@ impl TakeImpl for TakeKernelPortableSimd {
         }
     }
 }
-
-#[cfg(test)]
-mod tests {
-    use super::take_portable_simd;
-
-    #[test]
-    fn test_take_out_of_bounds() {
-        let indices = vec![2_000_000u32; 64];
-        let values = vec![1i32];
-
-        let result = take_portable_simd::<u32, i32, 64>(&indices, &values);
-        assert_eq!(result.as_slice(), [0i32; 64]);
-    }
-}
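
A note on the braces at the call sites: because `SIMD_WIDTH` now lives in the `portable` module, the const-generic argument is a multi-segment path, and Rust only accepts a literal or a bare identifier unbraced in const-generic position; any other expression, including a path like `portable::SIMD_WIDTH`, must be wrapped in a block. A minimal standalone illustration (hypothetical names, not from the commit):

    mod portable {
        pub const SIMD_WIDTH: usize = 64;
    }

    // A const parameter can be instantiated with a literal or a bare identifier
    // directly, but a path like `portable::SIMD_WIDTH` must be braced.
    fn lanes<const N: usize>() -> usize {
        N
    }

    fn main() {
        assert_eq!(lanes::<64>(), 64); // literal: no braces needed
        assert_eq!(lanes::<{ portable::SIMD_WIDTH }>(), 64); // path: braces required
    }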

vortex-compute/Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -29,6 +29,7 @@ vortex-vector = { workspace = true }
 arrow-array = { workspace = true, optional = true }
 arrow-buffer = { workspace = true, optional = true }
 arrow-schema = { workspace = true, optional = true }
+half = { workspace = true }
 log = { workspace = true }
 multiversion = { workspace = true }
 num-traits = { workspace = true }
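
The new `half` dependency backs the `f16` handling in `take_portable` below. A quick sketch of the property that code relies on — `half::f16` has the same size as `u16` and round-trips losslessly through its bit pattern, so gathering the `u16` reinterpretation moves `f16` values unchanged (illustrative only, not part of the commit):

    use half::f16;

    fn main() {
        // The reinterpretation is sound only if the sizes match, mirroring the
        // `assert_eq!(size_of::<half::f16>(), size_of::<T>())` in the diff below.
        assert_eq!(std::mem::size_of::<f16>(), std::mem::size_of::<u16>());

        // An f16 value survives a round-trip through its u16 bit pattern.
        let x = f16::from_f32(1.5);
        let bits: u16 = x.to_bits();
        assert_eq!(f16::from_bits(bits), x);
    }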

vortex-compute/src/lib.rs

Lines changed: 1 addition & 0 deletions
@@ -3,6 +3,7 @@
 
 //! A collection of compute functions primarily for operating over Vortex vectors.
 
+#![cfg_attr(vortex_nightly, feature(portable_simd))]
 #![deny(missing_docs)]
 #![deny(clippy::missing_panics_doc)]
 #![deny(clippy::missing_safety_doc)]
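
The crate-level `cfg_attr` turns on the unstable `portable_simd` feature only when the crate is compiled with `--cfg vortex_nightly` (i.e. `RUSTFLAGS="--cfg vortex_nightly" cargo +nightly ...`, per the commit message), so stable builds never see the feature gate. A minimal sketch of the pattern, with hypothetical function names:

    #![cfg_attr(vortex_nightly, feature(portable_simd))]

    // Nightly-only SIMD path: compiled only when `--cfg vortex_nightly` is set.
    #[cfg(vortex_nightly)]
    fn double_simd(x: u32) -> u32 {
        use std::simd::Simd;
        let v: Simd<u32, 4> = Simd::splat(x) * Simd::splat(2);
        v.to_array()[0]
    }

    // Stable fallback: always compiled.
    fn double_scalar(x: u32) -> u32 {
        x * 2
    }

    fn main() {
        #[cfg(vortex_nightly)]
        assert_eq!(double_simd(21), 42);
        assert_eq!(double_scalar(21), 42);
    }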

vortex-compute/src/take/slice/mod.rs

Lines changed: 1 addition & 0 deletions
@@ -30,6 +30,7 @@ impl<T: NativePType, I: UnsignedPType> Take<[I]> for &[T] {
             }
         }
 
+        #[allow(unreachable_code, reason = "`vortex_nightly` path returns early")]
         take_scalar(self, indices)
     }
 }
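
Why the `unreachable_code` allow: with `vortex_nightly` set, the SIMD branch returns before the scalar tail call, so the compiler flags `take_scalar` as unreachable on that configuration; without the cfg, the call is live. A minimal sketch of the dispatch shape (hypothetical names, not the Vortex trait):

    fn take(values: &[i32], indices: &[u32]) -> Vec<i32> {
        // Under `--cfg vortex_nightly`, this block returns early...
        #[cfg(vortex_nightly)]
        {
            return take_simd(values, indices);
        }

        // ...which makes this tail expression unreachable on that cfg.
        #[allow(unreachable_code, reason = "`vortex_nightly` path returns early")]
        take_scalar(values, indices)
    }

    #[cfg(vortex_nightly)]
    fn take_simd(values: &[i32], indices: &[u32]) -> Vec<i32> {
        take_scalar(values, indices) // stand-in for the real SIMD kernel
    }

    fn take_scalar(values: &[i32], indices: &[u32]) -> Vec<i32> {
        indices.iter().map(|&i| values[i as usize]).collect()
    }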

vortex-compute/src/take/slice/portable.rs

Lines changed: 162 additions & 11 deletions
@@ -6,6 +6,7 @@
 #![cfg(vortex_nightly)]
 
 use std::mem::MaybeUninit;
+use std::mem::size_of;
 use std::mem::transmute;
 use std::simd;
 use std::simd::num::SimdUint;
@@ -17,27 +18,55 @@ use vortex_buffer::BufferMut;
 use vortex_dtype::NativePType;
 use vortex_dtype::PType;
 use vortex_dtype::UnsignedPType;
+use vortex_dtype::match_each_native_simd_ptype;
+use vortex_dtype::match_each_unsigned_integer_ptype;
+
+/// SIMD types larger than the SIMD register size are beneficial for
+/// performance as this leads to better instruction level parallelism.
+pub const SIMD_WIDTH: usize = 64;
 
 /// Takes the specified indices into a new [`Buffer`] using portable SIMD.
+///
+/// This function handles the type matching required to satisfy `SimdElement` bounds.
+/// For `f16` values, it reinterprets them as `u16` since `f16` doesn't implement `SimdElement`.
 #[inline]
-pub fn take_portable<T, I>(buffer: &[T], indices: &[I]) -> Buffer<T>
-where
-    T: NativePType + simd::SimdElement,
-    I: UnsignedPType + simd::SimdElement,
-{
+pub fn take_portable<T: NativePType, I: UnsignedPType>(buffer: &[T], indices: &[I]) -> Buffer<T> {
     if T::PTYPE == PType::F16 {
+        assert_eq!(size_of::<half::f16>(), size_of::<T>());
+
         // Since Rust does not actually support 16-bit floats, we first reinterpret the data as
         // `u16` integers.
+        // SAFETY: We know that f16 has the same bit pattern as u16, so this transmute is fine to
+        // make.
         let u16_slice: &[u16] =
             unsafe { std::slice::from_raw_parts(buffer.as_ptr() as *const u16, buffer.len()) };
+        return take_with_indices(u16_slice, indices).cast_into::<T>();
+    }
 
-        let taken_u16 = take_portable_simd::<u16, I, SIMD_WIDTH>(u16_slice, indices);
-        let taken_f16 = taken_u16.cast_into::<T>();
+    match_each_native_simd_ptype!(T::PTYPE, |TC| {
+        assert_eq!(size_of::<TC>(), size_of::<T>());
 
-        taken_f16
-    } else {
-        take_portable_simd::<T, I, SIMD_WIDTH>(buffer, indices)
-    }
+        // SAFETY: This is essentially a no-op that tricks the compiler into adding the
+        // `simd::SimdElement` bound we need to call `take_with_indices`.
+        let buffer: &[TC] =
+            unsafe { std::slice::from_raw_parts(buffer.as_ptr() as *const TC, buffer.len()) };
+        take_with_indices(buffer, indices).cast_into::<T>()
+    })
+}
+
+/// Helper that matches on index type and calls `take_portable_simd`.
+///
+/// We separate this code out from above to add the [`simd::SimdElement`] constraint.
+#[inline]
+fn take_with_indices<T: NativePType + simd::SimdElement, I: UnsignedPType>(
+    buffer: &[T],
+    indices: &[I],
+) -> Buffer<T> {
+    match_each_unsigned_integer_ptype!(I::PTYPE, |IC| {
+        let indices: &[IC] =
+            unsafe { std::slice::from_raw_parts(indices.as_ptr() as *const IC, indices.len()) };
+        take_portable_simd::<T, IC, SIMD_WIDTH>(buffer, indices)
+    })
 }
 
 /// Takes elements from an array using SIMD indexing.
@@ -110,4 +139,126 @@ mod tests {
         let result = take_portable_simd::<i32, u32, 64>(&values, &indices);
         assert_eq!(result.as_slice(), [0i32; 64]);
     }
+
+    /// Tests SIMD gather with a mix of sequential, strided, and repeated indices. This exercises
+    /// irregular access patterns that stress the gather operation.
+    #[test]
+    fn test_take_mixed_access_patterns() {
+        // Create a values array with distinct elements.
+        let values: Vec<i64> = (0..256).map(|i| i * 100).collect();
+
+        // Build indices with mixed patterns:
+        // - Sequential access (0, 1, 2, ...)
+        // - Strided access (0, 4, 8, ...)
+        // - Repeated indices (same index multiple times)
+        // - Reverse order
+        let mut indices: Vec<u32> = Vec::with_capacity(200);
+
+        // Sequential: indices 0..64.
+        indices.extend(0u32..64);
+        // Strided by 4: 0, 4, 8, ..., 252.
+        indices.extend((0u32..64).map(|i| i * 4));
+        // Repeated: index 42 repeated 32 times.
+        indices.extend(std::iter::repeat(42u32).take(32));
+        // Reverse: 255, 254, ..., 216.
+        indices.extend((216u32..256).rev());
+
+        let result = take_portable_simd::<i64, u32, 64>(&values, &indices);
+        let result_slice = result.as_slice();
+
+        // Verify sequential portion.
+        for i in 0..64 {
+            assert_eq!(result_slice[i], (i as i64) * 100, "sequential at index {i}");
+        }
+
+        // Verify strided portion.
+        for i in 0..64 {
+            assert_eq!(
+                result_slice[64 + i],
+                (i as i64) * 4 * 100,
+                "strided at index {i}"
+            );
+        }
+
+        // Verify repeated portion.
+        for i in 0..32 {
+            assert_eq!(result_slice[128 + i], 42 * 100, "repeated at index {i}");
+        }
+
+        // Verify reverse portion.
+        for i in 0..40 {
+            assert_eq!(
+                result_slice[160 + i],
+                (255 - i as i64) * 100,
+                "reverse at index {i}"
+            );
+        }
+    }
+
+    /// Tests that the scalar remainder path works correctly when the number of indices is not
+    /// evenly divisible by the SIMD lane count.
+    #[test]
+    fn test_take_with_remainder() {
+        let values: Vec<u16> = (0..1000).collect();
+
+        // Use 64 + 37 = 101 indices to test both the SIMD loop (64 elements) and the scalar
+        // remainder (37 elements).
+        let indices: Vec<u8> = (0u8..101).collect();
+
+        let result = take_portable_simd::<u16, u8, 64>(&values, &indices);
+        let result_slice = result.as_slice();
+
+        assert_eq!(result_slice.len(), 101);
+
+        // Verify all elements.
+        for i in 0..101 {
+            assert_eq!(result_slice[i], i as u16, "mismatch at index {i}");
+        }
+
+        // Also test with exactly 1 remainder element.
+        let indices_one_remainder: Vec<u8> = (0u8..65).collect();
+        let result_one = take_portable_simd::<u16, u8, 64>(&values, &indices_one_remainder);
+        assert_eq!(result_one.as_slice().len(), 65);
+        assert_eq!(result_one.as_slice()[64], 64);
+    }
+
+    /// Tests gather with large 64-bit values and various index types to ensure no truncation
+    /// occurs during the operation.
+    #[test]
+    fn test_take_large_values_no_truncation() {
+        // Create values near the edges of i64 range.
+        let values: Vec<i64> = vec![
+            i64::MIN,
+            i64::MIN + 1,
+            -1_000_000_000_000i64,
+            -1,
+            0,
+            1,
+            1_000_000_000_000i64,
+            i64::MAX - 1,
+            i64::MAX,
+        ];
+
+        // Indices that access each value multiple times in different orders.
+        let indices: Vec<u16> = vec![
+            0, 8, 1, 7, 2, 6, 3, 5, 4, // Forward-backward interleaved.
+            8, 8, 8, 0, 0, 0, // Repeated extremes.
+            4, 4, 4, 4, 4, 4, 4, 4, // Repeated zero.
+            0, 1, 2, 3, 4, 5, 6, 7, 8, // Sequential.
+            8, 7, 6, 5, 4, 3, 2, 1, 0, // Reverse.
+            // Pad to 64 to ensure we hit the SIMD path.
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3,
+        ];
+
+        let result = take_portable_simd::<i64, u16, 64>(&values, &indices);
+        let result_slice = result.as_slice();
+
+        // Verify each result matches the expected value.
+        for (i, &idx) in indices.iter().enumerate() {
+            assert_eq!(
+                result_slice[i], values[idx as usize],
+                "mismatch at position {i} for index {idx}"
+            );
+        }
+    }
 }
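
The `take_portable_simd` kernel itself falls outside this diff's hunks. For orientation, here is a minimal sketch of what a portable-SIMD gather of this shape can look like — not the Vortex implementation; the function name, fixed element/index types, and tail handling are assumptions — using `Simd::gather_or_default` so out-of-bounds indices yield zero, consistent with the out-of-bounds expectation in the test context above:

    #![feature(portable_simd)] // requires a nightly toolchain

    use std::simd::{LaneCount, Simd, SupportedLaneCount};

    /// Gathers `values[indices[i]]` into a new Vec, `LANES` elements at a time.
    /// Out-of-bounds indices produce 0, and a scalar loop handles the tail when
    /// `indices.len()` is not a multiple of `LANES`.
    fn take_simd_sketch<const LANES: usize>(values: &[i64], indices: &[u32]) -> Vec<i64>
    where
        LaneCount<LANES>: SupportedLaneCount,
    {
        let mut out = Vec::with_capacity(indices.len());
        let mut chunks = indices.chunks_exact(LANES);

        for chunk in chunks.by_ref() {
            // Widen the indices to usize and gather; out-of-bounds lanes get 0.
            let idx: Simd<usize, LANES> = Simd::from_slice(chunk).cast();
            out.extend_from_slice(&Simd::gather_or_default(values, idx).to_array());
        }
        for &i in chunks.remainder() {
            out.push(values.get(i as usize).copied().unwrap_or_default());
        }
        out
    }

    fn main() {
        let values: Vec<i64> = (0..256).map(|i| i * 100).collect();
        let indices: Vec<u32> = vec![0, 255, 42, 42, 2_000_000]; // last is out of bounds
        assert_eq!(take_simd_sketch::<64>(&values, &indices), [0, 25_500, 4_200, 4_200, 0]);
    }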
