Add Neon, Neonx2, Neonx4 and Neonx8 SIMD implementations

ictrobot · ictrobot · commit 0a0cde027eb7 · 2025-04-02T21:36:04.000+01:00
When run with one thread, neonx2 is ~31% faster on my Raspberry Pi 3 B than the previous fastest implementation (array128).

When run with one thread, neonx8 is ~34% faster on my Raspberry Pi 5 B than the previous fastest implementation (array4096).
diff --git a/crates/utils/src/multiversion.rs b/crates/utils/src/multiversion.rs
@@ -89,6 +89,14 @@ macro_rules! multiversion {
                 AVX2x4 => unsafe { $name::avx2x4::$name($($arg_name),*) },
                 #[cfg(all(feature = "unsafe", feature = "all-simd", any(target_arch = "x86", target_arch = "x86_64")))]
                 AVX2x8 => unsafe { $name::avx2x8::$name($($arg_name),*) },
+                #[cfg(all(feature = "unsafe", target_arch = "aarch64"))]
+                Neon => $name::neon::$name($($arg_name),*),
+                #[cfg(all(feature = "unsafe", feature = "all-simd", target_arch = "aarch64"))]
+                Neonx2 => $name::neonx2::$name($($arg_name),*),
+                #[cfg(all(feature = "unsafe", feature = "all-simd", target_arch = "aarch64"))]
+                Neonx4 => $name::neonx4::$name($($arg_name),*),
+                #[cfg(all(feature = "unsafe", feature = "all-simd", target_arch = "aarch64"))]
+                Neonx8 => $name::neonx8::$name($($arg_name),*),
             }
         }
     };
@@ -184,6 +192,42 @@ macro_rules! multiversion {
 
             $crate::multiversion!{@helper target_feature(enable = "avx2") $($tail)*}
         }
+
+        /// [`multiversion!`] neon implementation.
+        ///
+        /// Re-emits the `$tail` items with the `neon` backend of every listed `$path`
+        /// glob-imported. The items are pasted unchanged, i.e. without going through the
+        /// `@helper target_feature(...)` rule.
+        #[cfg(all(feature = "unsafe", target_arch = "aarch64"))]
+        pub mod neon {
+            #[allow(clippy::allow_attributes, unused_imports, clippy::wildcard_imports)]
+            use {super::*, $($($path::)+neon::*),*};
+
+            $($tail)*
+        }
+
+        /// [`multiversion!`] neonx2 implementation.
+        ///
+        /// Re-emits the `$tail` items with the `neonx2` backend of every listed `$path`
+        /// glob-imported. Only generated when the "all-simd" feature is enabled.
+        #[cfg(all(feature = "unsafe", feature = "all-simd", target_arch = "aarch64"))]
+        pub mod neonx2 {
+            #[allow(clippy::allow_attributes, unused_imports, clippy::wildcard_imports)]
+            use {super::*, $($($path::)+neonx2::*),*};
+
+            $($tail)*
+        }
+
+        /// [`multiversion!`] neonx4 implementation.
+        ///
+        /// Re-emits the `$tail` items with the `neonx4` backend of every listed `$path`
+        /// glob-imported. Only generated when the "all-simd" feature is enabled.
+        #[cfg(all(feature = "unsafe", feature = "all-simd", target_arch = "aarch64"))]
+        pub mod neonx4 {
+            #[allow(clippy::allow_attributes, unused_imports, clippy::wildcard_imports)]
+            use {super::*, $($($path::)+neonx4::*),*};
+
+            $($tail)*
+        }
+
+        /// [`multiversion!`] neonx8 implementation.
+        ///
+        /// Re-emits the `$tail` items with the `neonx8` backend of every listed `$path`
+        /// glob-imported. Only generated when the "all-simd" feature is enabled.
+        #[cfg(all(feature = "unsafe", feature = "all-simd", target_arch = "aarch64"))]
+        pub mod neonx8 {
+            #[allow(clippy::allow_attributes, unused_imports, clippy::wildcard_imports)]
+            use {super::*, $($($path::)+neonx8::*),*};
+
+            $($tail)*
+        }
     };
 
     // Microbenchmark for dynamic dispatch
@@ -219,6 +263,14 @@ macro_rules! multiversion {
                         AVX2x4 => unsafe { avx2x4::$name() },
                         #[cfg(all(feature = "unsafe", feature = "all-simd", any(target_arch = "x86", target_arch = "x86_64")))]
                         AVX2x8 => unsafe { avx2x8::$name() },
+                        #[cfg(all(feature = "unsafe", target_arch = "aarch64"))]
+                        Neon => neon::$name(),
+                        #[cfg(all(feature = "unsafe", feature = "all-simd", target_arch = "aarch64"))]
+                        Neonx2 => neonx2::$name(),
+                        #[cfg(all(feature = "unsafe", feature = "all-simd", target_arch = "aarch64"))]
+                        Neonx4 => neonx4::$name(),
+                        #[cfg(all(feature = "unsafe", feature = "all-simd", target_arch = "aarch64"))]
+                        Neonx8 => neonx8::$name(),
                     });
                     (start.elapsed(), x)
                 })
@@ -395,6 +447,46 @@ macro_rules! multiversion_test {
 
             unsafe { $body }
         }
+
+        // Runs `$body` with the `neon` backend of every listed `$path` glob-imported.
+        // Built only for aarch64 with the "unsafe" feature; `$body` is used as-is, without an
+        // `unsafe` wrapper.
+        #[test]
+        #[cfg(all(feature = "unsafe", target_arch = "aarch64"))]
+        $(#[$m])*
+        fn neon() {
+            #[allow(clippy::allow_attributes, unused_imports, clippy::wildcard_imports)]
+            use {$($($path::)+neon::*),*};
+
+            $body
+        }
+
+        // Runs `$body` with the `neonx2` backend of every listed `$path` glob-imported.
+        // Additionally requires the "all-simd" feature.
+        #[test]
+        #[cfg(all(feature = "unsafe", feature = "all-simd", target_arch = "aarch64"))]
+        $(#[$m])*
+        fn neonx2() {
+            #[allow(clippy::allow_attributes, unused_imports, clippy::wildcard_imports)]
+            use {$($($path::)+neonx2::*),*};
+
+            $body
+        }
+
+        // Runs `$body` with the `neonx4` backend of every listed `$path` glob-imported.
+        // Additionally requires the "all-simd" feature.
+        #[test]
+        #[cfg(all(feature = "unsafe", feature = "all-simd", target_arch = "aarch64"))]
+        $(#[$m])*
+        fn neonx4() {
+            #[allow(clippy::allow_attributes, unused_imports, clippy::wildcard_imports)]
+            use {$($($path::)+neonx4::*),*};
+
+            $body
+        }
+
+        // Runs `$body` with the `neonx8` backend of every listed `$path` glob-imported.
+        // Additionally requires the "all-simd" feature.
+        #[test]
+        #[cfg(all(feature = "unsafe", feature = "all-simd", target_arch = "aarch64"))]
+        $(#[$m])*
+        fn neonx8() {
+            #[allow(clippy::allow_attributes, unused_imports, clippy::wildcard_imports)]
+            use {$($($path::)+neonx8::*),*};
+
+            $body
+        }
     };
 
     (
@@ -471,6 +563,40 @@ macro_rules! multiversion_test {
                 $crate::multiversion_test!(@expr { $($tail)+ });
             }
         }
+
+        // aarch64 variants: evaluate the `$tail` expression once per Neon backend, each time
+        // inside its own scope so that the backend's glob import does not leak into the next.
+        #[cfg(all(feature = "unsafe", target_arch = "aarch64"))]
+        {
+            // Single-register backend; needs only the outer cfg.
+            {
+                #[allow(clippy::allow_attributes, unused_imports, clippy::wildcard_imports)]
+                use {$($($path::)+neon::*),*};
+
+                $crate::multiversion_test!(@expr { $($tail)+ });
+            }
+
+            // The multi-register backends below are only compiled with the "all-simd" feature.
+            #[cfg(feature = "all-simd")]
+            {
+                #[allow(clippy::allow_attributes, unused_imports, clippy::wildcard_imports)]
+                use {$($($path::)+neonx2::*),*};
+
+                $crate::multiversion_test!(@expr { $($tail)+ });
+            }
+
+            #[cfg(feature = "all-simd")]
+            {
+                #[allow(clippy::allow_attributes, unused_imports, clippy::wildcard_imports)]
+                use {$($($path::)+neonx4::*),*};
+
+                $crate::multiversion_test!(@expr { $($tail)+ });
+            }
+
+            #[cfg(feature = "all-simd")]
+            {
+                #[allow(clippy::allow_attributes, unused_imports, clippy::wildcard_imports)]
+                use {$($($path::)+neonx8::*),*};
+
+                $crate::multiversion_test!(@expr { $($tail)+ });
+            }
+        }
     };
     (@expr $e:expr) => { $e }
 }
@@ -536,6 +662,14 @@ versions_impl! {
     AVX2x4 if std::arch::is_x86_feature_detected!("avx2"),
     #[cfg(all(feature = "unsafe", feature = "all-simd", any(target_arch = "x86", target_arch = "x86_64")))]
     AVX2x8 if std::arch::is_x86_feature_detected!("avx2"),
+    #[cfg(all(feature = "unsafe", target_arch = "aarch64"))]
+    Neon,
+    #[cfg(all(feature = "unsafe", feature = "all-simd", target_arch = "aarch64"))]
+    Neonx2,
+    #[cfg(all(feature = "unsafe", feature = "all-simd", target_arch = "aarch64"))]
+    Neonx4,
+    #[cfg(all(feature = "unsafe", feature = "all-simd", target_arch = "aarch64"))]
+    Neonx8,
 }
 
 static OVERRIDE: OnceLock<Option<Version>> = OnceLock::new();
diff --git a/crates/utils/src/simd/mod.rs b/crates/utils/src/simd/mod.rs
@@ -20,4 +20,12 @@ pub use avx2_impl::avx2;
 ))]
 pub use avx2_impl::{avx2x2, avx2x4, avx2x8};
 
+#[cfg(all(feature = "unsafe", target_arch = "aarch64"))]
+#[path = "neon.rs"]
+mod neon_impl;
+#[cfg(all(feature = "unsafe", target_arch = "aarch64"))]
+pub use neon_impl::neon;
+#[cfg(all(feature = "unsafe", feature = "all-simd", target_arch = "aarch64"))]
+pub use neon_impl::{neonx2, neonx4, neonx8};
+
 pub mod scalar;
diff --git a/crates/utils/src/simd/neon.rs b/crates/utils/src/simd/neon.rs
@@ -0,0 +1,149 @@
+//! Neon vector implementations.
+
+use std::array::from_fn;
+use std::ops::{Add, BitAnd, BitOr, BitXor, Not};
+
+#[expect(clippy::wildcard_imports)]
+use std::arch::aarch64::*;
+
+/// Neon [u32] vector implementation.
+///
+/// Wraps `V` `uint32x4_t` registers (four [u32] lanes each), which together hold `L` lanes.
+/// The expected relationship `V * 4 == L` is asserted when the `LANES` associated constant is
+/// evaluated.
+#[derive(Clone, Copy)]
+#[repr(transparent)]
+pub struct U32Vector<const V: usize, const L: usize>([uint32x4_t; V]);
+
+impl<const V: usize, const L: usize> From<[u32; L]> for U32Vector<V, L> {
+    /// Loads `L` lanes from `value`, four consecutive elements per Neon register.
+    ///
+    /// Instantiations where `V * 4 != L` are rejected at compile time: the 16-byte loads
+    /// below would otherwise be able to read past the end of `value`.
+    #[inline]
+    fn from(value: [u32; L]) -> Self {
+        // Checked for every instantiation of this function. `LANES` carries the same assert,
+        // but an associated constant is only evaluated where it is referenced.
+        const { assert!(V * 4 == L, "U32Vector requires L == V * 4") };
+
+        Self(from_fn(|i| {
+            // Slice exactly one register's worth so the bounds check covers the whole load.
+            let chunk = &value[i * 4..i * 4 + 4];
+            // SAFETY: `chunk` is a bounds-checked slice of exactly four `u32`s, which is the
+            // amount `vld1q_u32` reads from the pointer.
+            unsafe { vld1q_u32(chunk.as_ptr()) }
+        }))
+    }
+}
+
+impl<const V: usize, const L: usize> From<U32Vector<V, L>> for [u32; L] {
+    /// Stores the registers of `value` into a zero-initialised array, four lanes per register.
+    #[inline]
+    fn from(value: U32Vector<V, L>) -> Self {
+        let mut result = [0; L];
+        // `zip` stops at the shorter side, so the number of stores never exceeds the number of
+        // complete four-element chunks in `result`.
+        for (&v, r) in value.0.iter().zip(result.chunks_exact_mut(4)) {
+            // SAFETY: `r` comes from `chunks_exact_mut(4)`, so it is exactly four writable
+            // `u32`s, which is the amount `vst1q_u32` stores.
+            unsafe {
+                vst1q_u32(r.as_mut_ptr(), v);
+            }
+        }
+        result
+    }
+}
+
+impl<const V: usize, const L: usize> Add for U32Vector<V, L> {
+    type Output = Self;
+
+    /// Lane-wise addition, applying `vaddq_u32` to each pair of registers.
+    #[inline]
+    fn add(self, rhs: Self) -> Self::Output {
+        // SAFETY: `vaddq_u32` only operates on the register values passed in; no memory is
+        // accessed. Assumes NEON is available wherever this aarch64-only module is built
+        // (TODO confirm).
+        Self(from_fn(|i| unsafe { vaddq_u32(self.0[i], rhs.0[i]) }))
+    }
+}
+
+impl<const V: usize, const L: usize> BitAnd for U32Vector<V, L> {
+    type Output = Self;
+
+    /// Bitwise AND, applying `vandq_u32` to each pair of registers.
+    #[inline]
+    fn bitand(self, rhs: Self) -> Self::Output {
+        // SAFETY: `vandq_u32` only operates on the register values passed in; no memory is
+        // accessed. Assumes NEON is available wherever this aarch64-only module is built
+        // (TODO confirm).
+        Self(from_fn(|i| unsafe { vandq_u32(self.0[i], rhs.0[i]) }))
+    }
+}
+
+impl<const V: usize, const L: usize> BitOr for U32Vector<V, L> {
+    type Output = Self;
+
+    /// Bitwise OR, applying `vorrq_u32` to each pair of registers.
+    #[inline]
+    fn bitor(self, rhs: Self) -> Self::Output {
+        // SAFETY: `vorrq_u32` only operates on the register values passed in; no memory is
+        // accessed. Assumes NEON is available wherever this aarch64-only module is built
+        // (TODO confirm).
+        Self(from_fn(|i| unsafe { vorrq_u32(self.0[i], rhs.0[i]) }))
+    }
+}
+
+impl<const V: usize, const L: usize> BitXor for U32Vector<V, L> {
+    type Output = Self;
+
+    /// Bitwise XOR, applying `veorq_u32` to each pair of registers.
+    #[inline]
+    fn bitxor(self, rhs: Self) -> Self::Output {
+        // SAFETY: `veorq_u32` only operates on the register values passed in; no memory is
+        // accessed. Assumes NEON is available wherever this aarch64-only module is built
+        // (TODO confirm).
+        Self(from_fn(|i| unsafe { veorq_u32(self.0[i], rhs.0[i]) }))
+    }
+}
+
+impl<const V: usize, const L: usize> Not for U32Vector<V, L> {
+    type Output = Self;
+
+    /// Inverts every bit of every lane.
+    #[inline]
+    fn not(self) -> Self::Output {
+        // SAFETY: `vmvnq_u32` only operates on the register value passed in; no memory is
+        // accessed. Assumes NEON is available wherever this aarch64-only module is built
+        // (TODO confirm).
+        Self(from_fn(|idx| unsafe { vmvnq_u32(self.0[idx]) }))
+    }
+}
+
+impl<const V: usize, const L: usize> U32Vector<V, L> {
+    /// Total number of [u32] lanes.
+    ///
+    /// Evaluating this constant asserts that `L` equals `V` registers of four lanes each.
+    pub const LANES: usize = {
+        assert!(V * 4 == L);
+        L
+    };
+
+    /// Computes `self & !rhs` for every lane (`vbicq_u32`, bit clear).
+    #[inline]
+    #[must_use]
+    pub fn andnot(self, rhs: Self) -> Self {
+        // SAFETY: `vbicq_u32` only operates on the register values passed in; no memory is
+        // accessed. Assumes NEON is available wherever this aarch64-only module is built
+        // (TODO confirm).
+        Self(from_fn(|i| unsafe { vbicq_u32(self.0[i], rhs.0[i]) }))
+    }
+
+    /// Creates a vector with every lane set to `v`.
+    #[inline]
+    #[must_use]
+    pub fn splat(v: u32) -> Self {
+        // SAFETY: `vdupq_n_u32` only broadcasts the scalar passed in; no memory is accessed.
+        Self([unsafe { vdupq_n_u32(v) }; V])
+    }
+
+    /// Rotates every lane left by `n` bits.
+    ///
+    /// `n` is taken modulo 32, matching [`u32::rotate_left`].
+    #[inline]
+    #[must_use]
+    pub fn rotate_left(self, n: u32) -> Self {
+        // Reduce first: with an unreduced count above 32 the second shift count below becomes
+        // positive, so both halves would shift left and the result would not be a rotation.
+        let n = n % 32;
+        // `vshlq_u32` shifts left for positive counts and right for negative ones, so the two
+        // halves are `x << n` and `x >> (32 - n)`. For `n == 0` the right shift by 32 yields
+        // zero and the lane is returned unchanged.
+        // SAFETY: all intrinsics here only operate on the values passed in; no memory is
+        // accessed.
+        Self(from_fn(|i| unsafe {
+            #[expect(clippy::cast_possible_wrap)]
+            vorrq_u32(
+                vshlq_u32(self.0[i], vdupq_n_s32(n as i32)),
+                vshlq_u32(self.0[i], vdupq_n_s32(-(32 - n as i32))),
+            )
+        }))
+    }
+}
+
+/// Vector implementations using a single Neon vector.
+///
+/// Exposes `SIMD_BACKEND` and `U32Vector` under the same names as the other `neon*` modules,
+/// so a glob import of one of them selects the vector width.
+pub mod neon {
+    /// The name of this backend.
+    pub const SIMD_BACKEND: &str = "neon";
+
+    /// Neon vector with four [u32] lanes.
+    pub type U32Vector = super::U32Vector<1, 4>;
+}
+
+/// Vector implementations using two Neon vectors.
+///
+/// Only compiled with the "all-simd" feature. Exposes the same item names as the `neon`
+/// module.
+#[cfg(feature = "all-simd")]
+pub mod neonx2 {
+    /// The name of this backend.
+    pub const SIMD_BACKEND: &str = "neonx2";
+
+    /// Two Neon vectors with eight total [u32] lanes.
+    pub type U32Vector = super::U32Vector<2, 8>;
+}
+
+/// Vector implementations using four Neon vectors.
+///
+/// Only compiled with the "all-simd" feature. Exposes the same item names as the `neon`
+/// module.
+#[cfg(feature = "all-simd")]
+pub mod neonx4 {
+    /// The name of this backend.
+    pub const SIMD_BACKEND: &str = "neonx4";
+
+    /// Four Neon vectors with sixteen total [u32] lanes.
+    pub type U32Vector = super::U32Vector<4, 16>;
+}
+
+/// Vector implementations using eight Neon vectors.
+///
+/// Only compiled with the "all-simd" feature. Exposes the same item names as the `neon`
+/// module.
+#[cfg(feature = "all-simd")]
+pub mod neonx8 {
+    /// The name of this backend.
+    pub const SIMD_BACKEND: &str = "neonx8";
+
+    /// Eight Neon vectors with thirty-two total [u32] lanes.
+    pub type U32Vector = super::U32Vector<8, 32>;
+}