Skip to content

Commit d035310

Browse files
author
Fikret Ardal
committed
SIMD gather functions with a constant stride, for every supported element type, together with their tests
1 parent 57c3b3f commit d035310

File tree

6 files changed

+255
-62
lines changed

6 files changed

+255
-62
lines changed

c++/nda/simd/arch/AVX/functions.hpp

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -547,6 +547,48 @@ namespace nda::simd {
547547
return -(x * y + z);
548548
}
549549

550+
//Gather functions.
551+
template <>
552+
inline simd_i8 gather(const simd_i8::value_t *from, const long stride) {
553+
simd_i8 simd_stride(static_cast<int32_t>(stride));
554+
const simd_i8 multiplier({0, 1, 2, 3, 4, 5, 6, 7});
555+
simd_i8 vindex = simd_stride * multiplier;
556+
return simd_i8(_mm256_i32gather_epi32(from, vindex, sizeof(simd_i8::value_t)));
557+
}
558+
559+
template <>
560+
inline simd_l4 gather(const simd_l4::value_t *from, const long stride) {
561+
simd_l4 simd_stride(stride);
562+
const simd_l4 multiplier({0, 1, 2, 3});
563+
simd_l4 vindex = simd_stride * multiplier;
564+
return simd_l4(_mm256_i64gather_epi64(reinterpret_cast<const long long int*>(from), vindex, sizeof(simd_l4::value_t)));
565+
}
566+
567+
template <>
568+
inline simd_f8 gather(const simd_f8::value_t *from, const long stride) {
569+
simd_i8 simd_stride(static_cast<int32_t>(stride));
570+
const simd_i8 multiplier({0, 1, 2, 3, 4, 5, 6, 7});
571+
simd_i8 vindex = simd_stride * multiplier;
572+
return simd_f8(_mm256_i32gather_ps(from, vindex, sizeof(simd_f8::value_t)));
573+
}
574+
575+
template <>
576+
inline simd_d4 gather(const simd_d4::value_t *from, const long stride) {
577+
simd_l4 simd_stride(stride);
578+
const simd_l4 multiplier({0, 1, 2, 3});
579+
simd_l4 vindex = simd_stride * multiplier;
580+
return simd_d4(_mm256_i64gather_pd(from, vindex, sizeof(simd_d4::value_t)));
581+
}
582+
583+
template <>
584+
inline simd_cf4 gather(const simd_cf4::value_t *from, const long stride) {
585+
return simd_cf4(_mm256_castpd_ps(gather<simd_d4>(reinterpret_cast<const simd_d4::value_t *>(from), stride)));
586+
}
587+
588+
template <>
589+
inline simd_cd2 gather(const simd_cd2::value_t *from, const long stride) {
590+
return simd_cd2(_mm256_set_pd(from[stride].imag(), from[stride].real(), from[0].imag(), from[0].real()));
591+
}
550592

551593
} // namespace nda::simd
552594
#endif

c++/nda/simd/arch/AVX512/functions.hpp

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -446,6 +446,56 @@ namespace nda::simd {
446446
inline simd_l8 fma_nsub(const simd_l8 &x, const simd_l8 &y, const simd_l8 &z) {
447447
return -(x * y + z);
448448
}
449+
//Gather functions.
450+
template <>
451+
inline simd_i16 gather(const simd_i16::value_t *from, const long stride) {
452+
simd_i16 simd_stride(static_cast<int32_t>(stride));
453+
const simd_i16 multiplier({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
454+
simd_i16 vindex = simd_stride * multiplier;
455+
return simd_l8(_mm512_i64gather_epi64(vindex, from, sizeof(simd_l8::value_t)));
456+
}
457+
458+
template <>
459+
inline simd_l8 gather(const simd_l8::value_t *from, const long stride) {
460+
simd_l8 simd_stride(stride);
461+
const simd_l8 multiplier({0, 1, 2, 3, 4, 5, 6, 7});
462+
simd_l8 vindex = simd_stride * multiplier;
463+
return simd_l8(_mm512_i64gather_epi64(vindex, from, sizeof(simd_l8::value_t)));
464+
}
465+
466+
template <>
467+
inline simd_f16 gather(const simd_f16::value_t *from, const long stride) {
468+
simd_i16 simd_stride(static_cast<int32_t>(stride));
469+
const simd_i16 multiplier({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
470+
simd_i16 vindex = simd_stride * multiplier;
471+
return simd_f16(_mm512_i32gather_ps(vindex, from, sizeof(simd_f16::value_t)));
472+
}
473+
474+
template <>
475+
inline simd_d8 gather(const simd_d8::value_t *from, const long stride) {
476+
simd_l8 simd_stride(stride);
477+
const simd_l8 multiplier({0, 1, 2, 3, 4, 5, 6, 7});
478+
simd_l8 vindex = simd_stride * multiplier;
479+
return simd_d8(_mm512_i64gather_pd(vindex, from, sizeof(simd_d8::value_t)));
480+
}
481+
482+
template <>
483+
inline simd_cf8 gather(const simd_cf8::value_t *from, const long stride) {
484+
return simd_cf8(_mm512_castpd_ps(gather<simd_d8>(reinterpret_cast<const simd_d8::value_t*>(from), stride)));
485+
}
486+
487+
template <>
488+
inline simd_cd4 gather(const simd_cd4::value_t *from, const long stride) {
489+
simd_cd1 a,b,c,d;
490+
a.load_unaligned(from);
491+
b.load_unaligned(from + stride);
492+
c.load_unaligned(from + 2 * stride);
493+
d.load_unaligned(from + 3 * stride);
494+
__m256d ab = _mm256_insertf128_pd(_mm256_castpd128_pd256(a), b, 1);
495+
__m256d cd = _mm256_insertf128_pd(_mm256_castpd128_pd256(c), d, 1);
496+
return simd_cd4(_mm512_insertf64x4(_mm512_castpd256_pd512(ab), cd , 1))
497+
498+
}
449499

450500
} // namespace nda::simd
451501
#endif

c++/nda/simd/arch/Default/functions.hpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -344,4 +344,36 @@ namespace nda::simd {
344344
inline simd_l1 fma_nsub(const simd_l1 &x, const simd_l1 &y, const simd_l1 &z) {
345345
return -(x * y + z);
346346
}
347+
348+
//Gather functions
349+
template <>
350+
inline simd_i1 gather(const simd_i1::value_t *from, [[maybe_unused]]const long stride) {
351+
return simd_i1(from);
352+
}
353+
354+
template <>
355+
inline simd_l1 gather(const simd_l1::value_t *from, [[maybe_unused]]const long stride) {
356+
return simd_l1(from);
357+
}
358+
359+
template <>
360+
inline simd_f1 gather(const simd_f1::value_t *from,[[maybe_unused]] const long stride) {
361+
return simd_f1(from);
362+
}
363+
364+
template <>
365+
inline simd_d1 gather(const simd_d1::value_t *from, [[maybe_unused]] const long stride) {
366+
return simd_d1(from);
367+
}
368+
369+
template <>
370+
inline simd_cf1 gather(const simd_cf1::value_t *from, [[maybe_unused]] const long stride) {
371+
return simd_cf1(from);
372+
}
373+
374+
template <>
375+
inline simd_cd1_d gather(const simd_cd1_d::value_t *from,[[maybe_unused]] const long stride) {
376+
return simd_cd1_d(from);
377+
}
378+
347379
} // namespace nda::simd

c++/nda/simd/arch/SSE/functions.hpp

Lines changed: 62 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -367,21 +367,21 @@ namespace nda::simd {
367367

368368
template <>
369369
inline simd_cf2 fma_nadd(const simd_cf2 &x, const simd_cf2 &y, const simd_cf2 &z) {
370-
__m128 x_odd = _mm_movehdup_ps(x);
371-
__m128 x_even = _mm_moveldup_ps(x);
372-
__m128 y_swap = _mm_permute_ps(y, NDA_SHUFFLE_MASK4(1,0,3,2));
370+
__m128 x_odd = _mm_movehdup_ps(x);
371+
__m128 x_even = _mm_moveldup_ps(x);
372+
__m128 y_swap = _mm_permute_ps(y, NDA_SHUFFLE_MASK4(1, 0, 3, 2));
373373
simd_cf2 y_swap_conj = conj(simd_cf2(y_swap)); // TODO: Eigen bug create issue maybe in eigen.
374-
__m128 result = _mm_fmsub_ps(x_odd, y_swap_conj, _mm_fmsub_ps(x_even, y, z));
374+
__m128 result = _mm_fmsub_ps(x_odd, y_swap_conj, _mm_fmsub_ps(x_even, y, z));
375375
return simd_cf2(result);
376376
}
377377

378378
template <>
379379
inline simd_cd1 fma_nadd(const simd_cd1 &x, const simd_cd1 &y, const simd_cd1 &z) {
380-
__m128d x_odd = _mm_permute_pd(x, 0x3);
381-
__m128d x_even = _mm_movedup_pd(x);
382-
__m128d y_swap = _mm_permute_pd(y, 0x1);
380+
__m128d x_odd = _mm_permute_pd(x, 0x3);
381+
__m128d x_even = _mm_movedup_pd(x);
382+
__m128d y_swap = _mm_permute_pd(y, 0x1);
383383
simd_cd1 y_swap_conj = conj(simd_cd1(y_swap));
384-
__m128d result = _mm_fmsub_pd(x_odd, y_swap_conj, _mm_fmsub_pd(x_even, y, z));
384+
__m128d result = _mm_fmsub_pd(x_odd, y_swap_conj, _mm_fmsub_pd(x_even, y, z));
385385
return simd_cd1(result);
386386
}
387387

@@ -398,19 +398,19 @@ namespace nda::simd {
398398

399399
template <>
400400
inline simd_cf2 fma_nsub(const simd_cf2 &x, const simd_cf2 &y, const simd_cf2 &z) {
401-
__m128 x_odd = _mm_movehdup_ps(x);
402-
__m128 x_even = _mm_moveldup_ps(x);
403-
__m128 y_swap = _mm_permute_ps(y, NDA_SHUFFLE_MASK4(1, 0, 3, 2));
401+
__m128 x_odd = _mm_movehdup_ps(x);
402+
__m128 x_even = _mm_moveldup_ps(x);
403+
__m128 y_swap = _mm_permute_ps(y, NDA_SHUFFLE_MASK4(1, 0, 3, 2));
404404
simd_cf2 y_swap_conj = conj(simd_cf2(y_swap));
405-
__m128 result = _mm_fmsub_ps(x_odd, y_swap_conj, _mm_fmadd_ps(x_even, y, z));
405+
__m128 result = _mm_fmsub_ps(x_odd, y_swap_conj, _mm_fmadd_ps(x_even, y, z));
406406
return simd_cf2(result);
407407
}
408408

409409
template <>
410410
inline simd_cd1 fma_nsub(const simd_cd1 &x, const simd_cd1 &y, const simd_cd1 &z) {
411-
__m128d x_odd = _mm_permute_pd(x, 0x3);
412-
__m128d x_even = _mm_movedup_pd(x);
413-
__m128d y_swap = _mm_permute_pd(y, 0x1);
411+
__m128d x_odd = _mm_permute_pd(x, 0x3);
412+
__m128d x_even = _mm_movedup_pd(x);
413+
__m128d y_swap = _mm_permute_pd(y, 0x1);
414414
simd_cd1 y_swap_conj = conj(simd_cd1(y_swap));
415415

416416
__m128d result = _mm_fmsub_pd(x_odd, y_swap_conj, _mm_fmadd_pd(x_even, y, z));
@@ -546,5 +546,52 @@ namespace nda::simd {
546546
inline simd_l2 fma_nsub(const simd_l2 &x, const simd_l2 &y, const simd_l2 &z) {
547547
return -(x * y + z);
548548
}
549+
// Gather functions: the per-lane offsets (multiples of the stride) are carried in vindex.
550+
#ifdef __AVX2__
551+
template <>
552+
inline simd_i4 gather(const simd_i4::value_t *from, const long stride) {
553+
simd_i4 simd_stride(static_cast<int32_t>(stride));
554+
const simd_i4 multiplier({0, 1, 2, 3});
555+
simd_i4 vindex = simd_stride * multiplier;
556+
return simd_i4(_mm_i32gather_epi32(from, vindex, sizeof(simd_i4::value_t)));
557+
}
558+
559+
template <>
560+
inline simd_l2 gather(const simd_l2::value_t * from, const long stride) {
561+
simd_l2 simd_stride(stride);
562+
const simd_l2 multiplier({0, 1});
563+
simd_l2 vindex = simd_stride * multiplier;
564+
return simd_l2(_mm_i64gather_epi64(reinterpret_cast<const long long int*>(from), vindex, sizeof(simd_l2::value_t)));
565+
}
566+
567+
template <>
568+
inline simd_f4 gather(const simd_f4::value_t *from, const long stride) {
569+
simd_i4 simd_stride(static_cast<int32_t>(stride));
570+
const simd_i4 multiplier({0, 1, 2, 3});
571+
simd_i4 vindex = simd_stride * multiplier;
572+
return simd_f4(_mm_i32gather_ps(from, vindex, sizeof(simd_f4::value_t)));
573+
}
574+
575+
template <>
576+
inline simd_d2 gather(const simd_d2::value_t *from, const long stride) {
577+
simd_l2 simd_stride(stride);
578+
const simd_l2 multiplier({0, 1});
579+
simd_l2 vindex = simd_stride * multiplier;
580+
return simd_d2(_mm_i64gather_pd(from, vindex, sizeof(simd_d2::value_t)));
581+
}
582+
583+
template <>
584+
inline simd_cf2 gather(const simd_cf2::value_t *from, const long stride) {
585+
return simd_cf2(_mm_castpd_ps(gather<simd_d2>(reinterpret_cast<const simd_d2::value_t *>(from), stride)));
586+
}
587+
588+
template <>
589+
inline simd_cd1 gather(const simd_cd1::value_t *from, [[maybe_unused]] const long stride) {
590+
simd_cd1 tmp;
591+
tmp.load_unaligned(from);
592+
return tmp;
593+
}
594+
595+
#endif
549596
} // namespace nda::simd
550597
#endif

c++/nda/simd/arch/functions_forward.hpp

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
#pragma once
22

3+
#include <array>
4+
35
namespace nda::simd {
46
template <typename T>
57
T abs(const T &);
@@ -28,19 +30,19 @@ namespace nda::simd {
2830
template <typename T>
2931
typename T::value_t reduce_mul(const T &);
3032

31-
//TODO: Implement these
32-
template<typename T>
33-
T fma_add(const T&, const T&, const T&);
34-
35-
template<typename T>
36-
T fma_sub(const T&, const T&, const T&);
33+
template <typename T>
34+
T fma_add(const T &, const T &, const T &);
3735

38-
template<typename T>
39-
T fma_nadd(const T&, const T&, const T&);
36+
template <typename T>
37+
T fma_sub(const T &, const T &, const T &);
4038

41-
template<typename T>
42-
T fma_nsub(const T&, const T&, const T&);
39+
template <typename T>
40+
T fma_nadd(const T &, const T &, const T &);
4341

42+
template <typename T>
43+
T fma_nsub(const T &, const T &, const T &);
4444

45+
template <typename T>
46+
T gather(const typename T::value_t *, const long);
4547

4648
} // namespace nda::simd

0 commit comments

Comments
 (0)