Commit 2211077

ax3l and AlexanderSinn authored
AMReX SIMD Helpers (#4520)
## Summary

AMReX does not yet have a concept to help users write effective SIMD code on CPU, beyond relying on auto-vectorization and pragmas, which are unreliable for any sufficiently complex code. [1]

Luckily, C++ `std::datapar` was just accepted into C++26, which gives us an easy way in to write portable SIMD/scalar code. Yet, I have not found a compiler/stdlib with support for it so far, so I finally had a play with the C++17 `<experimental/simd>` headers, which are not as complete as C++26 but a good entry point, especially when complemented with the https://github.com/mattkretz/vir-simd library.

This PR adds initial support for portable user code by providing:
- build system support: `AMReX_SIMD` (default is OFF), relying on [vir-simd](https://github.com/mattkretz/vir-simd)
- an `AMReX_SIMD.H` header that handles includes & helper types
- `ParallelForSIMD<SIMD_WIDTH>(...)`

## Additional background

[1] Fun fact one: As written in the [story behind Intel's ispc compiler](https://pharr.org/matt/blog/2018/04/18/ispc-origins) and credited to [Tim Foley](http://graphics.stanford.edu/~tfoley/), *auto-vectorization is not a programming model.*

Fun fact two: This is as ad hoc as the implementation of [data parallel types / SIMD in Kokkos](https://kokkos.org/kokkos-core-wiki/API/simd/simd.html), it seems.
## User-Code Examples & Benchmark

[Please see this ImpactX PR for details.](BLAST-ImpactX/impactx#1002)

## Checklist

- [x] clean up commits (separate commits)
- [x] finalize fallbacks & CI checks
  - [ ] add a `vir::stdx::simd` test in CI
  - [x] CMake
  - [ ] GnuMake
- [x] `AMReX_SIMD.H`
- [x] `ParallelForSIMD`
- [x] `ParticleIdWrapper::make_(in)valid(mask)`
- [x] clean up `sincos` support
- [x] `SmallMatrix`
- [x] Support for `GpuComplex` (minimal)
- [x] Support [passing WIDTH as compile-time meta-data](https://godbolt.org/z/7455hqrEc) to the callee in `ParallelForSIMD`
- [ ] include documentation in the code and/or rst files, if appropriate
- [x] add `vir::stdx::simd` in package managers:
  - [x] Spack [vir-simd](spack/spack-packages#332)
  - [x] Conda [vir-simd](conda-forge/staged-recipes#30377)

## Future Ideas / PRs

- allocate particle arrays aligned so we can use [stdx::vector_aligned](https://en.cppreference.com/w/cpp/experimental/simd/vector_aligned.html) (for [copies](https://en.cppreference.com/w/cpp/experimental/simd/simd/copy_from) into/out of vector registers - note: makes no difference anymore on modern CPUs)
- Support more/all functions in `ParticleIdWrapper`/`ParticleCpuWrapper`
- Support for [vir::simdize<std::complex<T>>](mattkretz/vir-simd#42) instead of `GpuComplex<SIMD>`
- `ParallelFor` ND support
- `ParallelFor`/`ParallelForSIMD`: one could, maybe, with enable-if magic, etc., fuse them into a single name again
- CMake superbuild: `vir-simd` auto-download for convenience (opt-out)
- Build system: "SIMD provider" selection, once we can opt in to a C++26 compiler+stdlib instead of C++17 TS2 + vir-simd
- Update AMReX package in package management:
  - Spack [vir-simd](spack/spack-packages#332)
  - Conda [vir-simd](conda-forge/staged-recipes#30377)

---------

Co-authored-by: Alexander Sinn <[email protected]>
1 parent fa6f5e3 commit 2211077

16 files changed: +379 −26 lines

Docs/sphinx_documentation/source/BuildingAMReX.rst

Lines changed: 4 additions & 0 deletions

```diff
@@ -455,6 +455,8 @@ The list of available options is reported in the :ref:`table <tab:cmakevar>` bel
 +------------------------------+-------------------------------------------------+-------------------------+-----------------------+
 | AMReX_MPI                    | Build with MPI support                          | YES                     | YES, NO               |
 +------------------------------+-------------------------------------------------+-------------------------+-----------------------+
+| AMReX_SIMD                   | Enable SIMD Primitives (using vir::stdx::simd)  | NO                      | YES, NO               |
++------------------------------+-------------------------------------------------+-------------------------+-----------------------+
 | AMReX_OMP                    | Build with OpenMP support                       | NO                      | YES, NO               |
 +------------------------------+-------------------------------------------------+-------------------------+-----------------------+
 | AMReX_GPU_BACKEND            | Build with on-node, accelerated GPU backend     | NONE                    | NONE, SYCL, HIP, CUDA |
@@ -683,6 +685,8 @@ A list of AMReX component names and related configure options are shown in the t
 +------------------------------+-----------------+
 | AMReX_MPI                    | MPI             |
 +------------------------------+-----------------+
+| AMReX_SIMD                   | SIMD            |
++------------------------------+-----------------+
 | AMReX_OMP                    | OMP             |
 +------------------------------+-----------------+
 | AMReX_GPU_BACKEND            | CUDA, HIP, SYCL |
```
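For context, the new option is enabled at configure time like any other AMReX CMake switch. A sketch (assuming the vir-simd headers are discoverable by the build, e.g. installed system-wide or vendored):

```shell
# Hypothetical configure invocation: enables the new SIMD primitives.
# Assumes vir-simd can be found by AMReX's build system.
cmake -S . -B build -DAMReX_SIMD=ON
cmake --build build -j 8
```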

Src/Base/AMReX_GpuComplex.H

Lines changed: 12 additions & 8 deletions

```diff
@@ -53,8 +53,9 @@ struct alignas(2*sizeof(T)) GpuComplex
     /**
      * \brief Add a real number to this complex number.
      */
+    template <typename U>
     AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-    GpuComplex<T>& operator+= (const T& a_t) noexcept
+    GpuComplex<T>& operator+= (const U& a_t) noexcept
     {
         m_real += a_t;
         return *this;
@@ -63,8 +64,9 @@ struct alignas(2*sizeof(T)) GpuComplex
     /**
      * \brief Subtract a real number from this complex number.
      */
+    template <typename U>
     AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-    GpuComplex<T>& operator-= (const T& a_t) noexcept
+    GpuComplex<T>& operator-= (const U& a_t) noexcept
     {
         m_real -= a_t;
         return *this;
@@ -73,8 +75,9 @@ struct alignas(2*sizeof(T)) GpuComplex
     /**
      * \brief Multiply this complex number by a real.
      */
+    template <typename U>
     AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-    GpuComplex<T>& operator*= (const T& a_t) noexcept
+    GpuComplex<T>& operator*= (const U& a_t) noexcept
     {
         m_real *= a_t;
         m_imag *= a_t;
@@ -84,8 +87,9 @@ struct alignas(2*sizeof(T)) GpuComplex
     /**
      * \brief Divide this complex number by a real.
      */
+    template <typename U>
     AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-    GpuComplex<T>& operator/= (const T& a_t) noexcept
+    GpuComplex<T>& operator/= (const U& a_t) noexcept
     {
         m_real /= a_t;
         m_imag /= a_t;
@@ -247,9 +251,9 @@ GpuComplex<T> operator+ (const T& a_x, const GpuComplex<T>& a_y) noexcept
 /**
  * \brief Multiply two complex numbers.
  */
-template <typename T>
+template <typename T, typename U>
 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-GpuComplex<T> operator* (const GpuComplex<T>& a_x, const GpuComplex<T>& a_y) noexcept
+GpuComplex<T> operator* (const GpuComplex<T>& a_x, const GpuComplex<U>& a_y) noexcept
 {
     GpuComplex<T> r = a_x;
     r *= a_y;
@@ -259,9 +263,9 @@ GpuComplex<T> operator* (const GpuComplex<T>& a_x, const GpuComplex<T>& a_y) noe
 /**
  * \brief Multiply a complex number by a real one.
  */
-template <typename T>
+template <typename T, typename U>
 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-GpuComplex<T> operator* (const GpuComplex<T>& a_x, const T& a_y) noexcept
+GpuComplex<T> operator* (const GpuComplex<T>& a_x, const U& a_y) noexcept
 {
     GpuComplex<T> r = a_x;
     r *= a_y;
```

Src/Base/AMReX_GpuLaunch.H

Lines changed: 1 addition & 0 deletions

```diff
@@ -234,6 +234,7 @@ namespace Gpu {
 #else
 #include <AMReX_GpuLaunchMacrosC.H>
 #include <AMReX_GpuLaunchFunctsC.H>
+#include <AMReX_SIMD.H>
 #endif
 
 #include <AMReX_GpuLaunch.nolint.H>
```

Src/Base/AMReX_GpuLaunchFunctsC.H

Lines changed: 43 additions & 0 deletions

```diff
@@ -4,6 +4,24 @@
 
 namespace amrex {
 
+/** Helper type to store/access the SIMD width in ParallelForSIMD lambdas
+ *
+ * Use instead of int as the running index i. Used to pass the
+ * SIMD WIDTH as compile-time meta-data into a called function/method.
+ *
+ * @tparam WIDTH SIMD width in elements
+ * @tparam N index type (integer)
+ */
+template<int WIDTH, class N=int>
+struct SIMDindex
+{
+    /** SIMD width in elements */
+    static constexpr int width = WIDTH;
+
+    /** The linear loop index of ParallelFor(SIMD) */
+    N index = 0;
+};
+
 namespace detail {
 
 // call_f_scalar_handler
@@ -175,6 +193,31 @@ void ParallelFor (T n, L&& f) noexcept
     ParallelFor(n, std::forward<L>(f));
 }
 
+/** ParallelFor with a SIMD Width (in elements)
+ *
+ * SIMD load/write-back operations need to be performed before/after calling this.
+ *
+ * @tparam WIDTH SIMD width in elements
+ * @tparam N index type (integer)
+ * @tparam L function/functor to call per SIMD set of elements
+ */
+template <int WIDTH, typename N, typename L, typename M=std::enable_if_t<std::is_integral_v<N>> >
+AMREX_ATTRIBUTE_FLATTEN_FOR
+void ParallelForSIMD (N n, L const& f) noexcept
+{
+    N i = 0;
+    // vectorize full lanes
+    for (; i + WIDTH <= n; i+=WIDTH) {
+        f(SIMDindex<WIDTH, N>{i});
+    }
+    // scalar handling of the remainder
+    // note: we could make the remainder calls faster, by repeatedly
+    // halving the SIMD width until we reach 1
+    for (; i < n; ++i) {
+        f(SIMDindex<1, N>{i});
+    }
+}
+
 template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
 void ParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept
 {
```

Src/Base/AMReX_Math.H

Lines changed: 40 additions & 0 deletions

```diff
@@ -5,6 +5,7 @@
 #include <AMReX_GpuQualifiers.H>
 #include <AMReX_Extension.H>
 #include <AMReX_INT.H>
+#include <AMReX_SIMD.H>
 #include <AMReX_REAL.H>
 #include <cmath>
 #include <cstdlib>
@@ -133,6 +134,24 @@ namespace detail {
 }
 
 //! Return sine and cosine of given number
+template<typename T_Real>
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+std::pair<T_Real,T_Real> sincos (T_Real x)
+{
+#ifdef AMREX_USE_SIMD
+    using namespace amrex::simd::stdx;
+#else
+    using namespace std;
+#endif
+
+    std::pair<T_Real,T_Real> r;
+    r.first = sin(x);
+    r.second = cos(x);
+    return r;
+}
+
+//! Return sine and cosine of given number
+template<>
 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
 std::pair<double,double> sincos (double x)
 {
@@ -147,6 +166,7 @@ std::pair<double,double> sincos (double x)
 }
 
 //! Return sine and cosine of given number
+template<>
 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
 std::pair<float,float> sincos (float x)
 {
@@ -161,6 +181,25 @@ std::pair<float,float> sincos (float x)
 }
 
 //! Return sin(pi*x) and cos(pi*x) given x
+template<typename T_Real>
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+std::pair<T_Real,T_Real> sincospi (T_Real x)
+{
+#ifdef AMREX_USE_SIMD
+    using namespace amrex::simd::stdx;
+#else
+    using namespace std;
+#endif
+
+    T_Real const px = pi<T_Real>() * x;
+    std::pair<T_Real,T_Real> r;
+    r.first = sin(px);
+    r.second = cos(px);
+    return r;
+}
+
+//! Return sin(pi*x) and cos(pi*x) given x
+template<>
 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
 std::pair<double,double> sincospi (double x)
 {
@@ -175,6 +214,7 @@ std::pair<double,double> sincospi (double x)
 }
 
 //! Return sin(pi*x) and cos(pi*x) given x
+template<>
 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
 std::pair<float,float> sincospi (float x)
 {
```

Src/Base/AMReX_SIMD.H

Lines changed: 155 additions & 0 deletions (new file)

````cpp
#ifndef AMREX_SIMD_H_
#define AMREX_SIMD_H_

#include <AMReX_Config.H>

#include <AMReX_REAL.H>

#ifdef AMREX_USE_SIMD
// TODO make SIMD provider configurable: VIR (C++17 TS2) or C++26 (later)
#   include <vir/simd.h> // includes SIMD TS2 header <experimental/simd>
#endif

#include <cstdint>
#include <type_traits>


namespace amrex::simd
{
// TODO make SIMD provider configurable: VIR (C++17 TS2) or C++26 (later)
//namespace stdx = std::experimental;
// for https://en.cppreference.com/w/cpp/experimental/simd/simd_cast.html
namespace stdx {
#ifdef AMREX_USE_SIMD
    using namespace std::experimental;
    using namespace std::experimental::__proposed;
    using namespace vir::stdx;
#else
    // fallback implementations for functions that are commonly used in portable code paths

    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    bool any_of (bool const v) { return v; }
#endif
}

// TODO: move to AMReX_REAL.H?

#ifdef AMREX_USE_SIMD
// TODO: not sure why std::experimental::simd_abi::native<T> does not work, so we use this long version
constexpr auto native_simd_size_real = std::experimental::native_simd<amrex::Real>::size();
constexpr auto native_simd_size_particlereal = std::experimental::native_simd<amrex::ParticleReal>::size();

// Note: to make use of not only vector registers but also ILP, user might want to use * 2 or more of the native size
// for selected compute kernels.
// TODO Check if a default with * 2 or similar is sensible.
template<int SIMD_WIDTH = native_simd_size_real>
using SIMDReal = std::experimental::fixed_size_simd<amrex::Real, SIMD_WIDTH>;

template<int SIMD_WIDTH = native_simd_size_particlereal>
using SIMDParticleReal = std::experimental::fixed_size_simd<amrex::ParticleReal, SIMD_WIDTH>;

// Type that has the same amount of IdCpu SIMD elements as the SIMDParticleReal type
template<typename T_ParticleReal = SIMDParticleReal<>>
using SIMDIdCpu = std::experimental::rebind_simd_t<std::uint64_t, T_ParticleReal>;
#else
constexpr auto native_simd_size_real = 1;
constexpr auto native_simd_size_particlereal = 1;

template<int SIMD_WIDTH = native_simd_size_real>
using SIMDReal = amrex::Real;

template<int SIMD_WIDTH = native_simd_size_particlereal>
using SIMDParticleReal = amrex::ParticleReal;

// Type that has the same amount of IdCpu SIMD elements as the SIMDParticleReal type
template<typename T_ParticleReal = SIMDParticleReal<>>
using SIMDIdCpu = std::uint64_t;
#endif

namespace detail {
    struct InternalVectorized {};
}

/** Mixin Helper Class
 *
 * Use this class as a mixin (base) class to simplify writing functors that support/do not support
 * ParallelForSIMD.
 *
 * Example:
 * ```c++
 * struct Compute : public Vectorized<>
 * {
 *     template<typename T_Real>
 *     AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
 *     void operator() (T_Real & AMREX_RESTRICT x) { ... }
 * };
 *
 * // ... call site below
 * {
 *     Compute c;
 *
 *     if constexpr (amrex::simd::is_vectorized<Compute>) {
 *         ParallelForSIMD<...>(np, c);
 *     } else {
 *         ParallelFor(np, c);
 *     }
 * }
 * ```
 */
template<int SIMD_WIDTH = native_simd_size_real>
struct
Vectorized : detail::InternalVectorized
{
    //! SIMD width in elements
    static constexpr int simd_width = SIMD_WIDTH;
};

/** Check if a Functor Class works with amrex::ParallelForSIMD
 *
 * @see amrex::simd::Vectorized
 */
template<typename T>
constexpr bool is_vectorized = std::is_base_of_v<detail::InternalVectorized, T>;

/** Check if a function argument is declared as non-const
 *
 * Use in conjunction with conditional write-back logic from vector registers, e.g.,
 *
 * ```c++
 * template<typename T_Real>
 * void compute (T_Real & AMREX_RESTRICT x,
 *               T_Real const & AMREX_RESTRICT y) { ... }
 *
 * part_x.copy_from(&m_part_x[i], stdx::element_aligned);
 * part_y.copy_from(&m_part_y[i], stdx::element_aligned);
 *
 * compute(part_x, part_y);
 *
 * if constexpr (is_nth_arg_non_const(compute<double>, 0))
 *     part_x.copy_to(&m_part_x[i], stdx::element_aligned);
 * if constexpr (is_nth_arg_non_const(compute<double>, 1))
 *     part_y.copy_to(&m_part_y[i], stdx::element_aligned);
 * ```
 */
template<typename R, typename... Args>
constexpr bool is_nth_arg_non_const (R(*)(Args...), int n)
{
    constexpr bool val_arr[sizeof...(Args)] {!std::is_const_v<std::remove_reference_t<Args>>...};
    return val_arr[n];
}
// same for functors (const/non-const ::operator() members)
template<typename C, typename R, typename... Args>
constexpr bool is_nth_arg_non_const (R(C::*)(Args...), int n)
{
    constexpr bool val_arr[sizeof...(Args)] {!std::is_const_v<std::remove_reference_t<Args>>...};
    return val_arr[n];
}
template<typename C, typename R, typename... Args>
constexpr bool is_nth_arg_non_const (R(C::*)(Args...) const, int n)
{
    constexpr bool val_arr[sizeof...(Args)] {!std::is_const_v<std::remove_reference_t<Args>>...};
    return val_arr[n];
}

} // namespace amrex::simd

#endif
````
