xtensor-stack · serge-sans-paille · Jul 2, 2025 · Jul 2, 2025 · serge-sans-paille · Jul 2, 2025
diff --git a/include/xsimd/arch/common/xsimd_common_arithmetic.hpp b/include/xsimd/arch/common/xsimd_common_arithmetic.hpp
@@ -127,6 +127,18 @@ namespace xsimd
             return { res_r, res_i };
         }
 
+        // fmas: generic fallback - flip the sign of z on even lanes, then one fma yields x*y - z (even) / x*y + z (odd)
+        template <class A, class T>
+        XSIMD_INLINE batch<T, A> fmas(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<common>) noexcept
+        {
+            struct is_even_lane
+            {
+                static constexpr bool get(unsigned const lane, unsigned) noexcept { return (lane % 2u) == 0u; }
+            };
+            const auto signed_z = select(make_batch_bool_constant<T, is_even_lane, A>(), neg(z), z);
+            return fma(x, y, signed_z);
+        }
+
         // hadd
         template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
         XSIMD_INLINE T hadd(batch<T, A> const& self, requires_arch<common>) noexcept

diff --git a/include/xsimd/arch/common/xsimd_common_details.hpp b/include/xsimd/arch/common/xsimd_common_details.hpp
@@ -47,6 +47,8 @@ namespace xsimd
     template <class T, class A>
     XSIMD_INLINE batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept;
     template <class T, class A>
+    XSIMD_INLINE batch<T, A> fmas(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept;
+    template <class T, class A>
     XSIMD_INLINE batch<T, A> frexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& e) noexcept;
     template <class T, class A, uint64_t... Coefs>
     XSIMD_INLINE batch<T, A> horner(const batch<T, A>& self) noexcept;

diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp
@@ -902,6 +902,18 @@ namespace xsimd
         {
             return _mm512_fmsub_pd(x, y, z);
         }
+        // fmas: even lanes x*y - z, odd lanes x*y + z, forwarded to the 512-bit fmaddsub intrinsics
+        template <class A>
+        XSIMD_INLINE batch<float, A> fmas(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_fmaddsub_ps(x, y, z);
+        }
+
+        template <class A>
+        XSIMD_INLINE batch<double, A> fmas(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_fmaddsub_pd(x, y, z);
+        }
 
         // from bool
         template <class A, class T>

diff --git a/include/xsimd/arch/xsimd_fma3_avx.hpp b/include/xsimd/arch/xsimd_fma3_avx.hpp
@@ -73,6 +73,19 @@ namespace xsimd
             return _mm256_fmsub_pd(x, y, z);
         }
 
+        // fmas: even lanes x*y - z, odd lanes x*y + z, forwarded to the 256-bit fmaddsub intrinsics
+        template <class A>
+        XSIMD_INLINE batch<float, A> fmas(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
+        {
+            return _mm256_fmaddsub_ps(x, y, z);
+        }
+
+        template <class A>
+        XSIMD_INLINE batch<double, A> fmas(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
+        {
+            return _mm256_fmaddsub_pd(x, y, z);
+        }
+
     }
 
 }

diff --git a/include/xsimd/arch/xsimd_fma3_sse.hpp b/include/xsimd/arch/xsimd_fma3_sse.hpp
@@ -71,6 +71,24 @@ namespace xsimd
         {
             return _mm_fmsub_pd(x, y, z);
         }
+        // fmas: even lanes x*y - z, odd lanes x*y + z, forwarded to the 128-bit fmaddsub intrinsics
+        template <class A>
+        XSIMD_INLINE batch<float, A> fmas(batch<float, A> const& x,
+                                          batch<float, A> const& y,
+                                          batch<float, A> const& z,
+                                          requires_arch<fma3<sse4_2>>) noexcept
+        {
+            return _mm_fmaddsub_ps(x, y, z);
+        }
+
+        template <class A>
+        XSIMD_INLINE batch<double, A> fmas(batch<double, A> const& x,
+                                           batch<double, A> const& y,
+                                           batch<double, A> const& z,
+                                           requires_arch<fma3<sse4_2>>) noexcept
+        {
+            return _mm_fmaddsub_pd(x, y, z);
+        }
 
     }
 

diff --git a/include/xsimd/arch/xsimd_fma4.hpp b/include/xsimd/arch/xsimd_fma4.hpp
@@ -72,6 +72,19 @@ namespace xsimd
         {
             return _mm_msub_pd(x, y, z);
         }
+
+        // fmas: even lanes x*y - z, odd lanes x*y + z, forwarded to the FMA4 maddsub intrinsics
+        template <class A>
+        XSIMD_INLINE batch<float, A> fmas(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma4>) noexcept
+        {
+            return _mm_maddsub_ps(x, y, z);
+        }
+
+        template <class A>
+        XSIMD_INLINE batch<double, A> fmas(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma4>) noexcept
+        {
+            return _mm_maddsub_pd(x, y, z);
+        }
     }
 
 }

diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp
@@ -991,6 +991,21 @@ namespace xsimd
         return kernel::fnms<A>(x, y, z, A {});
     }
 
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes <tt>x*y - z</tt> on even lanes and <tt>x*y + z</tt> on odd lanes, in a single instruction when possible.
+     * @param x a batch of integer or floating point values.
+     * @param y a batch of integer or floating point values.
+     * @param z a batch of integer or floating point values.
+     * @return  a batch where each even-indexed element is computed as <tt>x * y - z</tt> and each odd-indexed element as <tt>x * y + z</tt>
+     */
+    template <class T, class A>
+    XSIMD_INLINE batch<T, A> fmas(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::fmas<A>(x, y, z, A {});
+    }
     /**
      * @ingroup batch_fp
      *

diff --git a/test/test_batch.cpp b/test/test_batch.cpp
@@ -711,6 +711,20 @@ struct batch_test
             INFO("fnms");
             CHECK_BATCH_EQ(res, expected);
         }
+        // fmas (y and z are both batch_rhs, hence rhs[i] appears twice in the expected value)
+        {
+            array_type expected;
+            for (std::size_t i = 0; i < expected.size(); ++i)
+            {
+                // even lanes: x*y - z, odd lanes: x*y + z
+                expected[i] = (i & 1u) == 0
+                    ? lhs[i] * rhs[i] - rhs[i]
+                    : lhs[i] * rhs[i] + rhs[i];
+            }
+            batch_type res = fmas(batch_lhs(), batch_rhs(), batch_rhs());
+            INFO("fmas");
+            CHECK_BATCH_EQ(res, expected);
+        }
     }
 
     void test_abs() const