diff --git a/include/xsimd/arch/common/xsimd_common_arithmetic.hpp b/include/xsimd/arch/common/xsimd_common_arithmetic.hpp
index dba8c38ca..ebef263ee 100644
--- a/include/xsimd/arch/common/xsimd_common_arithmetic.hpp
+++ b/include/xsimd/arch/common/xsimd_common_arithmetic.hpp
@@ -127,6 +127,18 @@ namespace xsimd
             return { res_r, res_i };
         }
 
+        // fmas: generic fallback — x*y - z on even-indexed lanes, x*y + z on odd-indexed lanes.
+        template <class A, class T>
+        XSIMD_INLINE batch<T, A> fmas(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<common>) noexcept
+        {
+            struct even_lane
+            {
+                static constexpr bool get(unsigned const i, unsigned) noexcept { return (i & 1u) == 0; }
+            };
+            const auto mask = make_batch_bool_constant<T, even_lane, A>();
+            return fma(x, y, select(mask, neg(z), z));
+        }
+
         // hadd
         template <class A, class T /*, class = typename std::enable_if<std::is_scalar<T>::value, void>::type*/>
         XSIMD_INLINE T hadd(batch<T, A> const& self, requires_arch<common>) noexcept
diff --git a/include/xsimd/arch/common/xsimd_common_details.hpp b/include/xsimd/arch/common/xsimd_common_details.hpp
index 03088eec4..7103589b7 100644
--- a/include/xsimd/arch/common/xsimd_common_details.hpp
+++ b/include/xsimd/arch/common/xsimd_common_details.hpp
@@ -47,6 +47,8 @@ namespace xsimd
     template <class T, class A>
     XSIMD_INLINE batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept;
     template <class T, class A>
+    XSIMD_INLINE batch<T, A> fmas(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept;
+    template <class T, class A>
     XSIMD_INLINE batch<T, A> frexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& e) noexcept;
     template <class T, class A>
     XSIMD_INLINE batch<T, A> horner(const batch<T, A>& self) noexcept;
diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp
index 26947dffc..353dcdb12 100644
--- a/include/xsimd/arch/xsimd_avx512f.hpp
+++ b/include/xsimd/arch/xsimd_avx512f.hpp
@@ -902,6 +902,18 @@ namespace xsimd
         {
             return _mm512_fmsub_pd(x, y, z);
         }
+        // fmas
+        template <class A>
+        XSIMD_INLINE batch<float, A> fmas(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_fmaddsub_ps(x, y, z);
+        }
+
+        template <class A>
+        XSIMD_INLINE batch<double, A> fmas(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_fmaddsub_pd(x, y, z);
+        }
 
         // from bool
         template <class A, class T>
diff --git a/include/xsimd/arch/xsimd_fma3_avx.hpp b/include/xsimd/arch/xsimd_fma3_avx.hpp
index 992625314..efdb2fc8e 100644
--- a/include/xsimd/arch/xsimd_fma3_avx.hpp
+++ b/include/xsimd/arch/xsimd_fma3_avx.hpp
@@ -73,6 +73,19 @@ namespace xsimd
        {
            return _mm256_fmsub_pd(x, y, z);
        }
+       // fmas
+       template <class A>
+       XSIMD_INLINE batch<float, A> fmas(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
+       {
+           return _mm256_fmaddsub_ps(x, y, z);
+       }
+
+       template <class A>
+       XSIMD_INLINE batch<double, A> fmas(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
+       {
+           return _mm256_fmaddsub_pd(x, y, z);
+       }
+
     }
 
 }
diff --git a/include/xsimd/arch/xsimd_fma3_sse.hpp b/include/xsimd/arch/xsimd_fma3_sse.hpp
index 9b126166a..4e2a61827 100644
--- a/include/xsimd/arch/xsimd_fma3_sse.hpp
+++ b/include/xsimd/arch/xsimd_fma3_sse.hpp
@@ -71,6 +71,24 @@ namespace xsimd
        {
            return _mm_fmsub_pd(x, y, z);
        }
+       // fmas
+       template <class A>
+       XSIMD_INLINE batch<float, A> fmas(batch<float, A> const& x,
+                                         batch<float, A> const& y,
+                                         batch<float, A> const& z,
+                                         requires_arch<fma3<sse4_2>>) noexcept
+       {
+           return _mm_fmaddsub_ps(x, y, z);
+       }
+
+       template <class A>
+       XSIMD_INLINE batch<double, A> fmas(batch<double, A> const& x,
+                                          batch<double, A> const& y,
+                                          batch<double, A> const& z,
+                                          requires_arch<fma3<sse4_2>>) noexcept
+       {
+           return _mm_fmaddsub_pd(x, y, z);
+       }
 
     }
 
diff --git a/include/xsimd/arch/xsimd_fma4.hpp b/include/xsimd/arch/xsimd_fma4.hpp
index 423331230..33273687c 100644
--- a/include/xsimd/arch/xsimd_fma4.hpp
+++ b/include/xsimd/arch/xsimd_fma4.hpp
@@ -72,6 +72,19 @@ namespace xsimd
         {
             return _mm_msub_pd(x, y, z);
         }
+
+        // fmas
+        template <class A>
+        XSIMD_INLINE batch<float, A> fmas(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma4>) noexcept
+        {
+            return _mm_maddsub_ps(x, y, z);
+        }
+
+        template <class A>
+        XSIMD_INLINE batch<double, A> fmas(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma4>) noexcept
+        {
+            return _mm_maddsub_pd(x, y, z);
+        }
     }
 
 }
diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp
index 54ac836d2..9eaf4bbcd 100644
--- a/include/xsimd/types/xsimd_api.hpp
+++ b/include/xsimd/types/xsimd_api.hpp
@@ -991,6 +991,21 @@ namespace xsimd
         return kernel::fnms<A>(x, y, z, A {});
     }
 
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes x*y - z on even-indexed elements and x*y + z on odd-indexed elements, in a single instruction when possible.
+     * @param x a batch of integer or floating point values.
+     * @param y a batch of integer or floating point values.
+     * @param z a batch of integer or floating point values.
+     * @return a batch where each even-indexed element is computed as x * y - z and each odd-indexed element as x * y + z
+     */
+    template <class T, class A>
+    XSIMD_INLINE batch<T, A> fmas(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::fmas<A>(x, y, z, A {});
+    }
     /**
      * @ingroup batch_fp
      *
diff --git a/test/test_batch.cpp b/test/test_batch.cpp
index 05c13b4b8..e0d310b5f 100644
--- a/test/test_batch.cpp
+++ b/test/test_batch.cpp
@@ -711,6 +711,20 @@ struct batch_test
             INFO("fnms");
             CHECK_BATCH_EQ(res, expected);
         }
+        // fmas
+        {
+            array_type expected;
+            for (std::size_t i = 0; i < expected.size(); ++i)
+            {
+                // even lanes: x*y - z, odd lanes: x*y + z
+                expected[i] = (i & 1u) == 0
+                    ? lhs[i] * rhs[i] - rhs[i]
+                    : lhs[i] * rhs[i] + rhs[i];
+            }
+            batch_type res = fmas(batch_lhs(), batch_rhs(), batch_rhs());
+            INFO("fmas");
+            CHECK_BATCH_EQ(res, expected);
+        }
     }
 
     void test_abs() const