Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions include/xsimd/arch/common/xsimd_common_arithmetic.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,18 @@ namespace xsimd
return { res_r, res_i };
}

// fmas
// Generic fallback for the fused multiply with alternating subtract/add.
// It negates z on the even lanes only and then issues a regular fma, so the
// intended per-lane result is x * y - z on even lanes and x * y + z on odd lanes.
template <class A, class T>
XSIMD_INLINE batch<T, A> fmas(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<common>) noexcept
{
// Compile-time lane predicate: true when the lane index i is even.
struct even_lane
{
static constexpr bool get(unsigned const i, unsigned) noexcept { return (i & 1u) == 0; }
};
// Boolean batch constant generated from even_lane, one flag per lane.
const auto mask = make_batch_bool_constant<T, even_lane, A>();
// Where the mask is set use -z, elsewhere z; a single fma then covers both the subtract and the add lanes.
return fma(x, y, select(mask, neg(z), z));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is select a better option compared to a multiply?

Copy link
Contributor Author

@DiamonDinoia DiamonDinoia Jul 2, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think multiply is overly complicated. If we extend batch_constant to double (remove bitwise operators) then it might be worth it.

struct imag_neg {
        static constexpr int get(const unsigned i, unsigned) noexcept { return (i & 1u) ? 1.0 : -1.0; }
    };

    // Generator first, then arch
    const auto mask = xsimd::batch_cast<double>(xsimd::make_batch_constant<xsimd::as_integer_t<double>, imag_neg, arch>().as_batch());

Also, I do not see a big difference on my machine:
image

(swizzle/multiply vs swizzle/select)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks for going all the way down to the experiment!

}

// hadd
template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
XSIMD_INLINE T hadd(batch<T, A> const& self, requires_arch<common>) noexcept
Expand Down
2 changes: 2 additions & 0 deletions include/xsimd/arch/common/xsimd_common_details.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ namespace xsimd
template <class T, class A>
XSIMD_INLINE batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept;
template <class T, class A>
XSIMD_INLINE batch<T, A> fmas(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept;
template <class T, class A>
XSIMD_INLINE batch<T, A> frexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& e) noexcept;
template <class T, class A, uint64_t... Coefs>
XSIMD_INLINE batch<T, A> horner(const batch<T, A>& self) noexcept;
Expand Down
12 changes: 12 additions & 0 deletions include/xsimd/arch/xsimd_avx512f.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -902,6 +902,18 @@ namespace xsimd
{
return _mm512_fmsub_pd(x, y, z);
}
// fmas
// Single-precision AVX-512F kernel: forwards directly to _mm512_fmaddsub_ps.
// Intended result: even lanes x * y - z, odd lanes x * y + z.
template <class A>
XSIMD_INLINE batch<float, A> fmas(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
{
return _mm512_fmaddsub_ps(x, y, z);
}

// Double-precision AVX-512F kernel: forwards directly to _mm512_fmaddsub_pd.
// Intended result: even lanes x * y - z, odd lanes x * y + z.
template <class A>
XSIMD_INLINE batch<double, A> fmas(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
{
return _mm512_fmaddsub_pd(x, y, z);
}

// from bool
template <class A, class T>
Expand Down
13 changes: 13 additions & 0 deletions include/xsimd/arch/xsimd_fma3_avx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,19 @@ namespace xsimd
return _mm256_fmsub_pd(x, y, z);
}

// fmas
// Single-precision FMA3-on-AVX kernel: forwards directly to _mm256_fmaddsub_ps.
// Intended result: even lanes x * y - z, odd lanes x * y + z.
template <class A>
XSIMD_INLINE batch<float, A> fmas(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
{
return _mm256_fmaddsub_ps(x, y, z);
}

// Double-precision FMA3-on-AVX kernel: forwards directly to _mm256_fmaddsub_pd.
// Intended result: even lanes x * y - z, odd lanes x * y + z.
template <class A>
XSIMD_INLINE batch<double, A> fmas(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
{
return _mm256_fmaddsub_pd(x, y, z);
}

}

}
Expand Down
18 changes: 18 additions & 0 deletions include/xsimd/arch/xsimd_fma3_sse.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,24 @@ namespace xsimd
{
return _mm_fmsub_pd(x, y, z);
}
// fmas
// Single-precision FMA3-on-SSE4.2 kernel: forwards directly to _mm_fmaddsub_ps.
// Intended result: even lanes x * y - z, odd lanes x * y + z.
template <class A>
XSIMD_INLINE batch<float, A> fmas(batch<float, A> const& x,
batch<float, A> const& y,
batch<float, A> const& z,
requires_arch<fma3<sse4_2>>) noexcept
{
return _mm_fmaddsub_ps(x, y, z);
}

// Double-precision FMA3-on-SSE4.2 kernel: forwards directly to _mm_fmaddsub_pd.
// Intended result: even lanes x * y - z, odd lanes x * y + z.
template <class A>
XSIMD_INLINE batch<double, A> fmas(batch<double, A> const& x,
batch<double, A> const& y,
batch<double, A> const& z,
requires_arch<fma3<sse4_2>>) noexcept
{
return _mm_fmaddsub_pd(x, y, z);
}

}

Expand Down
13 changes: 13 additions & 0 deletions include/xsimd/arch/xsimd_fma4.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,19 @@ namespace xsimd
{
return _mm_msub_pd(x, y, z);
}

// fmas
// Single-precision FMA4 kernel: forwards directly to _mm_maddsub_ps.
// Intended result: even lanes x * y - z, odd lanes x * y + z.
template <class A>
XSIMD_INLINE batch<float, A> fmas(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma4>) noexcept
{
return _mm_maddsub_ps(x, y, z);
}

// Double-precision FMA4 kernel: forwards directly to _mm_maddsub_pd.
// Intended result: even lanes x * y - z, odd lanes x * y + z.
template <class A>
XSIMD_INLINE batch<double, A> fmas(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma4>) noexcept
{
return _mm_maddsub_pd(x, y, z);
}
}

}
Expand Down
15 changes: 15 additions & 0 deletions include/xsimd/types/xsimd_api.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -991,6 +991,21 @@ namespace xsimd
return kernel::fnms<A>(x, y, z, A {});
}

/**
* @ingroup batch_arithmetic
*
* Computes <tt>x*y - z</tt> on even-indexed lanes and <tt>x*y + z</tt> on
* odd-indexed lanes, in a single instruction when possible.
* @param x a batch of integer or floating point values.
* @param y a batch of integer or floating point values.
* @param z a batch of integer or floating point values.
* @return a batch where each even-indexed element is computed as <tt>x * y - z</tt> and each odd-indexed element as <tt>x * y + z</tt>
*/
template <class T, class A>
XSIMD_INLINE batch<T, A> fmas(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept
{
detail::static_check_supported_config<T, A>();
// Tag dispatch: the A {} argument selects the kernel overload for architecture A.
return kernel::fmas<A>(x, y, z, A {});
}
/**
* @ingroup batch_fp
*
Expand Down
14 changes: 14 additions & 0 deletions test/test_batch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -711,6 +711,20 @@ struct batch_test
INFO("fnms");
CHECK_BATCH_EQ(res, expected);
}
// fmas
{
array_type expected;
for (std::size_t i = 0; i < expected.size(); ++i)
{
// even lanes: x*y - z, odd lanes: x*y + z
expected[i] = (i & 1u) == 0
? lhs[i] * rhs[i] - rhs[i]
: lhs[i] * rhs[i] + rhs[i];
}
batch_type res = fmas(batch_lhs(), batch_rhs(), batch_rhs());
INFO("fmas");
CHECK_BATCH_EQ(res, expected);
}
}

void test_abs() const
Expand Down