Skip to content

Commit 222c647

Browse files
Faster reduce_add on avx
Split the 256-bit register into its two 128-bit halves, sum them, and forward the result to the SSE reduce_add.
1 parent bb5dd63 commit 222c647

File tree

1 file changed

+1
-19
lines changed

1 file changed

+1
-19
lines changed

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 1 addition & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1046,25 +1046,7 @@ namespace xsimd
10461046
}
10471047

10481048
// reduce_add
1049-
template <class A>
1050-
XSIMD_INLINE float reduce_add(batch<float, A> const& rhs, requires_arch<avx>) noexcept
1051-
{
1052-
// Returns the sum of the eight float lanes of rhs. Warning about _mm256_hadd_ps:
1053-
// _mm256_hadd_ps(a,b) gives
1054-
// (a0+a1,a2+a3,b0+b1,b2+b3,a4+a5,a6+a7,b4+b5,b6+b7): it only adds adjacent pairs
1055-
// inside each 128-bit half, so the two halves are folded together first.
1056-
// rhs = (x0, x1, x2, x3, x4, x5, x6, x7)
1057-
// tmp = (x4, x5, x6, x7, x0, x1, x2, x3), i.e. rhs with its 128-bit halves swapped
1058-
__m256 tmp = _mm256_permute2f128_ps(rhs, rhs, 1);
1059-
// tmp = (x4+x0, x5+x1, x6+x2, x7+x3, x0+x4, x1+x5, x2+x6, x3+x7)
1060-
tmp = _mm256_add_ps(rhs, tmp);
1061-
// tmp = (x4+x0+x5+x1, x6+x2+x7+x3, -, -, -, -, -, -); lanes marked '-' are not used
1062-
tmp = _mm256_hadd_ps(tmp, tmp);
1063-
// tmp = (x4+x0+x5+x1+x6+x2+x7+x3, -, -, -, -, -, -, -); the full sum is in lane 0
1064-
tmp = _mm256_hadd_ps(tmp, tmp);
1065-
return _mm_cvtss_f32(_mm256_extractf128_ps(tmp, 0)); // lowest float of the low 128-bit half
1066-
}
1067-
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, double>::value, void>::type>
1049+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, float>::value || std::is_same<T, double>::value, void>::type>
10681050
XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx>) noexcept
10691051
{
10701052
typename batch<T, sse4_2>::register_type low, high;

0 commit comments

Comments
 (0)