Skip to content

Commit fb25021

Browse files
Faster reduce_add on avx
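Split the AVX register into its two 128-bit halves, add them, and forward the sum to the SSE reduce_add. As a side effect, the generic AVX reduce_add now performs a single SSE reduction instead of two, and the dedicated float overload is no longer needed.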
1 parent bb5dd63 commit fb25021

File tree

1 file changed

+2
-20
lines changed

1 file changed

+2
-20
lines changed

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 2 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1046,31 +1046,13 @@ namespace xsimd
10461046
}
10471047

10481048
// reduce_add
1049-
template <class A>
1050-
XSIMD_INLINE float reduce_add(batch<float, A> const& rhs, requires_arch<avx>) noexcept
1051-
{
1052-
// Warning about _mm256_hadd_ps:
1053-
// _mm256_hadd_ps(a,b) gives
1054-
// (a0+a1,a2+a3,b0+b1,b2+b3,a4+a5,a6+a7,b4+b5,b6+b7). Hence we can't
1055-
// rely on a naive use of this method
1056-
// rhs = (x0, x1, x2, x3, x4, x5, x6, x7)
1057-
// tmp = (x4, x5, x6, x7, x0, x1, x2, x3)
1058-
__m256 tmp = _mm256_permute2f128_ps(rhs, rhs, 1);
1059-
// tmp = (x4+x0, x5+x1, x6+x2, x7+x3, x0+x4, x1+x5, x2+x6, x3+x7)
1060-
tmp = _mm256_add_ps(rhs, tmp);
1061-
// tmp = (x4+x0+x5+x1, x6+x2+x7+x3, -, -, -, -, -, -)
1062-
tmp = _mm256_hadd_ps(tmp, tmp);
1063-
// tmp = (x4+x0+x5+x1+x6+x2+x7+x3, -, -, -, -, -, -, -)
1064-
tmp = _mm256_hadd_ps(tmp, tmp);
1065-
return _mm_cvtss_f32(_mm256_extractf128_ps(tmp, 0));
1066-
}
1067-
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, double>::value, void>::type>
1049+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, float>::value || std::is_same<T, double>::value, void>::type>
10681050
XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx>) noexcept
10691051
{
10701052
typename batch<T, sse4_2>::register_type low, high;
10711053
detail::split_avx(self, low, high);
10721054
batch<T, sse4_2> blow(low), bhigh(high);
1073-
return reduce_add(blow) + reduce_add(bhigh);
1055+
return reduce_add(blow + bhigh);
10741056
}
10751057

10761058
// reduce_max

0 commit comments

Comments
 (0)