Skip to content

Commit ee19a39

Browse files
Do not use _mm256_hadd_pd to implement reduce_add on avx
Forwarding to sse is actually faster. Related to #1107
1 parent 1200f52 commit ee19a39

File tree

1 file changed

+2
-14
lines changed

1 file changed

+2
-14
lines changed

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 2 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -1064,22 +1064,10 @@ namespace xsimd
10641064
tmp = _mm256_hadd_ps(tmp, tmp);
10651065
return _mm_cvtss_f32(_mm256_extractf128_ps(tmp, 0));
10661066
}
1067-
template <class A>
1068-
XSIMD_INLINE double reduce_add(batch<double, A> const& rhs, requires_arch<avx>) noexcept
1069-
{
1070-
// rhs = (x0, x1, x2, x3)
1071-
// tmp = (x2, x3, x0, x1)
1072-
__m256d tmp = _mm256_permute2f128_pd(rhs, rhs, 1);
1073-
// tmp = (x2+x0, x3+x1, -, -)
1074-
tmp = _mm256_add_pd(rhs, tmp);
1075-
// tmp = (x2+x0+x3+x1, -, -, -)
1076-
tmp = _mm256_hadd_pd(tmp, tmp);
1077-
return _mm_cvtsd_f64(_mm256_extractf128_pd(tmp, 0));
1078-
}
1079-
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1067+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, double>::value, void>::type>
10801068
XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx>) noexcept
10811069
{
1082-
__m128i low, high;
1070+
typename batch<T, sse4_2>::register_type low, high;
10831071
detail::split_avx(self, low, high);
10841072
batch<T, sse4_2> blow(low), bhigh(high);
10851073
return reduce_add(blow) + reduce_add(bhigh);

0 commit comments

Comments (0)