@@ -1064,22 +1064,10 @@ namespace xsimd
10641064 tmp = _mm256_hadd_ps (tmp, tmp);
10651065 return _mm_cvtss_f32 (_mm256_extractf128_ps (tmp, 0 ));
10661066 }
1067- template <class A >
1068- XSIMD_INLINE double reduce_add (batch<double , A> const & rhs, requires_arch<avx>) noexcept
1069- {
1070- // Horizontal sum of the four doubles held in rhs = (x0, x1, x2, x3); returns x0+x1+x2+x3.
1071- // Swap the two 128-bit halves of rhs: tmp = (x2, x3, x0, x1)
1072- __m256d tmp = _mm256_permute2f128_pd (rhs, rhs, 1 );
1073- // Element-wise add of rhs and its swapped copy; only the low half is used below: tmp = (x2+x0, x3+x1, -, -)
1074- tmp = _mm256_add_pd (rhs, tmp);
1075- // Horizontal add of adjacent pairs leaves the full sum in element 0: tmp = (x2+x0+x3+x1, -, -, -)
1076- tmp = _mm256_hadd_pd (tmp, tmp);
1077- return _mm_cvtsd_f64 (_mm256_extractf128_pd (tmp, 0 )); // take the low 128 bits, then return their lowest double
1078- }
1079- template <class A , class T , class = typename std::enable_if<std::is_integral<T>::value, void >::type>
1067+ template <class A , class T , class = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, double >::value, void >::type>
10801068 XSIMD_INLINE T reduce_add (batch<T, A> const & self, requires_arch<avx>) noexcept
10811069 {
1082- __m128i low, high;
1070+ typename batch<T, sse4_2>::register_type low, high;
10831071 detail::split_avx (self, low, high);
10841072 batch<T, sse4_2> blow (low), bhigh (high);
10851073 return reduce_add (blow) + reduce_add (bhigh);
0 commit comments