Do not use hadd_pd to implement reduce_add

serge-sans-paille · serge-sans-paille · commit 634b18f1f527 · 2025-04-17T14:24:57.000+02:00
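The SSE3 implementation of reduce_add for double, built on hadd_pd, is generally slower than the SSE2 version because hadd_pd has a latency of 5 (!). Remove it so that the SSE2 version is used instead. Related to #1107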
diff --git a/include/xsimd/arch/xsimd_sse3.hpp b/include/xsimd/arch/xsimd_sse3.hpp
@@ -50,12 +50,6 @@ namespace xsimd
             __m128 tmp1 = _mm_hadd_ps(tmp0, tmp0);
             return _mm_cvtss_f32(tmp1);
         }
-        template <class A>
-        XSIMD_INLINE double reduce_add(batch<double, A> const& self, requires_arch<sse3>) noexcept
-        {
-            __m128d tmp0 = _mm_hadd_pd(self, self);
-            return _mm_cvtsd_f64(tmp0);
-        }
 
     }
 

Original file line number	Diff line number	Diff line change
`@@ -50,12 +50,6 @@ namespace xsimd`
`50`	`50`	`__m128 tmp1 = _mm_hadd_ps(tmp0, tmp0);`
`51`	`51`	`return _mm_cvtss_f32(tmp1);`
`52`	`52`	`}`
`53`		`- template <class A>`
`54`		`- XSIMD_INLINE double reduce_add(batch<double, A> const& self, requires_arch<sse3>) noexcept`
`55`		`- {`
`56`		`- __m128d tmp0 = _mm_hadd_pd(self, self);`
`57`		`- return _mm_cvtsd_f64(tmp0);`
`58`		`- }`
`59`	`53`
`60`	`54`	`}`
`61`	`55`