@@ -414,23 +414,23 @@ namespace xsimd
414414 return vec_cmpgt (self, other);
415415 }
416416
417- #if 0
418-
419417 // haddp
420418 template <class A >
421419 XSIMD_INLINE batch<float , A> haddp (batch<float , A> const * row, requires_arch<altivec>) noexcept
422420 {
423- __m128 tmp0 = _mm_unpacklo_ps(row[0], row[1]);
424- __m128 tmp1 = _mm_unpackhi_ps(row[0], row[1]);
425- __m128 tmp2 = _mm_unpackhi_ps(row[2], row[3]);
426- tmp0 = _mm_add_ps(tmp0, tmp1);
427- tmp1 = _mm_unpacklo_ps(row[2], row[3]);
428- tmp1 = _mm_add_ps(tmp1, tmp2);
429- tmp2 = _mm_movehl_ps(tmp1, tmp0);
430- tmp0 = _mm_movelh_ps(tmp0, tmp1);
431- return _mm_add_ps(tmp0, tmp2);
421+ auto tmp0 = vec_mergee (row[0 ], row[1 ]); // v00 v10 v02 v12
422+ auto tmp1 = vec_mergeo (row[0 ], row[1 ]); // v01 v11 v03 v13
423+ auto tmp4 = vec_add (tmp0, tmp1); // (v00 + v01, v10 + v11, v02 + v03, v12 + v13)
424+
425+ auto tmp2 = vec_mergee (row[2 ], row[3 ]); // v20 v30 v22 v32
426+ auto tmp3 = vec_mergeo (row[2 ], row[3 ]); // v21 v31 v23 v33
427+ auto tmp5 = vec_add (tmp0, tmp1); // (v20 + v21, v30 + v31, v22 + v23, v32 + v33)
428+
429+ auto tmp6 = vec_permi (tmp4, tmp5, 0x0 ); // (v00 + v01, v10 + v11, v20 + v21, v30 + v31
430+ auto tmp7 = vec_permi (tmp4, tmp5, 0x3 ); // (v02 + v03, v12 + v13, v12 + v13, v32 + v33)
431+
432+ return vec_add (tmp6, tmp7);
432433 }
433- #endif
434434
435435 // incr_if
436436 template <class A , class T , class = typename std::enable_if<std::is_integral<T>::value, void >::type>
0 commit comments