Commit 16be737

WIP
1 parent: 79762fa


include/xsimd/arch/xsimd_altivec.hpp

Lines changed: 34 additions & 43 deletions
@@ -790,121 +790,112 @@ namespace xsimd
         {
             return select(batch_bool<T, A> { Values... }, true_br, false_br, altivec {});
         }
-#if 0
 
         // shuffle
         template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3>
         XSIMD_INLINE batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3> mask, requires_arch<altivec>) noexcept
         {
-            constexpr uint32_t smask = detail::mod_shuffle(I0, I1, I2, I3);
-            // shuffle within lane
-            if (I0 < 4 && I1 < 4 && I2 >= 4 && I3 >= 4)
-                return _mm_shuffle_ps(x, y, smask);
-
-            // shuffle within opposite lane
-            if (I0 >= 4 && I1 >= 4 && I2 < 4 && I3 < 4)
-                return _mm_shuffle_ps(y, x, smask);
-            return shuffle(x, y, mask, common {});
+            return vec_perm(x, y,
+                            (__vector unsigned char) {
+                                4 * I0 + 0, 4 * I0 + 1, 4 * I0 + 2, 4 * I0 + 3,
+                                4 * I1 + 0, 4 * I1 + 1, 4 * I1 + 2, 4 * I1 + 3,
+                                4 * I2 + 0, 4 * I2 + 1, 4 * I2 + 2, 4 * I2 + 3,
+                                4 * I3 + 0, 4 * I3 + 1, 4 * I3 + 2, 4 * I3 + 3 });
         }
 
         template <class A, class ITy, ITy I0, ITy I1>
         XSIMD_INLINE batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<ITy, A, I0, I1> mask, requires_arch<altivec>) noexcept
         {
-            constexpr uint32_t smask = detail::mod_shuffle(I0, I1);
-            // shuffle within lane
-            if (I0 < 2 && I1 >= 2)
-                return _mm_shuffle_pd(x, y, smask);
-
-            // shuffle within opposite lane
-            if (I0 >= 2 && I1 < 2)
-                return _mm_shuffle_pd(y, x, smask);
-            return shuffle(x, y, mask, common {});
+            return vec_perm(x, y,
+                            (__vector unsigned char) {
+                                8 * I0 + 0, 8 * I0 + 1, 8 * I0 + 2, 8 * I0 + 3, 8 * I0 + 4, 8 * I0 + 5, 8 * I0 + 6, 8 * I0 + 7,
+                                8 * I1 + 0, 8 * I1 + 1, 8 * I1 + 2, 8 * I1 + 3, 8 * I1 + 4, 8 * I1 + 5, 8 * I1 + 6, 8 * I1 + 7,
+                            });
         }
-#endif
 
         // sqrt
         template <class A>
         XSIMD_INLINE batch<float, A> sqrt(batch<float, A> const& val, requires_arch<altivec>) noexcept
         {
-            return vec_sqrt(val.data);
+            return vec_sqrt(val.data);
         }
 
         template <class A>
         XSIMD_INLINE batch<double, A> sqrt(batch<double, A> const& val, requires_arch<altivec>) noexcept
         {
-            return vec_sqrt(val.data);
+            return vec_sqrt(val.data);
         }
 
         // slide_left
         template <size_t N, class A, class T>
         XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<altivec>) noexcept
         {
-            return (typename batch<T, A>::register_type)vec_sll((__vector unsigned char)x.data, vec_splats((uint32_t)N));
+            return (typename batch<T, A>::register_type)vec_sll((__vector unsigned char)x.data, vec_splats((uint32_t)N));
         }
 
        // slide_right
         template <size_t N, class A, class T>
         XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<altivec>) noexcept
         {
-            return (typename batch<T, A>::register_type)vec_srl((__vector unsigned char)x.data, vec_splats((uint32_t)N));
+            return (typename batch<T, A>::register_type)vec_srl((__vector unsigned char)x.data, vec_splats((uint32_t)N));
         }
 
         // sadd
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value && sizeof(T) != 8, void>::type>
         XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<altivec>) noexcept
         {
-            return vec_adds(self.data, other.data);
+            return vec_adds(self.data, other.data);
         }
 
         // set
         template <class A, class T, class... Values>
         XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<altivec>, Values... values) noexcept
         {
-            static_assert(sizeof...(Values) == batch<T, A>::size, "consistent init");
-            return typename batch<T, A>::register_type { values... };
+            static_assert(sizeof...(Values) == batch<T, A>::size, "consistent init");
+            return typename batch<T, A>::register_type { values... };
         }
 
         template <class A, class T, class... Values, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
         XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<altivec>, Values... values) noexcept
         {
-            static_assert(sizeof...(Values) == batch_bool<T, A>::size, "consistent init");
-            return typename batch_bool<T, A>::register_type { static_cast<decltype(std::declval<typename batch_bool<T, A>::register_type>()[0])>(values ? -1LL : 0LL)... };
+            static_assert(sizeof...(Values) == batch_bool<T, A>::size, "consistent init");
+            return typename batch_bool<T, A>::register_type { static_cast<decltype(std::declval<typename batch_bool<T, A>::register_type>()[0])>(values ? -1LL : 0LL)... };
         }
 
         // ssub
 
         template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value && sizeof(T) == 1, void>::type>
         XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<altivec>) noexcept
         {
-            return vec_subs(self.data, other.data);
+            return vec_subs(self.data, other.data);
         }
 
         // store_aligned
         template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
         XSIMD_INLINE void store_aligned(T* mem, batch<T, A> const& self, requires_arch<altivec>) noexcept
         {
-            return vec_st(self.data, 0, reinterpret_cast<typename batch<T, A>::register_type*>(mem));
+            return vec_st(self.data, 0, reinterpret_cast<typename batch<T, A>::register_type*>(mem));
         }
 
         // store_unaligned
         template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
         XSIMD_INLINE void store_unaligned(T* mem, batch<T, A> const& self, requires_arch<altivec>) noexcept
         {
-            auto tmp = vec_perm(*reinterpret_cast<const __vector unsigned char*>(&self.data), *reinterpret_cast<const __vector unsigned char*>(&self.data), vec_lvsr(0, (unsigned char*)mem));
-            vec_ste((__vector unsigned char)tmp, 0, (unsigned char*)mem);
-            vec_ste((__vector unsigned short)tmp, 1, (unsigned short*)mem);
-            vec_ste((__vector unsigned int)tmp, 3, (unsigned int*)mem);
-            vec_ste((__vector unsigned int)tmp, 4, (unsigned int*)mem);
-            vec_ste((__vector unsigned int)tmp, 8, (unsigned int*)mem);
-            vec_ste((__vector unsigned int)tmp, 12, (unsigned int*)mem);
-            vec_ste((__vector unsigned short)tmp, 14, (unsigned short*)mem);
+            auto tmp = vec_perm(*reinterpret_cast<const __vector unsigned char*>(&self.data), *reinterpret_cast<const __vector unsigned char*>(&self.data), vec_lvsr(0, (unsigned char*)mem));
+            vec_ste((__vector unsigned char)tmp, 0, (unsigned char*)mem);
+            vec_ste((__vector unsigned short)tmp, 1, (unsigned short*)mem);
+            vec_ste((__vector unsigned int)tmp, 3, (unsigned int*)mem);
+            vec_ste((__vector unsigned int)tmp, 4, (unsigned int*)mem);
+            vec_ste((__vector unsigned int)tmp, 8, (unsigned int*)mem);
+            vec_ste((__vector unsigned int)tmp, 12, (unsigned int*)mem);
+            vec_ste((__vector unsigned short)tmp, 14, (unsigned short*)mem);
         }
 
         // sub
         template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
         XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<altivec>) noexcept
         {
-            return vec_sub(self.data, other.data);
+            return vec_sub(self.data, other.data);
         }
 
 #if 0
@@ -1024,16 +1015,16 @@ namespace xsimd
         template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
         XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<altivec>) noexcept
         {
-            return vec_mergeh(self.data, other.data);
+            return vec_mergeh(self.data, other.data);
         }
 
         // zip_lo
         template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
         XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<altivec>) noexcept
         {
-            return vec_mergel(self.data, other.data);
+            return vec_mergel(self.data, other.data);
+        }
     }
 }
-}
 
 #endif
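
Note on the new shuffle implementations: vec_perm selects bytes out of the 32-byte concatenation of its two operands using the byte indices in its third argument, so a 32-bit lane index I expands to bytes 4 * I .. 4 * I + 3 (a 64-bit lane index to 8 * I .. 8 * I + 7), with the low lane indices picking from x and the high ones from y. The snippet below is a minimal, portable model of that byte selection for sanity-checking the index arithmetic; it is plain C++ rather than AltiVec code, perm_model and the sample lane indices (1, 2, 4, 7) are invented for illustration, and it deliberately ignores the big-endian byte numbering vec_perm assumes, which only matters for sub-element ordering, not for the whole-lane selection modelled here.

#include <array>
#include <cstdint>
#include <cstring>
#include <iostream>

// Model of vec_perm's byte selection: concatenate x and y into a 32-byte
// pool and pick the bytes named by idx (indices are taken modulo 32).
std::array<std::uint8_t, 16> perm_model(std::array<float, 4> const& x,
                                        std::array<float, 4> const& y,
                                        std::array<std::uint8_t, 16> const& idx)
{
    std::uint8_t pool[32];
    std::memcpy(pool, x.data(), 16);
    std::memcpy(pool + 16, y.data(), 16);
    std::array<std::uint8_t, 16> out {};
    for (int i = 0; i < 16; ++i)
        out[i] = pool[idx[i] & 31];
    return out;
}

int main()
{
    // Lane indices I0..I3: lanes 0-3 select from x, lanes 4-7 from y.
    constexpr std::uint8_t I0 = 1, I1 = 2, I2 = 4, I3 = 7;
    std::array<std::uint8_t, 16> idx = {
        4 * I0 + 0, 4 * I0 + 1, 4 * I0 + 2, 4 * I0 + 3,
        4 * I1 + 0, 4 * I1 + 1, 4 * I1 + 2, 4 * I1 + 3,
        4 * I2 + 0, 4 * I2 + 1, 4 * I2 + 2, 4 * I2 + 3,
        4 * I3 + 0, 4 * I3 + 1, 4 * I3 + 2, 4 * I3 + 3
    };
    std::array<float, 4> x = { 0.f, 1.f, 2.f, 3.f };
    std::array<float, 4> y = { 4.f, 5.f, 6.f, 7.f };

    std::array<float, 4> r;
    std::memcpy(r.data(), perm_model(x, y, idx).data(), 16);
    for (float v : r)
        std::cout << v << ' '; // prints: 1 2 4 7
    std::cout << '\n';
}

Built with any C++11 compiler, this prints 1 2 4 7, i.e. lanes x[1], x[2], y[0], y[3], which is what the generated permute vector should select.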
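
Note on store_unaligned (only re-indented by this commit): this looks like the classic AltiVec misaligned-store idiom, where vec_lvsr(0, mem) builds a permute mask that rotates the register right by the pointer's misalignment and each vec_ste then stores one naturally aligned element, the intent being that the element stores jointly touch all 16 destination bytes for any alignment. The sketch below is a host-only coverage check under my reading of vec_ste (effective address = pointer + offset, rounded down to the element's natural alignment); the stores table simply mirrors the offsets and element sizes in the diff, and the names stores/covered/mis are illustrative. Descriptions of this idiom in AltiVec programming material usually end with one extra single-byte store at offset 15, and this check does report byte 15 as unwritten for odd misalignments with only the seven stores shown, so that may be worth revisiting while the commit is still WIP.

#include <cstdio>

int main()
{
    // (offset, element size) pairs mirroring the vec_ste calls in the diff.
    const int stores[][2] = { { 0, 1 }, { 1, 2 }, { 3, 4 }, { 4, 4 },
                              { 8, 4 }, { 12, 4 }, { 14, 2 } };

    for (int mis = 0; mis < 16; ++mis) // pointer misalignment: mem % 16
    {
        bool covered[16] = {};
        for (auto const& s : stores)
        {
            // vec_ste rounds (mem + offset) down to the element's alignment
            // and writes one element of that size there.
            int ea = ((mis + s[0]) / s[1]) * s[1];
            for (int b = 0; b < s[1]; ++b)
            {
                int rel = ea + b - mis; // destination byte relative to mem
                if (rel >= 0 && rel < 16)
                    covered[rel] = true;
            }
        }
        std::printf("mis=%2d uncovered:", mis);
        for (int b = 0; b < 16; ++b)
            if (!covered[b])
                std::printf(" %d", b);
        std::printf("\n");
    }
}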
