@@ -912,28 +912,65 @@ namespace xsimd
             return vec_sub(self.data, other.data);
         }

-#if 0
         // swizzle

         template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
         XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<altivec>) noexcept
         {
-            constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3);
-            return _mm_shuffle_ps(self, self, index);
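+            // vec_perm selects bytes from the concatenation of its two operands using the
+            // byte-index vector; with both operands set to self, bytes 4 * Vi .. 4 * Vi + 3
+            // pick out 32-bit lane Vi.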
+            return vec_perm(self.data, self.data,
+                            (__vector unsigned char) {
+                                4 * V0 + 0, 4 * V0 + 1, 4 * V0 + 2, 4 * V0 + 3,
+                                4 * V1 + 0, 4 * V1 + 1, 4 * V1 + 2, 4 * V1 + 3,
+                                4 * V2 + 0, 4 * V2 + 1, 4 * V2 + 2, 4 * V2 + 3,
+                                4 * V3 + 0, 4 * V3 + 1, 4 * V3 + 2, 4 * V3 + 3 });
         }

         template <class A, uint64_t V0, uint64_t V1>
         XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<altivec>) noexcept
         {
-            constexpr uint32_t index = detail::shuffle(V0, V1);
-            return _mm_shuffle_pd(self, self, index);
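+            // Same byte-permute approach as the float swizzle: each 64-bit lane Vi covers
+            // bytes 8 * Vi .. 8 * Vi + 7 of self.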
+            return vec_perm(self.data, self.data,
+                            (__vector unsigned char) {
+                                8 * V0 + 0, 8 * V0 + 1, 8 * V0 + 2, 8 * V0 + 3,
+                                8 * V0 + 4, 8 * V0 + 5, 8 * V0 + 6, 8 * V0 + 7,
+                                8 * V1 + 0, 8 * V1 + 1, 8 * V1 + 2, 8 * V1 + 3,
+                                8 * V1 + 4, 8 * V1 + 5, 8 * V1 + 6, 8 * V1 + 7 });
         }

         template <class A, uint64_t V0, uint64_t V1>
         XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<altivec>) noexcept
         {
-            constexpr uint32_t index = detail::shuffle(2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1);
-            return _mm_shuffle_epi32(self, index);
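+            // Identical byte mask to the double swizzle; only the element type differs.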
+            return vec_perm(self.data, self.data,
+                            (__vector unsigned char) {
+                                8 * V0 + 0, 8 * V0 + 1, 8 * V0 + 2, 8 * V0 + 3,
+                                8 * V0 + 4, 8 * V0 + 5, 8 * V0 + 6, 8 * V0 + 7,
+                                8 * V1 + 0, 8 * V1 + 1, 8 * V1 + 2, 8 * V1 + 3,
+                                8 * V1 + 4, 8 * V1 + 5, 8 * V1 + 6, 8 * V1 + 7 });
         }

         template <class A, uint64_t V0, uint64_t V1>
@@ -945,8 +982,12 @@ namespace xsimd
         template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
         XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<altivec>) noexcept
        {
-            constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3);
-            return _mm_shuffle_epi32(self, index);
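+            // Same mask construction as the float swizzle: 4 bytes per 32-bit lane.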
+            return vec_perm(self.data, self.data,
+                            (__vector unsigned char) {
+                                4 * V0 + 0, 4 * V0 + 1, 4 * V0 + 2, 4 * V0 + 3,
+                                4 * V1 + 0, 4 * V1 + 1, 4 * V1 + 2, 4 * V1 + 3,
+                                4 * V2 + 0, 4 * V2 + 1, 4 * V2 + 2, 4 * V2 + 3,
+                                4 * V3 + 0, 4 * V3 + 1, 4 * V3 + 2, 4 * V3 + 3 });
         }

         template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
@@ -958,20 +999,12 @@ namespace xsimd
         template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
         XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<altivec>) noexcept
         {
-            // permute within each lane
-            constexpr auto mask_lo = detail::mod_shuffle(V0, V1, V2, V3);
-            constexpr auto mask_hi = detail::mod_shuffle(V4, V5, V6, V7);
-            __m128i lo = _mm_shufflelo_epi16(self, mask_lo);
-            __m128i hi = _mm_shufflehi_epi16(self, mask_hi);
-
-            __m128i lo_lo = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(lo), _mm_castsi128_pd(lo), _MM_SHUFFLE2(0, 0)));
-            __m128i hi_hi = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(hi), _mm_castsi128_pd(hi), _MM_SHUFFLE2(1, 1)));
-
-            // mask to choose the right lane
-            batch_bool_constant<uint16_t, A, (V0 < 4), (V1 < 4), (V2 < 4), (V3 < 4), (V4 < 4), (V5 < 4), (V6 < 4), (V7 < 4)> blend_mask;
-
-            // blend the two permutes
-            return select(blend_mask, batch<uint16_t, A>(lo_lo), batch<uint16_t, A>(hi_hi));
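+            // A single vec_perm replaces the SSE lo/hi shuffle-and-blend sequence:
+            // each 16-bit lane Vi is bytes 2 * Vi and 2 * Vi + 1 of self.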
+            return vec_perm(self.data, self.data,
+                            (__vector unsigned char) {
+                                2 * V0 + 0, 2 * V0 + 1, 2 * V1 + 0, 2 * V1 + 1,
+                                2 * V2 + 0, 2 * V2 + 1, 2 * V3 + 0, 2 * V3 + 1,
+                                2 * V4 + 0, 2 * V4 + 1, 2 * V5 + 0, 2 * V5 + 1,
+                                2 * V6 + 0, 2 * V6 + 1, 2 * V7 + 0, 2 * V7 + 1 });
         }

         template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
@@ -980,6 +1013,8 @@ namespace xsimd
             return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, altivec {}));
         }

+#if 0
+
         // transpose
         template <class A>
         XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<altivec>) noexcept