Skip to content

Commit a5ba45a

Browse files
WIP
1 parent e482d80 commit a5ba45a

File tree

1 file changed

+58
-23
lines changed

1 file changed

+58
-23
lines changed

include/xsimd/arch/xsimd_altivec.hpp

Lines changed: 58 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -912,28 +912,65 @@ namespace xsimd
912912
return vec_sub(self.data, other.data);
913913
}
914914

915-
#if 0
916915
// swizzle
917916

918917
// swizzle, batch<float> overload for the altivec arch: reorders the elements
// of `self` according to the compile-time indices V0..V3 carried by the
// batch_constant argument.
// In this diff view, gutter entries ending in `-` are the removed
// _mm_shuffle_ps-based body; entries ending in `+` are the replacement, which
// calls vec_perm with self.data as both sources and a 16-entry byte control
// vector whose i-th group of four entries is 4*Vi+0 .. 4*Vi+3.
// NOTE(review): how these byte indices map to lanes on little-endian targets
// is not visible here — confirm against the vec_perm documentation.
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
919918
XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<altivec>) noexcept
920919
{
921-
constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3);
922-
return _mm_shuffle_ps(self, self, index);
920+
return vec_perm(self.data, self.data,
921+
(__vector unsigned char) {
922+
4 * V0 + 0, 4 * V0 + 1, 4 * V0 + 2, 4 * V0 + 3,
923+
4 * V1 + 0, 4 * V1 + 1, 4 * V1 + 2, 4 * V1 + 3,
924+
4 * V2 + 0, 4 * V2 + 1, 4 * V2 + 2, 4 * V2 + 3,
925+
4 * V3 + 0, 4 * V3 + 1, 4 * V3 + 2, 4 * V3 + 3 });
923926
}
924927

925928
// swizzle, batch<double> overload for the altivec arch: reorders the two
// elements of `self` according to the compile-time indices V0 and V1.
// In this diff view, gutter entries ending in `-` are the removed
// _mm_shuffle_pd-based body; entries ending in `+` are the replacement, which
// calls vec_perm with self.data as both sources and a 16-entry byte control
// vector: entries 8*V0+0 .. 8*V0+7 followed by 8*V1+0 .. 8*V1+7.
// NOTE(review): how these byte indices map to lanes on little-endian targets
// is not visible here — confirm against the vec_perm documentation.
template <class A, uint64_t V0, uint64_t V1>
926929
XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<altivec>) noexcept
927930
{
928-
constexpr uint32_t index = detail::shuffle(V0, V1);
929-
return _mm_shuffle_pd(self, self, index);
931+
return vec_perm(self.data, self.data,
932+
(__vector unsigned char) {
933+
8 * V0 + 0,
934+
8 * V0 + 1,
935+
8 * V0 + 2,
936+
8 * V0 + 3,
937+
8 * V0 + 4,
938+
8 * V0 + 5,
939+
8 * V0 + 6,
940+
8 * V0 + 7,
941+
8 * V1 + 0,
942+
8 * V1 + 1,
943+
8 * V1 + 2,
944+
8 * V1 + 3,
945+
8 * V1 + 4,
946+
8 * V1 + 5,
947+
8 * V1 + 6,
948+
8 * V1 + 7,
949+
});
930950
}
931951

932952
// swizzle, batch<uint64_t> overload for the altivec arch: reorders the two
// elements of `self` according to the compile-time indices V0 and V1.
// In this diff view, gutter entries ending in `-` are the removed
// _mm_shuffle_epi32-based body (which expressed each 64-bit index as a pair of
// 32-bit indices 2*Vi, 2*Vi+1); entries ending in `+` are the replacement,
// which calls vec_perm with self.data as both sources and a 16-entry byte
// control vector: entries 8*V0+0 .. 8*V0+7 followed by 8*V1+0 .. 8*V1+7.
// NOTE(review): how these byte indices map to lanes on little-endian targets
// is not visible here — confirm against the vec_perm documentation.
template <class A, uint64_t V0, uint64_t V1>
932952
XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<altivec>) noexcept
933953
{
934-
constexpr uint32_t index = detail::shuffle(2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1);
935-
return _mm_shuffle_epi32(self, index);
955+
return vec_perm(self.data, self.data,
956+
(__vector unsigned char) {
957+
8 * V0 + 0,
958+
8 * V0 + 1,
959+
8 * V0 + 2,
960+
8 * V0 + 3,
961+
8 * V0 + 4,
962+
8 * V0 + 5,
963+
8 * V0 + 6,
964+
8 * V0 + 7,
965+
8 * V1 + 0,
966+
8 * V1 + 1,
967+
8 * V1 + 2,
968+
8 * V1 + 3,
969+
8 * V1 + 4,
970+
8 * V1 + 5,
971+
8 * V1 + 6,
972+
8 * V1 + 7,
973+
});
937974
}
938975

939976
template <class A, uint64_t V0, uint64_t V1>
@@ -945,8 +982,12 @@ namespace xsimd
945982
// swizzle, batch<uint32_t> overload for the altivec arch: reorders the
// elements of `self` according to the compile-time indices V0..V3 carried by
// the batch_constant argument.
// In this diff view, gutter entries ending in `-` are the removed
// _mm_shuffle_epi32-based body; entries ending in `+` are the replacement,
// which calls vec_perm with self.data as both sources and a 16-entry byte
// control vector whose i-th group of four entries is 4*Vi+0 .. 4*Vi+3.
// NOTE(review): how these byte indices map to lanes on little-endian targets
// is not visible here — confirm against the vec_perm documentation.
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
946983
XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<altivec>) noexcept
947984
{
948-
constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3);
949-
return _mm_shuffle_epi32(self, index);
985+
return vec_perm(self.data, self.data,
986+
(__vector unsigned char) {
987+
4 * V0 + 0, 4 * V0 + 1, 4 * V0 + 2, 4 * V0 + 3,
988+
4 * V1 + 0, 4 * V1 + 1, 4 * V1 + 2, 4 * V1 + 3,
989+
4 * V2 + 0, 4 * V2 + 1, 4 * V2 + 2, 4 * V2 + 3,
990+
4 * V3 + 0, 4 * V3 + 1, 4 * V3 + 2, 4 * V3 + 3 });
950991
}
951992

952993
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
@@ -958,20 +999,12 @@ namespace xsimd
958999
// swizzle, batch<uint16_t> overload for the altivec arch: reorders the
// elements of `self` according to the compile-time indices V0..V7 carried by
// the batch_constant argument.
// In this diff view, gutter entries ending in `-` are the removed body, which
// shuffled the low and high halves separately (_mm_shufflelo_epi16 /
// _mm_shufflehi_epi16), duplicated each half with _mm_shuffle_pd, and blended
// the two with select() on a (Vi < 4) mask. Entries ending in `+` are the
// replacement: a single vec_perm with self.data as both sources and a
// 16-entry byte control vector whose i-th pair of entries is 2*Vi+0, 2*Vi+1.
// NOTE(review): how these byte indices map to lanes on little-endian targets
// is not visible here — confirm against the vec_perm documentation.
template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
9591000
XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<altivec>) noexcept
9601001
{
961-
// permute within each lane
962-
constexpr auto mask_lo = detail::mod_shuffle(V0, V1, V2, V3);
963-
constexpr auto mask_hi = detail::mod_shuffle(V4, V5, V6, V7);
964-
__m128i lo = _mm_shufflelo_epi16(self, mask_lo);
965-
__m128i hi = _mm_shufflehi_epi16(self, mask_hi);
966-
967-
__m128i lo_lo = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(lo), _mm_castsi128_pd(lo), _MM_SHUFFLE2(0, 0)));
968-
__m128i hi_hi = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(hi), _mm_castsi128_pd(hi), _MM_SHUFFLE2(1, 1)));
969-
970-
// mask to choose the right lane
971-
batch_bool_constant<uint16_t, A, (V0 < 4), (V1 < 4), (V2 < 4), (V3 < 4), (V4 < 4), (V5 < 4), (V6 < 4), (V7 < 4)> blend_mask;
972-
973-
// blend the two permutes
974-
return select(blend_mask, batch<uint16_t, A>(lo_lo), batch<uint16_t, A>(hi_hi));
1002+
return vec_perm(self.data, self.data,
1003+
(__vector unsigned char) {
1004+
2 * V0 + 0, 2 * V0 + 1, 2 * V1 + 0, 2 * V1 + 1,
1005+
2 * V2 + 0, 2 * V2 + 1, 2 * V3 + 0, 2 * V3 + 1,
1006+
2 * V4 + 0, 2 * V4 + 1, 2 * V5 + 0, 2 * V5 + 1,
1007+
2 * V6 + 0, 2 * V6 + 1, 2 * V7 + 0, 2 * V7 + 1 });
9751008
}
9761009

9771010
template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
@@ -980,6 +1013,8 @@ namespace xsimd
9801013
return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, altivec {}));
9811014
}
9821015

1016+
#if 0
1017+
9831018
// transpose
9841019
template <class A>
9851020
XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<altivec>) noexcept

0 commit comments

Comments
 (0)