Skip to content

Commit 7ede28f

Browse files
committed
Add dynamic swizzle for uint8_t on avx2
1 parent 9d41ad9 commit 7ede28f

File tree

1 file changed

+28
-0
lines changed

1 file changed

+28
-0
lines changed

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1096,6 +1096,34 @@ namespace xsimd
10961096
}
10971097

10981098
// swizzle (dynamic mask)
1099+
template <class A>
1100+
XSIMD_INLINE batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch<uint8_t, A> mask, requires_arch<avx2>) noexcept
1101+
{
1102+
// swap lanes
1103+
__m256i swapped = _mm256_permute2x128_si256(self, self, 0x01); // [high | low]
1104+
1105+
// normalize mask taking modulo 16
1106+
batch<uint8_t, A> half_mask = mask & 0b1111u;
1107+
1108+
// permute bytes within each lane (AVX2 only)
1109+
__m256i r0 = _mm256_shuffle_epi8(self, half_mask);
1110+
__m256i r1 = _mm256_shuffle_epi8(swapped, half_mask);
1111+
1112+
// select lane by the mask index divided by 16
1113+
constexpr auto lane = batch_constant<
1114+
uint8_t, A,
1115+
00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00,
1116+
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16> {};
1117+
batch_bool<uint8_t, A> blend_mask = (mask & 0b10000u) != lane;
1118+
return _mm256_blendv_epi8(r0, r1, blend_mask);
1119+
}
1120+
1121+
template <class A, typename T, detail::enable_sized_t<T, 1> = 0>
1122+
XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<uint8_t, A> const& mask, requires_arch<avx>) noexcept
1123+
{
1124+
return bitwise_cast<T>(swizzle(bitwise_cast<uint8_t>(self), mask));
1125+
}
1126+
10991127
template <class A>
11001128
XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
11011129
{

0 commit comments

Comments
 (0)