Skip to content

Commit ca01f17

Browse files
committed
Add dynamic swizzle for uint16_t on avx2
1 parent 7ede28f commit ca01f17

File tree

1 file changed

+18
-1
lines changed

1 file changed

+18
-1
lines changed

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1119,11 +1119,28 @@ namespace xsimd
11191119
}
11201120

11211121
// Dynamic swizzle overload selected through detail::enable_sized_t<T, 1>
// (presumably: element types T whose size is one byte -- confirm against the
// definition of enable_sized_t). It reinterprets `self` as a batch of uint8_t,
// calls the two-argument swizzle with `mask`, and casts the result back to T.
// NOTE: this is a diff view -- the signature following the "1122-" marker
// (requires_arch<avx>) is the removed line and the one following "1122+"
// (requires_arch<avx2>) is its replacement.
template <class A, typename T, detail::enable_sized_t<T, 1> = 0>
1122-
XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<uint8_t, A> const& mask, requires_arch<avx>) noexcept
1122+
XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<uint8_t, A> const& mask, requires_arch<avx2>) noexcept
11231123
{
11241124
return bitwise_cast<T>(swizzle(bitwise_cast<uint8_t>(self), mask));
11251125
}
11261126

1127+
template <class A>
1128+
XSIMD_INLINE batch<uint16_t, A> swizzle(
1129+
batch<uint16_t, A> const& self, batch<uint16_t, A> mask, requires_arch<avx2>) noexcept
1130+
{
1131+
// No blend/shuffle for 16 bits, we need to use the 8 bits version
1132+
const auto self_bytes = bitwise_cast<uint8_t>(self);
1133+
// If a mask entry is k, we want 2k in low byte and 2k+1 in high byte
1134+
const auto mask_2k_2kp1 = bitwise_cast<uint8_t>((mask << 1) | (mask << 9) | 0x100);
1135+
return bitwise_cast<uint16_t>(swizzle(self_bytes, mask_2k_2kp1, requires_arch<avx2>));
1136+
}
1137+
1138+
template <class A, typename T, detail::enable_sized_t<T, 2> = 0>
1139+
XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<uint16_t, A> const& mask, requires_arch<avx2>) noexcept
1140+
{
1141+
return bitwise_cast<T>(swizzle(bitwise_cast<uint16_t>(self), mask));
1142+
}
1143+
11271144
template <class A>
11281145
XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
11291146
{

0 commit comments

Comments
 (0)