@@ -1119,11 +1119,28 @@ namespace xsimd
11191119 }
11201120
11211121 template <class A , typename T, detail::enable_sized_t <T, 1 > = 0 >
1122- XSIMD_INLINE batch<T, A> swizzle (batch<T, A> const & self, batch<uint8_t , A> const & mask, requires_arch<avx >) noexcept
1122+ XSIMD_INLINE batch<T, A> swizzle (batch<T, A> const & self, batch<uint8_t , A> const & mask, requires_arch<avx2 >) noexcept
11231123 {
11241124 return bitwise_cast<T>(swizzle (bitwise_cast<uint8_t >(self), mask));
11251125 }
11261126
1127+ template <class A >
1128+ XSIMD_INLINE batch<uint16_t , A> swizzle (
1129+ batch<uint16_t , A> const & self, batch<uint16_t , A> mask, requires_arch<avx2>) noexcept
1130+ {
1131+ // No blend/shuffle for 16 bits, we need to use the 8 bits version
1132+ const auto self_bytes = bitwise_cast<uint8_t >(self);
1133+ // If a mask entry is k, we want 2k in low byte and 2k+1 in high byte
1134+ const auto mask_2k_2kp1 = bitwise_cast<uint8_t >((mask << 1 ) | (mask << 9 ) | 0x100 );
1135+ return bitwise_cast<uint16_t >(swizzle (self_bytes, mask_2k_2kp1, requires_arch<avx2>));
1136+ }
1137+
1138+ template <class A , typename T, detail::enable_sized_t <T, 2 > = 0 >
1139+ XSIMD_INLINE batch<T, A> swizzle (batch<T, A> const & self, batch<uint16_t , A> const & mask, requires_arch<avx2>) noexcept
1140+ {
1141+ return bitwise_cast<T>(swizzle (bitwise_cast<uint16_t >(self), mask));
1142+ }
1143+
11271144 template <class A >
11281145 XSIMD_INLINE batch<float , A> swizzle (batch<float , A> const & self, batch<uint32_t , A> mask, requires_arch<avx2>) noexcept
11291146 {
0 commit comments