@@ -1096,6 +1096,34 @@ namespace xsimd
10961096 }
10971097
10981098 // swizzle (dynamic mask)
1099+ template <class A >
1100+ XSIMD_INLINE batch<uint8_t , A> swizzle (batch<uint8_t , A> const & self, batch<uint8_t , A> mask, requires_arch<avx2>) noexcept
1101+ {
1102+ // swap lanes
1103+ __m256i swapped = _mm256_permute2x128_si256 (self, self, 0x01 ); // [high | low]
1104+
1105+ // normalize mask taking modulo 16
1106+ batch<uint8_t , A> half_mask = mask & 0b1111u ;
1107+
1108+ // permute bytes within each lane (AVX2 only)
1109+ __m256i r0 = _mm256_shuffle_epi8 (self, half_mask);
1110+ __m256i r1 = _mm256_shuffle_epi8 (swapped, half_mask);
1111+
1112+ // select lane by the mask index divided by 16
1113+ constexpr auto lane = batch_constant<
1114+ uint8_t , A,
1115+ 00 , 00 , 00 , 00 , 00 , 00 , 00 , 00 , 00 , 00 , 00 , 00 , 00 , 00 , 00 , 00 ,
1116+ 16 , 16 , 16 , 16 , 16 , 16 , 16 , 16 , 16 , 16 , 16 , 16 , 16 , 16 , 16 , 16 > {};
1117+ batch_bool<uint8_t , A> blend_mask = (mask & 0b10000u ) != lane;
1118+ return _mm256_blendv_epi8 (r0, r1, blend_mask);
1119+ }
1120+
1121+ template <class A , typename T, detail::enable_sized_t <T, 1 > = 0 >
1122+ XSIMD_INLINE batch<T, A> swizzle (batch<T, A> const & self, batch<uint8_t , A> const & mask, requires_arch<avx>) noexcept
1123+ {
1124+ return bitwise_cast<T>(swizzle (bitwise_cast<uint8_t >(self), mask));
1125+ }
1126+
10991127 template <class A >
11001128 XSIMD_INLINE batch<float , A> swizzle (batch<float , A> const & self, batch<uint32_t , A> mask, requires_arch<avx2>) noexcept
11011129 {
0 commit comments