Add dynamic swizzle for uint16_t on avx2

AntoinePrv · AntoinePrv · commit ca01f17d0d31 · 2025-11-10T10:39:55.000-08:00
diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp
@@ -1119,11 +1119,28 @@ namespace xsimd
         }
 
         template <class A, typename T, detail::enable_sized_t<T, 1> = 0>
-        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<uint8_t, A> const& mask, requires_arch<avx>) noexcept
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<uint8_t, A> const& mask, requires_arch<avx2>) noexcept
         {
             return bitwise_cast<T>(swizzle(bitwise_cast<uint8_t>(self), mask));
         }
 
+        template <class A>
+        XSIMD_INLINE batch<uint16_t, A> swizzle(
+            batch<uint16_t, A> const& self, batch<uint16_t, A> mask, requires_arch<avx2>) noexcept
+        {
+            // No blend/shuffle for 16 bits, so view the data as bytes and reuse the 8 bits swizzle
+            const auto self_bytes = bitwise_cast<uint8_t>(self);
+            // A mask entry k must pick source bytes 2k and 2k+1: low byte = k << 1, high byte = (k << 1) | 1
+            const auto mask_2k_2kp1 = bitwise_cast<uint8_t>((mask << 1) | (mask << 9) | 0x100);
+            return bitwise_cast<uint16_t>(swizzle(self_bytes, mask_2k_2kp1, avx2 {})); // pass a tag object: requires_arch<avx2> is a type
+        }
+
+        template <class A, typename T, detail::enable_sized_t<T, 2> = 0> // NOTE(review): presumably enables this overload only for 2-byte T - confirm in detail::enable_sized_t
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<uint16_t, A> const& mask, requires_arch<avx2>) noexcept
+        { // Dynamic swizzle for T expressed through the uint16_t swizzle
+            return bitwise_cast<T>(swizzle(bitwise_cast<uint16_t>(self), mask)); // view self as uint16_t, swizzle with mask, cast the result back to T
+        }
+
         template <class A>
         XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
         {