@@ -1287,15 +1287,28 @@ namespace xsimd
         }
 
         template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
-        XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
+        XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
         {
-            XSIMD_IF_CONSTEXPR (detail::is_all_different(mask) && !detail::is_identity(mask))
+            XSIMD_IF_CONSTEXPR (detail::is_identity(mask))
             {
-                // The intrinsic does NOT allow to copy the same element of the source vector to more than one element of the destination vector.
-                // one-shot 8-lane permute
-                return _mm256_permutevar8x32_ps(self, mask.as_batch());
+                return self;
             }
-            return swizzle(self, mask, avx {});
+            XSIMD_IF_CONSTEXPR (!detail::is_cross_lane(mask))
+            {
+                constexpr auto lane_mask = mask % make_batch_constant<uint32_t, (mask.size / 2), A>();
+                // Cheaper intrinsic when not crossing lanes.
+                // We could also use _mm256_permute_ps, which takes an imm8 constant, though it has the
+                // same latency/throughput according to the Intel manual.
+                batch<float, A> permuted = _mm256_permutevar_ps(bitwise_cast<float>(self), lane_mask.as_batch());
+                return bitwise_cast<uint32_t>(permuted);
+            }
+            return _mm256_permutevar8x32_epi32(self, mask.as_batch());
+        }
+
+        template <class A, typename T, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7, detail::enable_sized_t<T, 4> = 0>
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2> req) noexcept
+        {
+            return bitwise_cast<T>(swizzle(bitwise_cast<uint32_t>(self), mask, req));
         }
 
         template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
@@ -1323,16 +1336,6 @@ namespace xsimd
         {
             return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, avx2 {}));
         }
-        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
-        XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
-        {
-            return _mm256_permutevar8x32_epi32(self, mask.as_batch());
-        }
-        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
-        XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
-        {
-            return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx2 {}));
-        }
 
         // zip_hi
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value>::type>
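
For reference, a minimal usage sketch of the two code paths introduced above. It is not part of the diff; it assumes the public xsimd::swizzle entry point forwards constant masks to this AVX2 kernel, and the mask values are chosen purely for illustration.

// Minimal sketch, assuming xsimd's public swizzle() dispatches constant masks
// to the AVX2 kernel above; mask indices are illustrative only.
#include <xsimd/xsimd.hpp>

int main()
{
    using batch_t = xsimd::batch<uint32_t, xsimd::avx2>;
    alignas(32) uint32_t in[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
    batch_t v = batch_t::load_aligned(in);

    // Reverses elements within each 128-bit lane: no lane crossing, so the
    // kernel can take the cheaper _mm256_permutevar_ps path.
    xsimd::batch_constant<uint32_t, xsimd::avx2, 3, 2, 1, 0, 7, 6, 5, 4> in_lane;
    batch_t a = xsimd::swizzle(v, in_lane); // 3 2 1 0 7 6 5 4

    // Reverses the whole vector: crosses lanes, so the kernel falls back to
    // the one-shot _mm256_permutevar8x32_epi32 permute.
    xsimd::batch_constant<uint32_t, xsimd::avx2, 7, 6, 5, 4, 3, 2, 1, 0> reversed;
    batch_t b = xsimd::swizzle(v, reversed); // 7 6 5 4 3 2 1 0

    alignas(32) uint32_t out_a[8];
    alignas(32) uint32_t out_b[8];
    a.store_aligned(out_a);
    b.store_aligned(out_b);
    return 0;
}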