@@ -1312,29 +1312,33 @@ namespace xsimd
         }

         template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
-        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
+        XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
         {
-            XSIMD_IF_CONSTEXPR (detail::is_identity(mask)) { return self; }
+            XSIMD_IF_CONSTEXPR (detail::is_identity(mask))
+            {
+                return self;
+            }
             XSIMD_IF_CONSTEXPR (!detail::is_cross_lane(mask))
             {
-                constexpr auto imm = ((V0 & 1) << 0) | ((V1 & 1) << 1) | ((V2 & 1) << 2) | ((V3 & 1) << 3);
-                return _mm256_permute_pd(self, imm);
+                // The lane mask value is found in mask modulo 2, but the intrinsic expects it in the
+                // second least significant bit.
+                constexpr auto two = make_batch_constant<uint64_t, 2, A>();
+                constexpr auto half_size = make_batch_constant<uint64_t, (mask.size / 2), A>();
+                constexpr auto lane_mask = (mask % half_size) * two; // `* two` stands in for `<< 1`
+                // Cheaper intrinsics when not crossing lanes.
+                // We could also use _mm256_permute_pd, which takes an imm8 constant, though it has the
+                // same latency/throughput according to the Intel manual.
+                batch<double, A> permuted = _mm256_permutevar_pd(bitwise_cast<double>(self), lane_mask.as_batch());
+                return bitwise_cast<uint64_t>(permuted);
             }
-            constexpr auto imm = detail::mod_shuffle(V0, V1, V2, V3);
-            // fallback to full 4-element permute
-            return _mm256_permute4x64_pd(self, imm);
+            constexpr auto mask_int = detail::mod_shuffle(V0, V1, V2, V3);
+            return _mm256_permute4x64_epi64(self, mask_int);
         }

-        template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
-        XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
-        {
-            constexpr auto mask = detail::mod_shuffle(V0, V1, V2, V3);
-            return _mm256_permute4x64_epi64(self, mask);
-        }
-        template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
-        XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
+        template <class A, typename T, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3, detail::enable_sized_t<T, 8> = 0>
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> mask, requires_arch<avx2> req) noexcept
         {
-            return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, avx2 {}));
+            return bitwise_cast<T>(swizzle(bitwise_cast<uint64_t>(self), mask, req));
         }

         // zip_hi
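
For reference, here is a standalone sketch with raw AVX2 intrinsics (not xsimd code) that mirrors the two paths above for two example index patterns: an in-lane swap (1, 0, 3, 2), where the selector passed to _mm256_permutevar_pd is (index % 2) * 2 so the in-lane index lands in bit 1 as VPERMILPD expects, and a cross-lane swap (2, 3, 0, 1), which needs the full _mm256_permute4x64_epi64. Input values and masks are made up for illustration.

// Illustrative only; assumes an AVX2-capable CPU, compile with -mavx2.
#include <cstdint>
#include <cstdio>
#include <immintrin.h>

int main()
{
    alignas(32) const uint64_t in[4] = { 10, 11, 12, 13 };
    const __m256i self = _mm256_load_si256(reinterpret_cast<const __m256i*>(in));

    // In-lane pattern (1, 0, 3, 2): every index stays inside its 128-bit lane.
    // VPERMILPD reads bit 1 of each 64-bit selector, hence (index % 2) * 2
    // gives selectors (2, 0, 2, 0).
    const __m256i lane_mask = _mm256_setr_epi64x(2, 0, 2, 0);
    const __m256i in_lane = _mm256_castpd_si256(
        _mm256_permutevar_pd(_mm256_castsi256_pd(self), lane_mask));

    // Cross-lane pattern (2, 3, 0, 1): indices leave their lane, so the full
    // 4-element permute is needed; the immediate packs each index on 2 bits.
    const __m256i cross_lane = _mm256_permute4x64_epi64(self, (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6));

    alignas(32) uint64_t out[4];
    _mm256_store_si256(reinterpret_cast<__m256i*>(out), in_lane);
    std::printf("in-lane   : %llu %llu %llu %llu\n", (unsigned long long)out[0],
                (unsigned long long)out[1], (unsigned long long)out[2], (unsigned long long)out[3]);
    _mm256_store_si256(reinterpret_cast<__m256i*>(out), cross_lane);
    std::printf("cross-lane: %llu %llu %llu %llu\n", (unsigned long long)out[0],
                (unsigned long long)out[1], (unsigned long long)out[2], (unsigned long long)out[3]);
    // Expected: in-lane    11 10 13 12
    //           cross-lane 12 13 10 11
}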
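
A possible usage sketch of the new sized-8 overload through the public xsimd API follows, assuming AVX2 is the selected architecture and the batch_constant<uint64_t, A, V...> form used in this patch; the values and the int64_t element type are chosen for illustration.

// Illustrative only; exercises the swizzle kernel above via xsimd's public API.
#include <cstdint>
#include <cstdio>
#include <xsimd/xsimd.hpp>

int main()
{
    using batch = xsimd::batch<int64_t, xsimd::avx2>;
    const batch v(10, 11, 12, 13);

    // Compile-time pattern (1, 0, 3, 2): stays within each 128-bit lane, so the
    // _mm256_permutevar_pd path is taken after the bitwise_cast round-trip.
    using pattern = xsimd::batch_constant<uint64_t, xsimd::avx2, 1, 0, 3, 2>;
    const batch swapped = xsimd::swizzle(v, pattern {});

    alignas(32) int64_t out[batch::size];
    swapped.store_aligned(out);
    std::printf("%lld %lld %lld %lld\n", (long long)out[0], (long long)out[1],
                (long long)out[2], (long long)out[3]); // expected: 11 10 13 12
}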