
Commit 809f983

Improve avx2 uint64_t swizzle
1 parent c847672 commit 809f983

1 file changed

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 20 additions & 16 deletions
@@ -1312,29 +1312,33 @@ namespace xsimd
         }

         template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
-        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
+        XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
         {
-            XSIMD_IF_CONSTEXPR(detail::is_identity(mask)) { return self; }
+            XSIMD_IF_CONSTEXPR(detail::is_identity(mask))
+            {
+                return self;
+            }
             XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask))
             {
-                constexpr auto imm = ((V0 & 1) << 0) | ((V1 & 1) << 1) | ((V2 & 1) << 2) | ((V3 & 1) << 3);
-                return _mm256_permute_pd(self, imm);
+                // The lane mask value is found in mask modulo 2, but the intrinsic expects it in the
+                // second least significant bit.
+                constexpr auto two = make_batch_constant<uint64_t, 2, A>();
+                constexpr auto half_size = make_batch_constant<uint64_t, (mask.size / 2), A>();
+                constexpr auto lane_mask = (mask % half_size) * two; // `* two` for `<< one`
+                // Cheaper intrinsics when not crossing lanes.
+                // We could also use _mm256_permute_pd, which uses an imm8 constant, though it has the
+                // same latency/throughput according to the Intel manual.
+                batch<double, A> permuted = _mm256_permutevar_pd(bitwise_cast<double>(self), lane_mask.as_batch());
+                return bitwise_cast<uint64_t>(permuted);
             }
-            constexpr auto imm = detail::mod_shuffle(V0, V1, V2, V3);
-            // fallback to full 4-element permute
-            return _mm256_permute4x64_pd(self, imm);
+            constexpr auto mask_int = detail::mod_shuffle(V0, V1, V2, V3);
+            return _mm256_permute4x64_epi64(self, mask_int);
         }

-        template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
-        XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
-        {
-            constexpr auto mask = detail::mod_shuffle(V0, V1, V2, V3);
-            return _mm256_permute4x64_epi64(self, mask);
-        }
-        template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
-        XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
+        template <class A, typename T, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3, detail::enable_sized_t<T, 8> = 0>
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> mask, requires_arch<avx2> req) noexcept
         {
-            return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, avx2 {}));
+            return bitwise_cast<T>(swizzle(bitwise_cast<uint64_t>(self), mask, req));
         }

         // zip_hi
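
For reference, the sketch below shows how the reworked kernel is reached through the public xsimd::swizzle API. It is not part of the commit; it assumes the batch_constant<uint64_t, A, ...> form used in the diff, a recent xsimd, and an AVX2 build (e.g. -mavx2). A mask whose indices stay inside their 128-bit lane (positions 0/1 pick from {0, 1}, positions 2/3 from {2, 3}) takes the new _mm256_permutevar_pd path; any other mask falls back to _mm256_permute4x64_epi64.

#include <cstdint>
#include <xsimd/xsimd.hpp>

int main()
{
    namespace xs = xsimd;
    using batch_t = xs::batch<uint64_t, xs::avx2>;

    alignas(32) uint64_t data[] = { 10, 11, 12, 13 };
    batch_t a = batch_t::load_aligned(data);

    // In-lane mask: elements 0/1 pick from {0, 1} and 2/3 from {2, 3}, so
    // is_cross_lane(mask) is false and the kernel can use _mm256_permutevar_pd
    // with the compile-time lane mask (V % 2) * 2.
    auto in_lane = xs::swizzle(a, xs::batch_constant<uint64_t, xs::avx2, 1, 0, 3, 2> {});

    // Cross-lane mask: element 0 reads from the high lane, so the kernel falls
    // back to the full 4-element _mm256_permute4x64_epi64 permute.
    auto cross_lane = xs::swizzle(a, xs::batch_constant<uint64_t, xs::avx2, 2, 3, 0, 1> {});

    alignas(32) uint64_t out[4];
    in_lane.store_aligned(out);    // out == { 11, 10, 13, 12 }
    cross_lane.store_aligned(out); // out == { 12, 13, 10, 11 }
    return 0;
}

Because the mask is a batch_constant, the choice between the two intrinsics is made at compile time via XSIMD_IF_CONSTEXPR; no runtime branch is involved.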
