Commit c847672

Improve avx2 uint32_t swizzle
1 parent 9345138 · commit c847672

File tree

1 file changed (+19, -16 lines)

1 file changed

+19
-16
lines changed

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 19 additions & 16 deletions
@@ -1287,15 +1287,28 @@ namespace xsimd
         }
 
         template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
-        XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
+        XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
         {
-            XSIMD_IF_CONSTEXPR(detail::is_all_different(mask) && !detail::is_identity(mask))
+            XSIMD_IF_CONSTEXPR(detail::is_identity(mask))
             {
-                // The intrinsic does NOT allow to copy the same element of the source vector to more than one element of the destination vector.
-                // one-shot 8-lane permute
-                return _mm256_permutevar8x32_ps(self, mask.as_batch());
+                return self;
             }
-            return swizzle(self, mask, avx {});
+            XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask))
+            {
+                constexpr auto lane_mask = mask % make_batch_constant<uint32_t, (mask.size / 2), A>();
+                // Cheaper intrinsics when not crossing lanes.
+                // We could also use _mm256_permute_ps, which uses an imm8 constant, though it has the
+                // same latency/throughput according to the Intel manual.
+                batch<float, A> permuted = _mm256_permutevar_ps(bitwise_cast<float>(self), lane_mask.as_batch());
+                return bitwise_cast<uint32_t>(permuted);
+            }
+            return _mm256_permutevar8x32_epi32(self, mask.as_batch());
+        }
+
+        template <class A, typename T, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7, detail::enable_sized_t<T, 4> = 0>
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2> req) noexcept
+        {
+            return bitwise_cast<T>(swizzle(bitwise_cast<uint32_t>(self), mask, req));
         }
 
         template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
@@ -1323,16 +1336,6 @@ namespace xsimd
         {
             return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, avx2 {}));
         }
-        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
-        XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
-        {
-            return _mm256_permutevar8x32_epi32(self, mask.as_batch());
-        }
-        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
-        XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
-        {
-            return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx2 {}));
-        }
 
         // zip_hi
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value>::type>
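
For context, here is how a caller reaches the new AVX2 overload through xsimd's public interface. This is a minimal usage sketch, not part of the commit: it assumes the batch_constant<T, A, Values...> form used in this diff and the public xsimd::swizzle entry point, and the two masks are chosen to illustrate the in-lane fast path versus the cross-lane fallback.

// Minimal usage sketch (illustration only, not part of the commit).
// Assumes the batch_constant<T, A, Values...> signature used in this diff.
#include <cstdint>
#include <xsimd/xsimd.hpp>

int main()
{
    using arch = xsimd::avx2;

    // Eight 32-bit elements in one 256-bit register.
    xsimd::batch<uint32_t, arch> v { 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u };

    // result[i] = v[mask[i]]. Reversing within each 128-bit lane does not
    // cross lanes, so this should take the cheaper _mm256_permutevar_ps path.
    using in_lane_reverse = xsimd::batch_constant<uint32_t, arch, 3, 2, 1, 0, 7, 6, 5, 4>;
    auto r1 = xsimd::swizzle(v, in_lane_reverse {}); // { 3, 2, 1, 0, 7, 6, 5, 4 }

    // Swapping the two 128-bit halves does cross lanes, so this falls back
    // to the general _mm256_permutevar8x32_epi32 permute.
    using swap_halves = xsimd::batch_constant<uint32_t, arch, 4, 5, 6, 7, 0, 1, 2, 3>;
    auto r2 = xsimd::swizzle(v, swap_halves {}); // { 4, 5, 6, 7, 0, 1, 2, 3 }

    return static_cast<int>(r1.get(0) + r2.get(0)); // keep results observable
}

Note the design choice in the diff itself: the forwarding overload constrained by detail::enable_sized_t<T, 4> replaces the dedicated int32_t overload removed in the second hunk by bitwise-casting any 4-byte element type to uint32_t and back.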
