Skip to content

Commit cce8173

Browse files
committed
Add comment on choice of intrinsic
1 parent be06734 commit cce8173

File tree

1 file changed

+3
-3
lines changed

1 file changed

+3
-3
lines changed

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1297,8 +1297,8 @@ namespace xsimd
12971297
{
12981298
constexpr auto lane_mask = mask % make_batch_constant<uint32_t, (mask.size / 2), A>();
12991299
// Cheaper intrinsics when not crossing lanes
1300-
// We could also use _mm256_permute_ps which uses a imm8 constant, though it has the
1301-
// same latency/throughput according to Intel manual.
1300+
// Contrary to the uint64_t version, the 8-bit limit of the immediate constant
1301+
// means it cannot encode a different permutation for each lane
13021302
batch<float, A> permuted = _mm256_permutevar_ps(bitwise_cast<float>(self), lane_mask.as_batch());
13031303
return bitwise_cast<uint32_t>(permuted);
13041304
}
@@ -1320,7 +1320,7 @@ namespace xsimd
13201320
}
13211321
XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask))
13221322
{
1323-
constexpr uint32_t lane_mask = (V0 % 2) | ((V1 % 2) << 1) | ((V2 % 2) << 2) | ((V3 % 2) << 3);
1323+
constexpr uint8_t lane_mask = (V0 % 2) | ((V1 % 2) << 1) | ((V2 % 2) << 2) | ((V3 % 2) << 3);
13241324
// Cheaper intrinsics when not crossing lanes
13251325
batch<double, A> permuted = _mm256_permute_pd(bitwise_cast<double>(self), lane_mask);
13261326
return bitwise_cast<uint64_t>(permuted);

0 commit comments

Comments
 (0)