Skip to content

Commit ebc033e

Browse files
committed
Use immediate constant
1 parent d99d88e commit ebc033e

File tree

1 file changed

+2
-8
lines changed

1 file changed

+2
-8
lines changed

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1320,15 +1320,9 @@ namespace xsimd
13201320
}
13211321
XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask))
13221322
{
1323-
// The lane mask value is found in mask modulo 2, but the intrinsic expects it in the
1324-
// second least significant bit.
1325-
constexpr auto two = make_batch_constant<uint64_t, 2, A>();
1326-
constexpr auto half_size = make_batch_constant<uint64_t, (mask.size / 2), A>();
1327-
constexpr auto lane_mask = (mask % half_size) * two; // `* two` for `<< one`
1323+
constexpr uint8_t lane_mask = (V0 % 2) | ((V1 % 2) << 1) | ((V2 % 2) << 2) | ((V3 % 2) << 3);
13281324
// Cheaper intrinsics when not crossing lanes
1329-
// We could also use _mm256_permute_pd which uses an imm8 constant, though it has the
1330-
// same latency/throughput according to Intel manual.
1331-
batch<double, A> permuted = _mm256_permutevar_pd(bitwise_cast<double>(self), lane_mask.as_batch());
1325+
batch<double, A> permuted = _mm256_permute_pd(bitwise_cast<double>(self), lane_mask);
13321326
return bitwise_cast<uint64_t>(permuted);
13331327
}
13341328
constexpr auto mask_int = detail::mod_shuffle(V0, V1, V2, V3);

0 commit comments

Comments
 (0)