Add comment on choice of intrinsic

AntoinePrv · AntoinePrv · commit cce817349213 · 2025-11-13T15:05:33.000-08:00
diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp
@@ -1297,8 +1297,8 @@ namespace xsimd
             {
                 constexpr auto lane_mask = mask % make_batch_constant<uint32_t, (mask.size / 2), A>();
                 // Cheaper intrinsics when not crossing lanes
-                // We could also use _mm256_permute_ps which uses a imm8 constant, though it has the
-                // same latency/throughput according to Intel manual.
+                // Contrary to the uint64_t version, the limits of 8 bits for the immediate constant
+                // cannot make different permutations across lanes
                 batch<float, A> permuted = _mm256_permutevar_ps(bitwise_cast<float>(self), lane_mask.as_batch());
                 return bitwise_cast<uint32_t>(permuted);
             }
@@ -1320,7 +1320,7 @@ namespace xsimd
             }
             XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask))
             {
-                constexpr uint32_t lane_mask = (V0 % 2) | ((V1 % 2) << 1) | ((V2 % 2) << 2) | ((V3 % 2) << 3);
+                constexpr uint8_t lane_mask = (V0 % 2) | ((V1 % 2) << 1) | ((V2 % 2) << 2) | ((V3 % 2) << 3);
                 // Cheaper intrinsics when not crossing lanes
                 batch<double, A> permuted = _mm256_permute_pd(bitwise_cast<double>(self), lane_mask);
                 return bitwise_cast<uint64_t>(permuted);

Original file line number	Diff line number	Diff line change
`@@ -1297,8 +1297,8 @@ namespace xsimd`
`1297`	`1297`	`{`
`1298`	`1298`	`constexpr auto lane_mask = mask % make_batch_constant<uint32_t, (mask.size / 2), A>();`
`1299`	`1299`	`// Cheaper intrinsics when not crossing lanes`
`1300`		`- // We could also use _mm256_permute_ps which uses a imm8 constant, though it has the`
`1301`		`- // same latency/throughput according to Intel manual.`
	`1300`	`+ // Contrary to the uint64_t version, the limits of 8 bits for the immediate constant`
	`1301`	`+ // cannot make different permutations across lanes`
`1302`	`1302`	`batch<float, A> permuted = _mm256_permutevar_ps(bitwise_cast<float>(self), lane_mask.as_batch());`
`1303`	`1303`	`return bitwise_cast<uint32_t>(permuted);`
`1304`	`1304`	`}`
`@@ -1320,7 +1320,7 @@ namespace xsimd`
`1320`	`1320`	`}`
`1321`	`1321`	`XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask))`
`1322`	`1322`	`{`
`1323`		`- constexpr uint32_t lane_mask = (V0 % 2) | ((V1 % 2) << 1) | ((V2 % 2) << 2) | ((V3 % 2) << 3);`
	`1323`	`+ constexpr uint8_t lane_mask = (V0 % 2) | ((V1 % 2) << 1) | ((V2 % 2) << 2) | ((V3 % 2) << 3);`
`1324`	`1324`	`// Cheaper intrinsics when not crossing lanes`
`1325`	`1325`	`batch<double, A> permuted = _mm256_permute_pd(bitwise_cast<double>(self), lane_mask);`
`1326`	`1326`	`return bitwise_cast<uint64_t>(permuted);`