File tree Expand file tree Collapse file tree 1 file changed +3
-3
lines changed
Expand file tree Collapse file tree 1 file changed +3
-3
lines changed Original file line number Diff line number Diff line change @@ -1297,8 +1297,8 @@ namespace xsimd
12971297 {
12981298 constexpr auto lane_mask = mask % make_batch_constant<uint32_t , (mask.size / 2 ), A>();
12991299 // Cheaper intrinsics when not crossing lanes
1300- // We could also use _mm256_permute_ps, which takes an imm8 constant, though it has the
1301- // same latency/throughput according to the Intel manual.
1300+ // Contrary to the uint64_t version, an 8-bit immediate constant is too small to encode
1301+ // a different permutation for each lane, so the variable-mask intrinsic is used instead.
13021302 batch<float , A> permuted = _mm256_permutevar_ps (bitwise_cast<float >(self), lane_mask.as_batch ());
13031303 return bitwise_cast<uint32_t >(permuted);
13041304 }
@@ -1320,7 +1320,7 @@ namespace xsimd
13201320 }
13211321 XSIMD_IF_CONSTEXPR (!detail::is_cross_lane (mask))
13221322 {
1323- constexpr uint32_t lane_mask = (V0 % 2 ) | ((V1 % 2 ) << 1 ) | ((V2 % 2 ) << 2 ) | ((V3 % 2 ) << 3 );
1323+ constexpr uint8_t lane_mask = (V0 % 2 ) | ((V1 % 2 ) << 1 ) | ((V2 % 2 ) << 2 ) | ((V3 % 2 ) << 3 );
13241324 // Cheaper intrinsics when not crossing lanes
13251325 batch<double , A> permuted = _mm256_permute_pd (bitwise_cast<double >(self), lane_mask);
13261326 return bitwise_cast<uint64_t >(permuted);
You can’t perform that action at this time.
0 commit comments