Skip to content

Commit be3e8bd

Browse files
committed
Add broadcast optimization
1 parent d889cdb commit be3e8bd

File tree

1 file changed

+26
-1
lines changed

1 file changed

+26
-1
lines changed

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1629,6 +1629,17 @@ namespace xsimd
16291629
}
16301630
return split;
16311631
}
1632+
constexpr auto lane_mask = mask % make_batch_constant<uint32_t, (mask.size / 2), A>();
1633+
XSIMD_IF_CONSTEXPR(detail::is_only_from_lo(mask))
1634+
{
1635+
__m256 broadcast = _mm256_permute2f128_ps(self, self, 0x00); // [low | low]
1636+
return _mm256_permutevar_ps(broadcast, lane_mask.as_batch());
1637+
}
1638+
XSIMD_IF_CONSTEXPR(detail::is_only_from_hi(mask))
1639+
{
1640+
__m256 broadcast = _mm256_permute2f128_ps(self, self, 0x11); // [high | high]
1641+
return _mm256_permutevar_ps(broadcast, lane_mask.as_batch());
1642+
}
16321643

16331644
// Fallback to general algorithm. This is the same as the dynamic version with the exception
16341645
// that possible operations are done at compile time.
@@ -1655,11 +1666,25 @@ namespace xsimd
16551666
{
16561667
// cannot use detail::mod_shuffle as the mod and shift are different in this case
16571668
constexpr auto imm = ((V0 % 2) << 0) | ((V1 % 2) << 1) | ((V2 % 2) << 2) | ((V3 % 2) << 3);
1658-
XSIMD_IF_CONSTEXPR(detail::is_identity(mask)) { return self; }
1669+
XSIMD_IF_CONSTEXPR(detail::is_identity(mask))
1670+
{
1671+
return self;
1672+
}
16591673
XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask))
16601674
{
16611675
return _mm256_permute_pd(self, imm);
16621676
}
1677+
constexpr auto lane_mask = mask % make_batch_constant<uint64_t, (mask.size / 2), A>();
1678+
XSIMD_IF_CONSTEXPR(detail::is_only_from_lo(mask))
1679+
{
1680+
__m256d broadcast = _mm256_permute2f128_pd(self, self, 0x00); // [low | low]
1681+
return _mm256_permute_pd(broadcast, imm);
1682+
}
1683+
XSIMD_IF_CONSTEXPR(detail::is_only_from_hi(mask))
1684+
{
1685+
__m256d broadcast = _mm256_permute2f128_pd(self, self, 0x11); // [high | high]
1686+
return _mm256_permute_pd(broadcast, imm);
1687+
}
16631688

16641689
// Fallback to general algorithm. This is the same as the dynamic version with the exception
16651690
// that possible operations are done at compile time.

0 commit comments

Comments
 (0)