
Commit d889cdb

Swap instead of duplicate
1 parent 25f2a71 commit d889cdb
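The compile-time fallback for cross-lane swizzle on AVX used to duplicate both 128-bit lanes of the input (_mm256_permute2f128_ps(self, self, 0x00) and 0x11), permute each duplicate in-lane, and blend the two results. Since _mm256_permutevar_ps reads each output slot from that slot's own lane anyway, duplicating is redundant: a single lane swap (0x01) gives every slot access to the other lane, removing one cross-lane shuffle per swizzle. The blend condition changes accordingly, from "source index is in the upper half" to "source lane differs from the destination lane", computed at compile time as (mask / width) != lane. The __m256d path gets the same treatment, and the integral forwarding overloads are condensed onto fewer lines, which accounts for most of the +43/−57 churn.

A minimal standalone sketch of the two strategies, using raw AVX intrinsics instead of xsimd's wrappers (hypothetical function names, template parameters I0..I7 standing in for V0..V7; not code from this commit):

#include <immintrin.h>

// Old strategy: duplicate each 128-bit lane, permute both duplicates in-lane,
// then blend wherever the source index falls in the upper half.
template <int I0, int I1, int I2, int I3, int I4, int I5, int I6, int I7>
__m256 swizzle_duplicate(__m256 self)
{
    __m256 low_dup = _mm256_permute2f128_ps(self, self, 0x00); // [low | low ]
    __m256 hi_dup = _mm256_permute2f128_ps(self, self, 0x11);  // [high| high]
    const __m256i idx = _mm256_setr_epi32(I0 % 4, I1 % 4, I2 % 4, I3 % 4,
                                          I4 % 4, I5 % 4, I6 % 4, I7 % 4);
    __m256 r0 = _mm256_permutevar_ps(low_dup, idx); // every slot sees the low lane
    __m256 r1 = _mm256_permutevar_ps(hi_dup, idx);  // every slot sees the high lane
    constexpr int blend = (I0 >= 4) << 0 | (I1 >= 4) << 1 | (I2 >= 4) << 2 | (I3 >= 4) << 3
        | (I4 >= 4) << 4 | (I5 >= 4) << 5 | (I6 >= 4) << 6 | (I7 >= 4) << 7;
    return _mm256_blend_ps(r0, r1, blend);
}

// New strategy: swap the lanes once; a slot takes r0 when its source index lies
// in its own lane and r1 when it lies in the other lane.
template <int I0, int I1, int I2, int I3, int I4, int I5, int I6, int I7>
__m256 swizzle_swap(__m256 self)
{
    __m256 swapped = _mm256_permute2f128_ps(self, self, 0x01); // [high | low]
    const __m256i idx = _mm256_setr_epi32(I0 % 4, I1 % 4, I2 % 4, I3 % 4,
                                          I4 % 4, I5 % 4, I6 % 4, I7 % 4);
    __m256 r0 = _mm256_permutevar_ps(self, idx);    // slot i picks from its own lane
    __m256 r1 = _mm256_permutevar_ps(swapped, idx); // slot i picks from the other lane
    constexpr int blend = ((I0 / 4) != 0) << 0 | ((I1 / 4) != 0) << 1 | ((I2 / 4) != 0) << 2
        | ((I3 / 4) != 0) << 3 | ((I4 / 4) != 1) << 4 | ((I5 / 4) != 1) << 5
        | ((I6 / 4) != 1) << 6 | ((I7 / 4) != 1) << 7;
    return _mm256_blend_ps(r0, r1, blend);
}

Both variants end with the same two in-lane permutes and one immediate blend; the rewrite simply replaces two cross-lane shuffles with one. The ((Ii / 4) != slot_lane) bits above are what the patch's (mask / make_batch_constant<uint32_t, 4, A>()) != lane comparison produces via .mask().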

File tree

1 file changed: +43 −57 lines


include/xsimd/arch/xsimd_avx.hpp

Lines changed: 43 additions & 57 deletions
@@ -1629,88 +1629,74 @@ namespace xsimd
                 }
                 return split;
             }
-            // Duplicate lanes separately
-            // 1) duplicate low and high lanes
-            __m256 low_dup = _mm256_permute2f128_ps(self, self, 0x00); // [low | low]
-            __m256 hi_dup = _mm256_permute2f128_ps(self, self, 0x11); // [high| high]
 
-            // 2) build lane-local index vector (each element = source_index & 3)
-            constexpr batch_constant<uint32_t, A, (V0 % 4), (V1 % 4), (V2 % 4), (V3 % 4), (V4 % 4), (V5 % 4), (V6 % 4), (V7 % 4)> half_mask;
+            // Fallback to general algorithm. This is the same as the dynamic version with the exception
+            // that possible operations are done at compile time.
 
-            __m256 r0 = _mm256_permutevar_ps(low_dup, half_mask.as_batch()); // pick from low lane
-            __m256 r1 = _mm256_permutevar_ps(hi_dup, half_mask.as_batch()); // pick from high lane
+            // swap lanes
+            __m256 swapped = _mm256_permute2f128_ps(self, self, 0x01); // [high | low]
+
+            // normalize mask taking modulo 4
+            constexpr auto half_mask = mask % make_batch_constant<uint32_t, 4, A>();
+
+            // permute within each lane
+            __m256 r0 = _mm256_permutevar_ps(self, half_mask.as_batch());
+            __m256 r1 = _mm256_permutevar_ps(swapped, half_mask.as_batch());
 
-            constexpr batch_bool_constant<uint32_t, A, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> lane_mask {};
+            // select lane by the mask index divided by 4
+            constexpr auto lane = batch_constant<uint32_t, A, 0, 0, 0, 0, 1, 1, 1, 1> {};
+            constexpr int lane_mask = ((mask / make_batch_constant<uint32_t, 4, A>()) != lane).mask();
 
-            return _mm256_blend_ps(r0, r1, lane_mask.mask());
+            return _mm256_blend_ps(r0, r1, lane_mask);
         }
 
         template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
         XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> mask, requires_arch<avx>) noexcept
         {
             // cannot use detail::mod_shuffle as the mod and shift are different in this case
-            constexpr auto imm = ((V0 & 1) << 0) | ((V1 & 1) << 1) | ((V2 & 1) << 2) | ((V3 & 1) << 3);
+            constexpr auto imm = ((V0 % 2) << 0) | ((V1 % 2) << 1) | ((V2 % 2) << 2) | ((V3 % 2) << 3);
             XSIMD_IF_CONSTEXPR(detail::is_identity(mask)) { return self; }
             XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask))
             {
                 return _mm256_permute_pd(self, imm);
             }
-            // duplicate low and high part of input
-            __m256d lo = _mm256_permute2f128_pd(self, self, 0x00);
-            __m256d hi = _mm256_permute2f128_pd(self, self, 0x11);
+
+            // Fallback to general algorithm. This is the same as the dynamic version with the exception
+            // that possible operations are done at compile time.
+
+            // swap lanes
+            __m256d swapped = _mm256_permute2f128_pd(self, self, 0x01); // [high | low]
 
             // permute within each lane
-            __m256d r0 = _mm256_permute_pd(lo, imm);
-            __m256d r1 = _mm256_permute_pd(hi, imm);
+            __m256d r0 = _mm256_permute_pd(self, imm);
+            __m256d r1 = _mm256_permute_pd(swapped, imm);
 
-            // mask to choose the right lane
-            constexpr batch_bool_constant<uint64_t, A, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask;
+            // select lane by the mask index divided by 2
+            constexpr auto lane = batch_constant<uint64_t, A, 0, 0, 1, 1> {};
+            constexpr int lane_mask = ((mask / make_batch_constant<uint64_t, 2, A>()) != lane).mask();
 
             // blend the two permutes
-            return _mm256_blend_pd(r0, r1, blend_mask.mask());
-        }
-        template <class A,
-                  typename T,
-                  uint32_t V0,
-                  uint32_t V1,
-                  uint32_t V2,
-                  uint32_t V3,
-                  uint32_t V4,
-                  uint32_t V5,
-                  uint32_t V6,
-                  uint32_t V7,
-                  detail::enable_sized_integral_t<T, 4> = 0>
-        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self,
-                                         batch_constant<uint32_t, A,
-                                                        V0,
-                                                        V1,
-                                                        V2,
-                                                        V3,
-                                                        V4,
-                                                        V5,
-                                                        V6,
-                                                        V7> const& mask,
-                                         requires_arch<avx>) noexcept
+            return _mm256_blend_pd(r0, r1, lane_mask);
+        }
+
+        template <
+            class A, typename T,
+            uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7,
+            detail::enable_sized_integral_t<T, 4> = 0>
+        XSIMD_INLINE batch<T, A> swizzle(
+            batch<T, A> const& self,
+            batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> const& mask,
+            requires_arch<avx>) noexcept
         {
-            return bitwise_cast<T>(
-                swizzle(bitwise_cast<float>(self), mask));
+            return bitwise_cast<T>(swizzle(bitwise_cast<float>(self), mask));
         }
 
-        template <class A,
-                  typename T,
-                  uint64_t V0,
-                  uint64_t V1,
-                  uint64_t V2,
-                  uint64_t V3,
-                  detail::enable_sized_integral_t<T, 8> = 0>
-        XSIMD_INLINE batch<T, A>
-        swizzle(batch<T, A> const& self,
-                batch_constant<uint64_t, A, V0, V1, V2, V3> const& mask,
-                requires_arch<avx>) noexcept
+        template <class A, typename T, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3, detail::enable_sized_integral_t<T, 8> = 0>
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> const& mask, requires_arch<avx>) noexcept
         {
-            return bitwise_cast<T>(
-                swizzle(bitwise_cast<double>(self), mask));
+            return bitwise_cast<T>(swizzle(bitwise_cast<double>(self), mask));
        }
+
         // transpose
         template <class A>
         XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<avx>) noexcept
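For context, a hypothetical caller-side example of the code path this commit touches; it assumes a recent xsimd (where batch_constant is spelled batch_constant<T, Arch, Values...>, as in this file) built with AVX enabled (-mavx). Element i of the result is self[mask[i]], and a full reversal moves every element across the 128-bit lane boundary, so it exercises the swap-and-blend fallback:

#include <xsimd/xsimd.hpp>
#include <cstdint>
#include <cstdio>

int main()
{
    using arch = xsimd::avx;
    using b8f = xsimd::batch<float, arch>;

    alignas(32) float in[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
    b8f v = b8f::load_aligned(in);

    // Full reversal: every element changes lane, so the generic fallback runs.
    using reverse_t = xsimd::batch_constant<uint32_t, arch, 7, 6, 5, 4, 3, 2, 1, 0>;
    b8f r = xsimd::swizzle(v, reverse_t {});

    alignas(32) float out[8];
    r.store_aligned(out);
    for (float x : out)
        std::printf("%g ", x); // expected: 7 6 5 4 3 2 1 0
    std::printf("\n");
}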
