Skip to content

Commit 359ebb2

Browse files
committed
improved sse2 swizzle
# Conflicts:
#	include/xsimd/arch/xsimd_sse2.hpp
1 parent b2af082 commit 359ebb2

File tree

1 file changed

+62
-33
lines changed

1 file changed

+62
-33
lines changed

include/xsimd/arch/xsimd_sse2.hpp

Lines changed: 62 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1688,53 +1688,83 @@ namespace xsimd
16881688
}
16891689

16901690
template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
1691-
XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<sse2>) noexcept
1691+
XSIMD_INLINE batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<sse2>) noexcept
16921692
{
1693-
__m128i v = self;
1694-
1693+
// 0) identity?
1694+
constexpr bool is_identity = (V0 == 0 && V1 == 1 && V2 == 2 && V3 == 3 && V4 == 4 && V5 == 5 && V6 == 6 && V7 == 7);
1695+
XSIMD_IF_CONSTEXPR(is_identity)
1696+
{
1697+
return self;
1698+
}
1699+
// 1) duplicate‐low‐half? (lanes 0–3 from low half, and 4–7 the same)
1700+
constexpr bool is_dup_lo = (V0 < 4 && V1 < 4 && V2 < 4 && V3 < 4) && V4 == V0 && V5 == V1 && V6 == V2 && V7 == V3;
1701+
XSIMD_IF_CONSTEXPR(is_dup_lo)
1702+
{
1703+
// permute the low half
1704+
constexpr int imm = detail::mod_shuffle(V0, V1, V2, V3);
1705+
const auto lo = _mm_shufflelo_epi16(self, imm);
1706+
// broadcast that 64-bit low half into both halves
1707+
const auto lo_all = _mm_unpacklo_epi64(lo, lo);
1708+
return lo_all;
1709+
}
1710+
// 2) duplicate‐high‐half? (lanes 0–3 from high half, 4–7 the same)
1711+
constexpr bool is_dup_hi = (V0 >= 4 && V0 < 8 && V1 >= 4 && V1 < 8 && V2 >= 4 && V2 < 8 && V3 >= 4 && V3 < 8) && V4 == V0 && V5 == V1 && V6 == V2 && V7 == V3;
1712+
XSIMD_IF_CONSTEXPR(is_dup_hi)
1713+
{
1714+
// permute the high half (indices %4)
1715+
constexpr int imm = detail::mod_shuffle(V0 - 4, V1 - 4, V2 - 4, V3 - 4);
1716+
const auto hi = _mm_shufflehi_epi16(self, imm);
1717+
// broadcast that 64-bit high half into both halves
1718+
const auto hi_all = _mm_unpackhi_epi64(hi, hi);
1719+
return hi_all;
1720+
}
16951721
// 1) Shuffle the low 64-bit half for lanes 0–3 and 4–7:
16961722
constexpr int imm_lo0 = detail::mod_shuffle(V0, V1, V2, V3);
16971723
constexpr int imm_lo1 = detail::mod_shuffle(V4, V5, V6, V7);
1698-
__m128i lo0 = _mm_shufflelo_epi16(v, imm_lo0);
1699-
__m128i lo1 = _mm_shufflelo_epi16(v, imm_lo1);
1700-
1724+
const auto lo0 = _mm_shufflelo_epi16(self, imm_lo0);
1725+
const auto lo1 = _mm_shufflelo_epi16(self, imm_lo1);
17011726
// Broadcast each low-half permutation across both 64-bit halves:
1702-
__m128i lo0_all = _mm_unpacklo_epi64(lo0, lo0);
1703-
__m128i lo1_all = _mm_unpacklo_epi64(lo1, lo1);
1704-
1727+
const auto lo0_all = _mm_unpacklo_epi64(lo0, lo0);
1728+
const auto lo1_all = _mm_unpacklo_epi64(lo1, lo1);
17051729
// 2) Shuffle the high 64-bit half for lanes 0–3 and 4–7:
1706-
constexpr int imm_hi0 = detail::mod_shuffle(V0 - 4, V1 - 4, V2 - 4, V3 - 4);
1707-
constexpr int imm_hi1 = detail::mod_shuffle(V4 - 4, V5 - 4, V6 - 4, V7 - 4);
1708-
__m128i hi0 = _mm_shufflehi_epi16(v, imm_hi0);
1709-
__m128i hi1 = _mm_shufflehi_epi16(v, imm_hi1);
17101730

17111731
// Broadcast each high-half permutation across both 64-bit halves:
1712-
__m128i hi0_all = _mm_unpackhi_epi64(hi0, hi0);
1713-
__m128i hi1_all = _mm_unpackhi_epi64(hi1, hi1);
1732+
// hi0_all: only instantiated if any of V0..V3 >= 4
1733+
const auto hi0_all = [&]
1734+
{
1735+
XSIMD_IF_CONSTEXPR(!(V0 < 4 && V1 < 4 && V2 < 4 && V3 < 4))
1736+
{
1737+
constexpr int imm_hi0 = detail::mod_shuffle(V0 - 4, V1 - 4, V2 - 4, V3 - 4);
1738+
__m128i hi0 = _mm_shufflehi_epi16(self, imm_hi0);
1739+
return _mm_unpackhi_epi64(hi0, hi0);
1740+
}
1741+
// not used whenever all V0..V3<4
1742+
return _mm_setzero_si128();
1743+
}();
17141744

1745+
// hi1_all: only instantiated if any of V4..V7 >= 4
1746+
const auto hi1_all = [&]
1747+
{
1748+
XSIMD_IF_CONSTEXPR(!(V4 < 4 && V5 < 4 && V6 < 4 && V7 < 4))
1749+
{
1750+
constexpr int imm_hi1 = detail::mod_shuffle(V4 - 4, V5 - 4, V6 - 4, V7 - 4);
1751+
__m128i hi1 = _mm_shufflehi_epi16(self, imm_hi1);
1752+
return _mm_unpackhi_epi64(hi1, hi1);
1753+
}
1754+
return _mm_setzero_si128();
1755+
}();
17151756
// 3) Merge the two “low” broadcasts into one vector (lanes 0–3 ← lo0_all, lanes 4–7 ← lo1_all)
1716-
__m128i low_all = _mm_unpacklo_epi64(lo0_all, lo1_all); // { lo0, lo1 }
1717-
1718-
// constexpr batch_bool_constant<uint16_t, A, false, false, false, false, true, true, true, true> group_mask {};
1719-
// auto low_all = select(group_mask, batch<uint16_t, A>(lo1_all), batch<uint16_t, A>(lo0_all));
1720-
1757+
const auto low_all = _mm_unpacklo_epi64(lo0_all, lo1_all); // { lo0, lo1 }
17211758
// Likewise merge the two “high” broadcasts:
1722-
__m128i high_all = _mm_unpacklo_epi64(hi0_all, hi1_all); // { hi0, hi1 }
1723-
1724-
// auto high_all = select(group_mask, batch<uint16_t, A>(hi1_all), batch<uint16_t, A>(hi0_all));
1725-
1759+
const auto high_all = _mm_unpacklo_epi64(hi0_all, hi1_all); // { hi0, hi1 }
17261760
// 4) Finally, pick per-lane: if Vn<4 → take from low_all, else from high_all
1727-
constexpr batch_bool_constant<uint16_t, A, (V0 < 4), (V1 < 4), (V2 < 4), (V3 < 4), (V4 < 4), (V5 < 4), (V6 < 4), (V7 < 4)> lane_mask {};
1728-
return select(lane_mask, // mask[i] ? low_all[i] : high_all[i]
1729-
batch<uint16_t, A>(low_all),
1730-
batch<uint16_t, A>(high_all));
1731-
// return select(lane_mask, low_all, high_all);
1761+
constexpr batch_bool_constant<int16_t, A, (V0 < 4), (V1 < 4), (V2 < 4), (V3 < 4), (V4 < 4), (V5 < 4), (V6 < 4), (V7 < 4)> lane_mask {};
1762+
return select(lane_mask, batch<int16_t>(low_all), batch<int16_t>(high_all));
17321763
}
1733-
17341764
template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
1735-
XSIMD_INLINE batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<sse2>) noexcept
1765+
XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<sse2>) noexcept
17361766
{
1737-
return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, sse2 {}));
1767+
return bitwise_cast<uint16_t>(swizzle(bitwise_cast<int16_t>(self), mask, sse2 {}));
17381768
}
17391769

17401770
// transpose
@@ -1854,7 +1884,6 @@ namespace xsimd
18541884
{
18551885
return _mm_unpacklo_pd(self, other);
18561886
}
1857-
18581887
}
18591888
}
18601889

0 commit comments

Comments
 (0)