@@ -1688,53 +1688,83 @@ namespace xsimd
16881688 }
16891689
16901690 template <class A , uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
1691- XSIMD_INLINE batch<uint16_t , A> swizzle (batch<uint16_t , A> const & self, batch_constant<uint16_t , A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<sse2>) noexcept
1691+ XSIMD_INLINE batch<int16_t , A> swizzle (batch<int16_t , A> const & self, batch_constant<uint16_t , A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<sse2>) noexcept // permutes the eight 16-bit lanes: result lane i is taken from self lane Vi
16921692 {
1693- __m128i v = self;
1694-
1693+ // 0) Identity permutation: every lane already sits in place, so no shuffle is needed.
1694+ constexpr bool is_identity = (V0 == 0 && V1 == 1 && V2 == 2 && V3 == 3 && V4 == 4 && V5 == 5 && V6 == 6 && V7 == 7 );
1695+ XSIMD_IF_CONSTEXPR (is_identity)
1696+ {
1697+ return self;
1698+ }
1699+ // 1) Duplicate-low-half: lanes 0-3 all come from the low half and lanes 4-7 repeat the same pattern.
1700+ constexpr bool is_dup_lo = (V0 < 4 && V1 < 4 && V2 < 4 && V3 < 4 ) && V4 == V0 && V5 == V1 && V6 == V2 && V7 == V3;
1701+ XSIMD_IF_CONSTEXPR (is_dup_lo)
1702+ {
1703+ // permute the low half
1704+ constexpr int imm = detail::mod_shuffle (V0, V1, V2, V3);
1705+ const auto lo = _mm_shufflelo_epi16 (self, imm);
1706+ // broadcast that 64-bit low half into both halves
1707+ const auto lo_all = _mm_unpacklo_epi64 (lo, lo);
1708+ return lo_all;
1709+ }
1710+ // 2) Duplicate-high-half: lanes 0-3 all come from the high half and lanes 4-7 repeat the same pattern.
1711+ constexpr bool is_dup_hi = (V0 >= 4 && V0 < 8 && V1 >= 4 && V1 < 8 && V2 >= 4 && V2 < 8 && V3 >= 4 && V3 < 8 ) && V4 == V0 && V5 == V1 && V6 == V2 && V7 == V3;
1712+ XSIMD_IF_CONSTEXPR (is_dup_hi)
1713+ {
1714+ // permute the high half (indices rebased to 0-3 by subtracting 4)
1715+ constexpr int imm = detail::mod_shuffle (V0 - 4 , V1 - 4 , V2 - 4 , V3 - 4 );
1716+ const auto hi = _mm_shufflehi_epi16 (self, imm);
1717+ // broadcast that 64-bit high half into both halves
1718+ const auto hi_all = _mm_unpackhi_epi64 (hi, hi);
1719+ return hi_all;
1720+ }
16951721 // 1) Shuffle the low 64-bit half for lanes 0–3 and 4–7:
16961722 constexpr int imm_lo0 = detail::mod_shuffle (V0, V1, V2, V3);
16971723 constexpr int imm_lo1 = detail::mod_shuffle (V4, V5, V6, V7);
1698- __m128i lo0 = _mm_shufflelo_epi16 (v, imm_lo0);
1699- __m128i lo1 = _mm_shufflelo_epi16 (v, imm_lo1);
1700-
1724+ const auto lo0 = _mm_shufflelo_epi16 (self, imm_lo0);
1725+ const auto lo1 = _mm_shufflelo_epi16 (self, imm_lo1);
17011726 // Broadcast each low-half permutation across both 64-bit halves:
1702- __m128i lo0_all = _mm_unpacklo_epi64 (lo0, lo0);
1703- __m128i lo1_all = _mm_unpacklo_epi64 (lo1, lo1);
1704-
1727+ const auto lo0_all = _mm_unpacklo_epi64 (lo0, lo0);
1728+ const auto lo1_all = _mm_unpacklo_epi64 (lo1, lo1);
17051729 // 2) Shuffle the high 64-bit half for lanes 0–3 and 4–7:
1706- constexpr int imm_hi0 = detail::mod_shuffle (V0 - 4 , V1 - 4 , V2 - 4 , V3 - 4 );
1707- constexpr int imm_hi1 = detail::mod_shuffle (V4 - 4 , V5 - 4 , V6 - 4 , V7 - 4 );
1708- __m128i hi0 = _mm_shufflehi_epi16 (v, imm_hi0);
1709- __m128i hi1 = _mm_shufflehi_epi16 (v, imm_hi1);
17101730
17111731 // Broadcast each high-half permutation across both 64-bit halves:
1712- __m128i hi0_all = _mm_unpackhi_epi64 (hi0, hi0);
1713- __m128i hi1_all = _mm_unpackhi_epi64 (hi1, hi1);
1732+ // hi0_all: the high-half shuffle is only needed if any of V0..V3 >= 4; otherwise a zero vector stands in
1733+ const auto hi0_all = [&]
1734+ {
1735+ XSIMD_IF_CONSTEXPR (!(V0 < 4 && V1 < 4 && V2 < 4 && V3 < 4 ))
1736+ {
1737+ constexpr int imm_hi0 = detail::mod_shuffle (V0 - 4 , V1 - 4 , V2 - 4 , V3 - 4 );
1738+ __m128i hi0 = _mm_shufflehi_epi16 (self, imm_hi0);
1739+ return _mm_unpackhi_epi64 (hi0, hi0);
1740+ }
1741+ // placeholder: lane_mask below never selects from it when all of V0..V3 < 4
1742+ return _mm_setzero_si128 ();
1743+ }();
17141744
1745+ // hi1_all: the high-half shuffle is only needed if any of V4..V7 >= 4; otherwise a zero vector stands in
1746+ const auto hi1_all = [&]
1747+ {
1748+ XSIMD_IF_CONSTEXPR (!(V4 < 4 && V5 < 4 && V6 < 4 && V7 < 4 ))
1749+ {
1750+ constexpr int imm_hi1 = detail::mod_shuffle (V4 - 4 , V5 - 4 , V6 - 4 , V7 - 4 );
1751+ __m128i hi1 = _mm_shufflehi_epi16 (self, imm_hi1);
1752+ return _mm_unpackhi_epi64 (hi1, hi1);
1753+ }
1754+ return _mm_setzero_si128 (); // placeholder: lane_mask below never selects from it when all of V4..V7 < 4
1755+ }();
17151756 // 3) Merge the two “low” broadcasts into one vector (lanes 0–3 ← lo0_all, lanes 4–7 ← lo1_all)
1716- __m128i low_all = _mm_unpacklo_epi64 (lo0_all, lo1_all); // { lo0, lo1 }
1717-
1718- // constexpr batch_bool_constant<uint16_t, A, false, false, false, false, true, true, true, true> group_mask {};
1719- // auto low_all = select(group_mask, batch<uint16_t, A>(lo1_all), batch<uint16_t, A>(lo0_all));
1720-
1757+ const auto low_all = _mm_unpacklo_epi64 (lo0_all, lo1_all); // { lo0, lo1 }
17211758 // Likewise merge the two “high” broadcasts:
1722- __m128i high_all = _mm_unpacklo_epi64 (hi0_all, hi1_all); // { hi0, hi1 }
1723-
1724- // auto high_all = select(group_mask, batch<uint16_t, A>(hi1_all), batch<uint16_t, A>(hi0_all));
1725-
1759+ const auto high_all = _mm_unpacklo_epi64 (hi0_all, hi1_all); // { hi0, hi1 }
17261760 // 4) Finally, pick per-lane: if Vn<4 → take from low_all, else from high_all
1727- constexpr batch_bool_constant<uint16_t , A, (V0 < 4 ), (V1 < 4 ), (V2 < 4 ), (V3 < 4 ), (V4 < 4 ), (V5 < 4 ), (V6 < 4 ), (V7 < 4 )> lane_mask {};
1728- return select (lane_mask, // mask[i] ? low_all[i] : high_all[i]
1729- batch<uint16_t , A>(low_all),
1730- batch<uint16_t , A>(high_all));
1731- // return select(lane_mask, low_all, high_all);
1761+ constexpr batch_bool_constant<int16_t , A, (V0 < 4 ), (V1 < 4 ), (V2 < 4 ), (V3 < 4 ), (V4 < 4 ), (V5 < 4 ), (V6 < 4 ), (V7 < 4 )> lane_mask {};
1762+ return select (lane_mask, batch<int16_t , A>(low_all), batch<int16_t , A>(high_all)); // mask[i] ? low_all[i] : high_all[i]; operands carry arch A to match lane_mask
17321763 }
1733-
17341764 template <class A , uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
1735- XSIMD_INLINE batch<int16_t , A> swizzle (batch<int16_t , A> const & self, batch_constant<uint16_t , A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<sse2>) noexcept
1765+ XSIMD_INLINE batch<uint16_t , A> swizzle (batch<uint16_t , A> const & self, batch_constant<uint16_t , A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<sse2>) noexcept // unsigned 16-bit swizzle, implemented on top of the int16_t overload
17361766 {
1737- return bitwise_cast<int16_t >(swizzle (bitwise_cast<uint16_t >(self), mask, sse2 {}));
1767+ return bitwise_cast<uint16_t >(swizzle (bitwise_cast<int16_t >(self), mask, sse2 {})); // reinterpret as int16_t, swizzle with the same index mask, reinterpret the result back
17381768 }
17391769
17401770 // transpose
@@ -1854,7 +1884,6 @@ namespace xsimd
18541884 {
18551885 return _mm_unpacklo_pd (self, other);
18561886 }
1857-
18581887 }
18591888}
18601889
0 commit comments