Skip to content

Commit f20dcec

Browse files
committed
Use utility function in SSE2 swizzle
1 parent 239fbbd commit f20dcec

File tree

1 file changed

+4 -5 lines changed

1 file changed

+4 -5 lines changed

include/xsimd/arch/xsimd_sse2.hpp

Lines changed: 4 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1952,8 +1952,7 @@ namespace xsimd
19521952
return _mm_sub_pd(self, other);
19531953
}
19541954

1955-
// swizzle
1956-
1955+
// swizzle (constant mask)
19571956
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
19581957
XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
19591958
{
@@ -2024,7 +2023,7 @@ namespace xsimd
20242023
return hi_all;
20252024
}
20262025
// Only pick elements from the low lane
2027-
XSIMD_IF_CONSTEXPR((V0 < 4) && (V1 < 4) && (V2 < 4) && (V3 < 4) && (V4 < 4) && (V5 < 4) && (V6 < 4) && (V7 < 4))
2026+
XSIMD_IF_CONSTEXPR(detail::is_only_from_lo(mask))
20282027
{
20292028
// permute within each sub lane
20302029
constexpr auto mask_lo = detail::mod_shuffle(V0, V1, V2, V3);
@@ -2036,7 +2035,7 @@ namespace xsimd
20362035
return _mm_unpacklo_epi64(lol, loh);
20372036
}
20382037
// Only pick elements from the high lane
2039-
XSIMD_IF_CONSTEXPR((V0 >= 4) && (V1 >= 4) && (V2 >= 4) && (V3 >= 4) && (V4 >= 4) && (V5 >= 4) && (V6 >= 4) && (V7 >= 4))
2038+
XSIMD_IF_CONSTEXPR(detail::is_only_from_hi(mask))
20402039
{
20412040
// permute within each sub lane
20422041
constexpr auto mask_lo = detail::mod_shuffle(V0, V1, V2, V3);
@@ -2063,7 +2062,7 @@ namespace xsimd
20632062
__m128i hi = _mm_unpackhi_epi64(hil, hih);
20642063

20652064
// mask to choose the right lane
2066-
batch_bool_constant<uint16_t, A, (V0 < 4), (V1 < 4), (V2 < 4), (V3 < 4), (V4 < 4), (V5 < 4), (V6 < 4), (V7 < 4)> blend_mask;
2065+
constexpr auto blend_mask = mask < make_batch_constant<uint16_t, 4, A>();
20672066

20682067
// blend the two permutes
20692068
return select(blend_mask, batch<uint16_t, A>(lo), batch<uint16_t, A>(hi));

0 commit comments

Comments
 (0)