@@ -33,6 +33,28 @@ namespace xsimd
3333 {
3434 using namespace types ;
3535
36+ namespace detail
37+ {
38+ constexpr uint32_t shuffle (uint32_t w, uint32_t x, uint32_t y, uint32_t z)
39+ {
40+ return (z << 6 ) | (y << 4 ) | (x << 2 ) | w;
41+ }
42+ constexpr uint32_t shuffle (uint32_t x, uint32_t y)
43+ {
44+ return (y << 1 ) | x;
45+ }
46+
47+ constexpr uint32_t mod_shuffle (uint32_t w, uint32_t x, uint32_t y, uint32_t z)
48+ {
49+ return shuffle (w % 4 , x % 4 , y % 4 , z % 4 );
50+ }
51+
52+ constexpr uint32_t mod_shuffle (uint32_t w, uint32_t x)
53+ {
54+ return shuffle (w % 2 , x % 2 );
55+ }
56+ }
57+
3658 // fwd
3759 template <class A , class T , size_t I>
3860 XSIMD_INLINE batch<T, A> insert (batch<T, A> const & self, T val, index<I>, requires_arch<common>) noexcept ;
@@ -1282,13 +1304,16 @@ namespace xsimd
12821304 template <class A , class T , class _ = typename std::enable_if<(sizeof (T) <= 2 ), void >::type>
12831305 XSIMD_INLINE T reduce_max (batch<T, A> const & self, requires_arch<sse2>) noexcept
12841306 {
1285- batch<T, A> step0 = _mm_shuffle_epi32 (self, detail::shuffle<2 , 3 , 0 , 0 >());
1307+ constexpr auto mask0 = detail::shuffle (2 , 3 , 0 , 0 );
1308+ batch<T, A> step0 = _mm_shuffle_epi32 (self, mask0);
12861309 batch<T, A> acc0 = max (self, step0);
12871310
1288- batch<T, A> step1 = _mm_shuffle_epi32 (acc0, detail::shuffle<1 , 0 , 0 , 0 >());
1311+ constexpr auto mask1 = detail::shuffle (1 , 0 , 0 , 0 );
1312+ batch<T, A> step1 = _mm_shuffle_epi32 (acc0, mask1);
12891313 batch<T, A> acc1 = max (acc0, step1);
12901314
1291- batch<T, A> step2 = _mm_shufflelo_epi16 (acc1, detail::shuffle<1 , 0 , 0 , 0 >());
1315+ constexpr auto mask2 = detail::shuffle (1 , 0 , 0 , 0 );
1316+ batch<T, A> step2 = _mm_shufflelo_epi16 (acc1, mask2);
12921317 batch<T, A> acc2 = max (acc1, step2);
12931318 if (sizeof (T) == 2 )
12941319 return first (acc2, A {});
@@ -1301,13 +1326,16 @@ namespace xsimd
13011326 template <class A , class T , class _ = typename std::enable_if<(sizeof (T) <= 2 ), void >::type>
13021327 XSIMD_INLINE T reduce_min (batch<T, A> const & self, requires_arch<sse2>) noexcept
13031328 {
1304- batch<T, A> step0 = _mm_shuffle_epi32 (self, detail::shuffle<2 , 3 , 0 , 0 >());
1329+ constexpr auto mask0 = detail::shuffle (2 , 3 , 0 , 0 );
1330+ batch<T, A> step0 = _mm_shuffle_epi32 (self, mask0);
13051331 batch<T, A> acc0 = min (self, step0);
13061332
1307- batch<T, A> step1 = _mm_shuffle_epi32 (acc0, detail::shuffle<1 , 0 , 0 , 0 >());
1333+ constexpr auto mask1 = detail::shuffle (1 , 0 , 0 , 0 );
1334+ batch<T, A> step1 = _mm_shuffle_epi32 (acc0, mask1);
13081335 batch<T, A> acc1 = min (acc0, step1);
13091336
1310- batch<T, A> step2 = _mm_shufflelo_epi16 (acc1, detail::shuffle<1 , 0 , 0 , 0 >());
1337+ constexpr auto mask2 = detail::shuffle (1 , 0 , 0 , 0 );
1338+ batch<T, A> step2 = _mm_shufflelo_epi16 (acc1, mask2);
13111339 batch<T, A> acc2 = min (acc1, step2);
13121340 if (sizeof (T) == 2 )
13131341 return first (acc2, A {});
@@ -1355,7 +1383,7 @@ namespace xsimd
13551383 template <class A , class ITy , ITy I0, ITy I1, ITy I2, ITy I3>
13561384 XSIMD_INLINE batch<float , A> shuffle (batch<float , A> const & x, batch<float , A> const & y, batch_constant<ITy, A, I0, I1, I2, I3> mask, requires_arch<sse2>) noexcept
13571385 {
1358- constexpr uint32_t smask = detail::mod_shuffle< I0, I1, I2, I3>( );
1386+ constexpr uint32_t smask = detail::mod_shuffle ( I0, I1, I2, I3);
13591387 // shuffle within lane
13601388 if (I0 < 4 && I1 < 4 && I2 >= 4 && I3 >= 4 )
13611389 return _mm_shuffle_ps (x, y, smask);
@@ -1369,7 +1397,7 @@ namespace xsimd
13691397 template <class A , class ITy , ITy I0, ITy I1>
13701398 XSIMD_INLINE batch<double , A> shuffle (batch<double , A> const & x, batch<double , A> const & y, batch_constant<ITy, A, I0, I1> mask, requires_arch<sse2>) noexcept
13711399 {
1372- constexpr uint32_t smask = detail::mod_shuffle< I0, I1>( );
1400+ constexpr uint32_t smask = detail::mod_shuffle ( I0, I1);
13731401 // shuffle within lane
13741402 if (I0 < 2 && I1 >= 2 )
13751403 return _mm_shuffle_pd (x, y, smask);
@@ -1617,26 +1645,24 @@ namespace xsimd
16171645 return _mm_sub_pd (self, other);
16181646 }
16191647
1620- // swizzle
1621-
16221648 template <class A , uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
16231649 XSIMD_INLINE batch<float , A> swizzle (batch<float , A> const & self, batch_constant<uint32_t , A, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
16241650 {
1625- constexpr uint32_t index = detail::shuffle< V0, V1, V2, V3>( );
1651+ constexpr uint32_t index = detail::shuffle ( V0, V1, V2, V3);
16261652 return _mm_shuffle_ps (self, self, index);
16271653 }
16281654
16291655 template <class A , uint64_t V0, uint64_t V1>
16301656 XSIMD_INLINE batch<double , A> swizzle (batch<double , A> const & self, batch_constant<uint64_t , A, V0, V1>, requires_arch<sse2>) noexcept
16311657 {
1632- constexpr uint32_t index = detail::shuffle< V0, V1>( );
1658+ constexpr uint32_t index = detail::shuffle ( V0, V1);
16331659 return _mm_shuffle_pd (self, self, index);
16341660 }
16351661
16361662 template <class A , uint64_t V0, uint64_t V1>
16371663 XSIMD_INLINE batch<uint64_t , A> swizzle (batch<uint64_t , A> const & self, batch_constant<uint64_t , A, V0, V1>, requires_arch<sse2>) noexcept
16381664 {
1639- constexpr uint32_t index = detail::shuffle< 2 * V0, 2 * V0 + 1 , 2 * V1, 2 * V1 + 1 >( );
1665+ constexpr uint32_t index = detail::shuffle ( 2 * V0, 2 * V0 + 1 , 2 * V1, 2 * V1 + 1 );
16401666 return _mm_shuffle_epi32 (self, index);
16411667 }
16421668
@@ -1649,7 +1675,7 @@ namespace xsimd
16491675 template <class A , uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
16501676 XSIMD_INLINE batch<uint32_t , A> swizzle (batch<uint32_t , A> const & self, batch_constant<uint32_t , A, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
16511677 {
1652- constexpr uint32_t index = detail::shuffle< V0, V1, V2, V3>( );
1678+ constexpr uint32_t index = detail::shuffle ( V0, V1, V2, V3);
16531679 return _mm_shuffle_epi32 (self, index);
16541680 }
16551681
@@ -1663,8 +1689,8 @@ namespace xsimd
16631689 XSIMD_INLINE batch<int16_t , A>
16641690 swizzle (batch<int16_t , A> const & self, batch_constant<uint16_t , A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<sse2>) noexcept
16651691 {
1666- constexpr int imm_lo = detail::mod_shuffle< V0, V1, V2, V3>( );
1667- constexpr int imm_hi = detail::mod_shuffle< V4, V5, V6, V7>( );
1692+ constexpr int imm_lo = detail::mod_shuffle ( V0, V1, V2, V3);
1693+ constexpr int imm_hi = detail::mod_shuffle ( V4, V5, V6, V7);
16681694 // 0) identity?
16691695 constexpr bool identity = detail::is_identity (mask);
16701696 XSIMD_IF_CONSTEXPR (identity)
@@ -1735,6 +1761,7 @@ namespace xsimd
17351761 {
17361762 return bitwise_cast<uint16_t >(swizzle (bitwise_cast<int16_t >(self), mask, sse2 {}));
17371763 }
1764+
17381765 // transpose
17391766 template <class A >
17401767 XSIMD_INLINE void transpose (batch<float , A>* matrix_begin, batch<float , A>* matrix_end, requires_arch<sse2>) noexcept
@@ -1852,7 +1879,8 @@ namespace xsimd
18521879 {
18531880 return _mm_unpacklo_pd (self, other);
18541881 }
1882+
18551883 }
18561884}
18571885
1858- #endif
1886+ #endif
0 commit comments