@@ -1629,88 +1629,74 @@ namespace xsimd
                 }
                 return split;
             }
-            // Duplicate lanes separately
-            // 1) duplicate low and high lanes
-            __m256 low_dup = _mm256_permute2f128_ps(self, self, 0x00); // [low | low]
-            __m256 hi_dup = _mm256_permute2f128_ps(self, self, 0x11);  // [high| high]

-            // 2) build lane-local index vector (each element = source_index & 3)
-            constexpr batch_constant<uint32_t, A, (V0 % 4), (V1 % 4), (V2 % 4), (V3 % 4), (V4 % 4), (V5 % 4), (V6 % 4), (V7 % 4)> half_mask;
+            // Fallback to general algorithm. This is the same as the dynamic version with the exception
+            // that possible operations are done at compile time.

-            __m256 r0 = _mm256_permutevar_ps(low_dup, half_mask.as_batch()); // pick from low lane
-            __m256 r1 = _mm256_permutevar_ps(hi_dup, half_mask.as_batch());  // pick from high lane
+            // swap lanes
+            __m256 swapped = _mm256_permute2f128_ps(self, self, 0x01); // [high | low]
+
+            // normalize mask taking modulo 4
+            constexpr auto half_mask = mask % make_batch_constant<uint32_t, 4, A>();
+
+            // permute within each lane
+            __m256 r0 = _mm256_permutevar_ps(self, half_mask.as_batch());
+            __m256 r1 = _mm256_permutevar_ps(swapped, half_mask.as_batch());

-            constexpr batch_bool_constant<uint32_t, A, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> lane_mask {};
+            // select lane by the mask index divided by 4
+            constexpr auto lane = batch_constant<uint32_t, A, 0, 0, 0, 0, 1, 1, 1, 1> {};
+            constexpr int lane_mask = ((mask / make_batch_constant<uint32_t, 4, A>()) != lane).mask();

-            return _mm256_blend_ps(r0, r1, lane_mask.mask());
+            return _mm256_blend_ps(r0, r1, lane_mask);
         }

         template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
         XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> mask, requires_arch<avx>) noexcept
         {
             // cannot use detail::mod_shuffle as the mod and shift are different in this case
-            constexpr auto imm = ((V0 & 1) << 0) | ((V1 & 1) << 1) | ((V2 & 1) << 2) | ((V3 & 1) << 3);
+            constexpr auto imm = ((V0 % 2) << 0) | ((V1 % 2) << 1) | ((V2 % 2) << 2) | ((V3 % 2) << 3);
             XSIMD_IF_CONSTEXPR (detail::is_identity(mask)) { return self; }
             XSIMD_IF_CONSTEXPR (!detail::is_cross_lane(mask))
             {
                 return _mm256_permute_pd(self, imm);
             }
-            // duplicate low and high part of input
-            __m256d lo = _mm256_permute2f128_pd(self, self, 0x00);
-            __m256d hi = _mm256_permute2f128_pd(self, self, 0x11);
+
+            // Fallback to general algorithm. This is the same as the dynamic version with the exception
+            // that possible operations are done at compile time.
+
+            // swap lanes
+            __m256d swapped = _mm256_permute2f128_pd(self, self, 0x01); // [high | low]

             // permute within each lane
-            __m256d r0 = _mm256_permute_pd(lo, imm);
-            __m256d r1 = _mm256_permute_pd(hi, imm);
+            __m256d r0 = _mm256_permute_pd(self, imm);
+            __m256d r1 = _mm256_permute_pd(swapped, imm);

-            // mask to choose the right lane
-            constexpr batch_bool_constant<uint64_t, A, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask;
+            // select lane by the mask index divided by 2
+            constexpr auto lane = batch_constant<uint64_t, A, 0, 0, 1, 1> {};
+            constexpr int lane_mask = ((mask / make_batch_constant<uint64_t, 2, A>()) != lane).mask();

             // blend the two permutes
-            return _mm256_blend_pd(r0, r1, blend_mask.mask());
-        }
-        template <class A,
-                  typename T,
-                  uint32_t V0,
-                  uint32_t V1,
-                  uint32_t V2,
-                  uint32_t V3,
-                  uint32_t V4,
-                  uint32_t V5,
-                  uint32_t V6,
-                  uint32_t V7,
-                  detail::enable_sized_integral_t<T, 4> = 0>
-        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self,
-                                         batch_constant<uint32_t, A,
-                                                        V0,
-                                                        V1,
-                                                        V2,
-                                                        V3,
-                                                        V4,
-                                                        V5,
-                                                        V6,
-                                                        V7> const& mask,
-                                         requires_arch<avx>) noexcept
+            return _mm256_blend_pd(r0, r1, lane_mask);
+        }
+
+        template <
+            class A, typename T,
+            uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7,
+            detail::enable_sized_integral_t<T, 4> = 0>
+        XSIMD_INLINE batch<T, A> swizzle(
+            batch<T, A> const& self,
+            batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> const& mask,
+            requires_arch<avx>) noexcept
         {
-            return bitwise_cast<T>(
-                swizzle(bitwise_cast<float>(self), mask));
+            return bitwise_cast<T>(swizzle(bitwise_cast<float>(self), mask));
         }

-        template <class A,
-                  typename T,
-                  uint64_t V0,
-                  uint64_t V1,
-                  uint64_t V2,
-                  uint64_t V3,
-                  detail::enable_sized_integral_t<T, 8> = 0>
-        XSIMD_INLINE batch<T, A>
-        swizzle(batch<T, A> const& self,
-                batch_constant<uint64_t, A, V0, V1, V2, V3> const& mask,
-                requires_arch<avx>) noexcept
+        template <class A, typename T, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3, detail::enable_sized_integral_t<T, 8> = 0>
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> const& mask, requires_arch<avx>) noexcept
         {
-            return bitwise_cast<T>(
-                swizzle(bitwise_cast<double>(self), mask));
+            return bitwise_cast<T>(swizzle(bitwise_cast<double>(self), mask));
         }
+
         // transpose
         template <class A>
         XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<avx>) noexcept
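
For reference, the lane-swap / in-lane-permute / blend decomposition that this patch moves both swizzle overloads to can be sketched directly against raw AVX intrinsics. The sketch below is illustrative only and not part of xsimd (the name swizzle8 and its template parameters are hypothetical); it mirrors the compile-time path above: one _mm256_permute2f128_ps to exchange the 128-bit lanes, two _mm256_permutevar_ps calls for the lane-local picks, and an _mm256_blend_ps whose immediate marks the elements whose source index lies in the other lane.

    // Illustrative sketch (not xsimd code): compile-time cross-lane swizzle
    // of 8 floats, dst[i] = src[Ii].
    #include <immintrin.h>

    template <unsigned I0, unsigned I1, unsigned I2, unsigned I3,
              unsigned I4, unsigned I5, unsigned I6, unsigned I7>
    __m256 swizzle8(__m256 src)
    {
        // 1) swap the two 128-bit lanes: swapped = [high | low]
        const __m256 swapped = _mm256_permute2f128_ps(src, src, 0x01);

        // 2) lane-local indices (source index modulo 4), one per output element
        const __m256i half = _mm256_setr_epi32(I0 % 4, I1 % 4, I2 % 4, I3 % 4,
                                               I4 % 4, I5 % 4, I6 % 4, I7 % 4);

        // 3) in-lane permute of both the original and the lane-swapped input
        const __m256 r0 = _mm256_permutevar_ps(src, half);
        const __m256 r1 = _mm256_permutevar_ps(swapped, half);

        // 4) take r1 wherever the source lane (index / 4) differs from the
        //    destination lane (element position / 4)
        constexpr int lane_mask = ((I0 / 4 != 0) << 0) | ((I1 / 4 != 0) << 1)
                                | ((I2 / 4 != 0) << 2) | ((I3 / 4 != 0) << 3)
                                | ((I4 / 4 != 1) << 4) | ((I5 / 4 != 1) << 5)
                                | ((I6 / 4 != 1) << 6) | ((I7 / 4 != 1) << 7);
        return _mm256_blend_ps(r0, r1, lane_mask);
    }

    // Example: reverse all 8 elements (a fully cross-lane permutation).
    // __m256 reversed = swizzle8<7, 6, 5, 4, 3, 2, 1, 0>(v);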