@@ -253,8 +253,15 @@ namespace xsimd
253253 Mode,
254254 requires_arch<avx512f>) noexcept
255255 {
256- // Forward to AVX2 when confined to a 256-bit half (8 lanes)
257- XSIMD_IF_CONSTEXPR (mask.countl_zero () >= 8 )
256+ XSIMD_IF_CONSTEXPR (mask.none ())
257+ {
258+ return _mm512_setzero_si512 ();
259+ }
260+ else XSIMD_IF_CONSTEXPR (mask.all ())
261+ {
262+ return load<A>(mem, Mode {});
263+ }
264+ else XSIMD_IF_CONSTEXPR (mask.countl_zero () >= 8 ) // Forward to AVX2 when confined to a 256-bit half (8 lanes)
258265 {
259266 constexpr auto mlo = mask.template lower_half <avx2>();
260267 const auto lo = load_masked<avx2>(mem, mlo, convert<int32_t > {}, Mode {}, avx2 {});
@@ -300,8 +307,15 @@ namespace xsimd
300307 Mode,
301308 requires_arch<avx512f>) noexcept
302309 {
303- // Forward to AVX2 when confined to a 256-bit half (4 lanes)
304- XSIMD_IF_CONSTEXPR (mask.countl_zero () >= 4 )
310+ XSIMD_IF_CONSTEXPR (mask.none ())
311+ {
312+ return _mm512_setzero_si512 ();
313+ }
314+ else XSIMD_IF_CONSTEXPR (mask.all ())
315+ {
316+ return load<A>(mem, Mode {});
317+ }
318+ else XSIMD_IF_CONSTEXPR (mask.countl_zero () >= 4 ) // Forward to AVX2 when confined to a 256-bit half (4 lanes)
305319 {
306320 constexpr auto mlo = mask.template lower_half <avx2>();
307321 const auto lo = load_masked<avx2>(mem, mlo, convert<int64_t > {}, Mode {}, avx2 {});
@@ -1510,8 +1524,15 @@ namespace xsimd
15101524 Mode,
15111525 requires_arch<avx512f>) noexcept
15121526 {
1513- // Forward to AVX2 when confined to a 256-bit half
1514- XSIMD_IF_CONSTEXPR (mask.countl_zero () >= 8 )
1527+ XSIMD_IF_CONSTEXPR (mask.none ())
1528+ {
1529+ return _mm512_setzero_ps ();
1530+ }
1531+ else XSIMD_IF_CONSTEXPR (mask.all ())
1532+ {
1533+ return load<A>(mem, Mode {});
1534+ }
1535+ else XSIMD_IF_CONSTEXPR (mask.countl_zero () >= 8 ) // Forward to AVX2 when confined to a 256-bit half
15151536 {
15161537 constexpr auto mlo = mask.template lower_half <avx2>();
15171538 const auto lo = load_masked<avx2>(mem, mlo, convert<float > {}, Mode {}, avx2 {});
@@ -1544,8 +1565,15 @@ namespace xsimd
15441565 Mode,
15451566 requires_arch<avx512f>) noexcept
15461567 {
1547- // Forward to AVX2 when confined to a 256-bit half
1548- XSIMD_IF_CONSTEXPR (mask.countl_zero () >= 4 )
1568+ XSIMD_IF_CONSTEXPR (mask.none ())
1569+ {
1570+ return _mm512_setzero_pd ();
1571+ }
1572+ else XSIMD_IF_CONSTEXPR (mask.all ())
1573+ {
1574+ return load<A>(mem, Mode {});
1575+ }
1576+ else XSIMD_IF_CONSTEXPR (mask.countl_zero () >= 4 ) // Forward to AVX2 when confined to a 256-bit half
15491577 {
15501578 constexpr auto mlo = mask.template lower_half <avx2>();
15511579 const auto lo = load_masked<avx2>(mem, mlo, convert<double > {}, Mode {}, avx2 {});
@@ -1578,8 +1606,15 @@ namespace xsimd
15781606 Mode,
15791607 requires_arch<avx512f>) noexcept
15801608 {
1581- // Forward to AVX2 when confined to a 256-bit half
1582- XSIMD_IF_CONSTEXPR (mask.countl_zero () >= 8 )
1609+ XSIMD_IF_CONSTEXPR (mask.none ())
1610+ {
1611+ return ;
1612+ }
1613+ else XSIMD_IF_CONSTEXPR (mask.all ())
1614+ {
1615+ src.store (mem, Mode {});
1616+ }
1617+ else XSIMD_IF_CONSTEXPR (mask.countl_zero () >= 8 ) // Forward to AVX2 when confined to a 256-bit half
15831618 {
15841619 constexpr auto mlo = mask.template lower_half <avx2>();
15851620 const auto lo = detail::lower_half (src);
@@ -1612,8 +1647,15 @@ namespace xsimd
16121647 Mode,
16131648 requires_arch<avx512f>) noexcept
16141649 {
1615- // Forward to AVX2 when confined to a 256-bit half
1616- XSIMD_IF_CONSTEXPR (mask.countl_zero () >= 4 )
1650+ XSIMD_IF_CONSTEXPR (mask.none ())
1651+ {
1652+ return ;
1653+ }
1654+ else XSIMD_IF_CONSTEXPR (mask.all ())
1655+ {
1656+ src.store (mem, Mode {});
1657+ }
1658+ else XSIMD_IF_CONSTEXPR (mask.countl_zero () >= 4 ) // Forward to AVX2 when confined to a 256-bit half
16171659 {
16181660 constexpr auto mlo = mask.template lower_half <avx2>();
16191661 const auto lo = detail::lower_half (src);
@@ -1647,8 +1689,15 @@ namespace xsimd
16471689 Mode,
16481690 requires_arch<avx512f>) noexcept
16491691 {
1650- // Forward to AVX2 when confined to a 256-bit half (8 lanes)
1651- XSIMD_IF_CONSTEXPR (mask.countl_zero () >= 8 )
1692+ XSIMD_IF_CONSTEXPR (mask.none ())
1693+ {
1694+ return ;
1695+ }
1696+ else XSIMD_IF_CONSTEXPR (mask.all ())
1697+ {
1698+ src.store (mem, Mode {});
1699+ }
1700+ else XSIMD_IF_CONSTEXPR (mask.countl_zero () >= 8 ) // Forward to AVX2 when confined to a 256-bit half (8 lanes)
16521701 {
16531702 constexpr auto mlo = mask.template lower_half <avx2>();
16541703 const auto lo = detail::lower_half (src);
@@ -1694,8 +1743,15 @@ namespace xsimd
16941743 Mode,
16951744 requires_arch<avx512f>) noexcept
16961745 {
1697- // Forward to AVX2 when confined to a 256-bit half (4 lanes)
1698- XSIMD_IF_CONSTEXPR (mask.countl_zero () >= 4 )
1746+ XSIMD_IF_CONSTEXPR (mask.none ())
1747+ {
1748+ return ;
1749+ }
1750+ else XSIMD_IF_CONSTEXPR (mask.all ())
1751+ {
1752+ src.store (mem, Mode {});
1753+ }
1754+ else XSIMD_IF_CONSTEXPR (mask.countl_zero () >= 4 ) // Forward to AVX2 when confined to a 256-bit half (4 lanes)
16991755 {
17001756 constexpr auto mlo = mask.template lower_half <avx2>();
17011757 const auto lo = detail::lower_half (src);
0 commit comments