Skip to content

Commit 516e302

Browse files
committed
WIP
1 parent abb0b2c commit 516e302

File tree

2 files changed

+73
-17
lines changed

2 files changed

+73
-17
lines changed

include/xsimd/arch/xsimd_avx512f.hpp

Lines changed: 72 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -253,8 +253,15 @@ namespace xsimd
253253
Mode,
254254
requires_arch<avx512f>) noexcept
255255
{
256-
// Forward to AVX2 when confined to a 256-bit half (8 lanes)
257-
XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 8)
256+
XSIMD_IF_CONSTEXPR(mask.none())
257+
{
258+
return _mm512_setzero_si512();
259+
}
260+
else XSIMD_IF_CONSTEXPR(mask.all())
261+
{
262+
return load<A>(mem, Mode {});
263+
}
264+
else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 8) // Forward to AVX2 when confined to a 256-bit half (8 lanes)
258265
{
259266
constexpr auto mlo = mask.template lower_half<avx2>();
260267
const auto lo = load_masked<avx2>(mem, mlo, convert<int32_t> {}, Mode {}, avx2 {});
@@ -300,8 +307,15 @@ namespace xsimd
300307
Mode,
301308
requires_arch<avx512f>) noexcept
302309
{
303-
// Forward to AVX2 when confined to a 256-bit half (4 lanes)
304-
XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 4)
310+
XSIMD_IF_CONSTEXPR(mask.none())
311+
{
312+
return _mm512_setzero_si512();
313+
}
314+
else XSIMD_IF_CONSTEXPR(mask.all())
315+
{
316+
return load<A>(mem, Mode {});
317+
}
318+
else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 4) // Forward to AVX2 when confined to a 256-bit half (4 lanes)
305319
{
306320
constexpr auto mlo = mask.template lower_half<avx2>();
307321
const auto lo = load_masked<avx2>(mem, mlo, convert<int64_t> {}, Mode {}, avx2 {});
@@ -1510,8 +1524,15 @@ namespace xsimd
15101524
Mode,
15111525
requires_arch<avx512f>) noexcept
15121526
{
1513-
// Forward to AVX2 when confined to a 256-bit half
1514-
XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 8)
1527+
XSIMD_IF_CONSTEXPR(mask.none())
1528+
{
1529+
return _mm512_setzero_ps();
1530+
}
1531+
else XSIMD_IF_CONSTEXPR(mask.all())
1532+
{
1533+
return load<A>(mem, Mode {});
1534+
}
1535+
else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 8) // Forward to AVX2 when confined to a 256-bit half
15151536
{
15161537
constexpr auto mlo = mask.template lower_half<avx2>();
15171538
const auto lo = load_masked<avx2>(mem, mlo, convert<float> {}, Mode {}, avx2 {});
@@ -1544,8 +1565,15 @@ namespace xsimd
15441565
Mode,
15451566
requires_arch<avx512f>) noexcept
15461567
{
1547-
// Forward to AVX2 when confined to a 256-bit half
1548-
XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 4)
1568+
XSIMD_IF_CONSTEXPR(mask.none())
1569+
{
1570+
return _mm512_setzero_pd();
1571+
}
1572+
else XSIMD_IF_CONSTEXPR(mask.all())
1573+
{
1574+
return load<A>(mem, Mode {});
1575+
}
1576+
else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 4) // Forward to AVX2 when confined to a 256-bit half
15491577
{
15501578
constexpr auto mlo = mask.template lower_half<avx2>();
15511579
const auto lo = load_masked<avx2>(mem, mlo, convert<double> {}, Mode {}, avx2 {});
@@ -1578,8 +1606,15 @@ namespace xsimd
15781606
Mode,
15791607
requires_arch<avx512f>) noexcept
15801608
{
1581-
// Forward to AVX2 when confined to a 256-bit half
1582-
XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 8)
1609+
XSIMD_IF_CONSTEXPR(mask.none())
1610+
{
1611+
return;
1612+
}
1613+
else XSIMD_IF_CONSTEXPR(mask.all())
1614+
{
1615+
src.store(mem, Mode {});
1616+
}
1617+
else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 8) // Forward to AVX2 when confined to a 256-bit half
15831618
{
15841619
constexpr auto mlo = mask.template lower_half<avx2>();
15851620
const auto lo = detail::lower_half(src);
@@ -1612,8 +1647,15 @@ namespace xsimd
16121647
Mode,
16131648
requires_arch<avx512f>) noexcept
16141649
{
1615-
// Forward to AVX2 when confined to a 256-bit half
1616-
XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 4)
1650+
XSIMD_IF_CONSTEXPR(mask.none())
1651+
{
1652+
return;
1653+
}
1654+
else XSIMD_IF_CONSTEXPR(mask.all())
1655+
{
1656+
src.store(mem, Mode {});
1657+
}
1658+
else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 4) // Forward to AVX2 when confined to a 256-bit half
16171659
{
16181660
constexpr auto mlo = mask.template lower_half<avx2>();
16191661
const auto lo = detail::lower_half(src);
@@ -1647,8 +1689,15 @@ namespace xsimd
16471689
Mode,
16481690
requires_arch<avx512f>) noexcept
16491691
{
1650-
// Forward to AVX2 when confined to a 256-bit half (8 lanes)
1651-
XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 8)
1692+
XSIMD_IF_CONSTEXPR(mask.none())
1693+
{
1694+
return;
1695+
}
1696+
else XSIMD_IF_CONSTEXPR(mask.all())
1697+
{
1698+
src.store(mem, Mode {});
1699+
}
1700+
else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 8) // Forward to AVX2 when confined to a 256-bit half (8 lanes)
16521701
{
16531702
constexpr auto mlo = mask.template lower_half<avx2>();
16541703
const auto lo = detail::lower_half(src);
@@ -1694,8 +1743,15 @@ namespace xsimd
16941743
Mode,
16951744
requires_arch<avx512f>) noexcept
16961745
{
1697-
// Forward to AVX2 when confined to a 256-bit half (4 lanes)
1698-
XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 4)
1746+
XSIMD_IF_CONSTEXPR(mask.none())
1747+
{
1748+
return;
1749+
}
1750+
else XSIMD_IF_CONSTEXPR(mask.all())
1751+
{
1752+
src.store(mem, Mode {});
1753+
}
1754+
else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 4) // Forward to AVX2 when confined to a 256-bit half (4 lanes)
16991755
{
17001756
constexpr auto mlo = mask.template lower_half<avx2>();
17011757
const auto lo = detail::lower_half(src);

test/test_load_store.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -353,7 +353,7 @@ struct load_store_test
353353
template <class Generator, class V>
354354
void run_load_mask_pattern(const V& v, const std::string& name, batch_type& b, const array_type& expected, const std::string& label)
355355
{
356-
auto mask = xsimd::make_batch_bool_constant<value_type, Generator, typename batch_type::arch_type>();
356+
constexpr auto mask = xsimd::make_batch_bool_constant<value_type, Generator, typename batch_type::arch_type>();
357357
array_type expected_masked{0};
358358

359359
for (std::size_t i = 0; i < size; ++i)

0 commit comments

Comments
 (0)