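WIP: add compile-time fast paths for all-false (none) and all-true (all) masks to the AVX-512F masked load/store kernels; make the test mask constexpr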

DiamonDinoia · DiamonDinoia · commit 516e302aa7e0 · 2025-10-16T13:31:07.000-04:00
diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp
@@ -253,8 +253,15 @@ namespace xsimd
                                                    Mode,
                                                    requires_arch<avx512f>) noexcept
         {
-            // Forward to AVX2 when confined to a 256-bit half (8 lanes)
-            XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 8)
+            XSIMD_IF_CONSTEXPR(mask.none())
+            {
+                return _mm512_setzero_si512();
+            }
+            else XSIMD_IF_CONSTEXPR(mask.all())
+            {
+                return load<A>(mem, Mode {});
+            }
+            else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 8) // Forward to AVX2 when confined to a 256-bit half (8 lanes)
             {
                 constexpr auto mlo = mask.template lower_half<avx2>();
                 const auto lo = load_masked<avx2>(mem, mlo, convert<int32_t> {}, Mode {}, avx2 {});
@@ -300,8 +307,15 @@ namespace xsimd
                                                    Mode,
                                                    requires_arch<avx512f>) noexcept
         {
-            // Forward to AVX2 when confined to a 256-bit half (4 lanes)
-            XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 4)
+            XSIMD_IF_CONSTEXPR(mask.none())
+            {
+                return _mm512_setzero_si512();
+            }
+            else XSIMD_IF_CONSTEXPR(mask.all())
+            {
+                return load<A>(mem, Mode {});
+            }
+            else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 4) // Forward to AVX2 when confined to a 256-bit half (4 lanes)
             {
                 constexpr auto mlo = mask.template lower_half<avx2>();
                 const auto lo = load_masked<avx2>(mem, mlo, convert<int64_t> {}, Mode {}, avx2 {});
@@ -1510,8 +1524,15 @@ namespace xsimd
                                                  Mode,
                                                  requires_arch<avx512f>) noexcept
         {
-            // Forward to AVX2 when confined to a 256-bit half
-            XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 8)
+            XSIMD_IF_CONSTEXPR(mask.none())
+            {
+                return _mm512_setzero_ps();
+            }
+            else XSIMD_IF_CONSTEXPR(mask.all())
+            {
+                return load<A>(mem, Mode {});
+            }
+            else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 8) // Forward to AVX2 when confined to a 256-bit half
             {
                 constexpr auto mlo = mask.template lower_half<avx2>();
                 const auto lo = load_masked<avx2>(mem, mlo, convert<float> {}, Mode {}, avx2 {});
@@ -1544,8 +1565,15 @@ namespace xsimd
                                                   Mode,
                                                   requires_arch<avx512f>) noexcept
         {
-            // Forward to AVX2 when confined to a 256-bit half
-            XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 4)
+            XSIMD_IF_CONSTEXPR(mask.none())
+            {
+                return _mm512_setzero_pd();
+            }
+            else XSIMD_IF_CONSTEXPR(mask.all())
+            {
+                return load<A>(mem, Mode {});
+            }
+            else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 4) // Forward to AVX2 when confined to a 256-bit half
             {
                 constexpr auto mlo = mask.template lower_half<avx2>();
                 const auto lo = load_masked<avx2>(mem, mlo, convert<double> {}, Mode {}, avx2 {});
@@ -1578,8 +1606,15 @@ namespace xsimd
                                        Mode,
                                        requires_arch<avx512f>) noexcept
         {
-            // Forward to AVX2 when confined to a 256-bit half
-            XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 8)
+            XSIMD_IF_CONSTEXPR(mask.none())
+            {
+                return;
+            }
+            else XSIMD_IF_CONSTEXPR(mask.all())
+            {
+                src.store(mem, Mode {});
+            }
+            else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 8) // Forward to AVX2 when confined to a 256-bit half
             {
                 constexpr auto mlo = mask.template lower_half<avx2>();
                 const auto lo = detail::lower_half(src);
@@ -1612,8 +1647,15 @@ namespace xsimd
                                        Mode,
                                        requires_arch<avx512f>) noexcept
         {
-            // Forward to AVX2 when confined to a 256-bit half
-            XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 4)
+            XSIMD_IF_CONSTEXPR(mask.none())
+            {
+                return;
+            }
+            else XSIMD_IF_CONSTEXPR(mask.all())
+            {
+                src.store(mem, Mode {});
+            }
+            else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 4) // Forward to AVX2 when confined to a 256-bit half
             {
                 constexpr auto mlo = mask.template lower_half<avx2>();
                 const auto lo = detail::lower_half(src);
@@ -1647,8 +1689,15 @@ namespace xsimd
                                        Mode,
                                        requires_arch<avx512f>) noexcept
         {
-            // Forward to AVX2 when confined to a 256-bit half (8 lanes)
-            XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 8)
+            XSIMD_IF_CONSTEXPR(mask.none())
+            {
+                return;
+            }
+            else XSIMD_IF_CONSTEXPR(mask.all())
+            {
+                src.store(mem, Mode {});
+            }
+            else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 8) // Forward to AVX2 when confined to a 256-bit half (8 lanes)
             {
                 constexpr auto mlo = mask.template lower_half<avx2>();
                 const auto lo = detail::lower_half(src);
@@ -1694,8 +1743,15 @@ namespace xsimd
                                        Mode,
                                        requires_arch<avx512f>) noexcept
         {
-            // Forward to AVX2 when confined to a 256-bit half (4 lanes)
-            XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 4)
+            XSIMD_IF_CONSTEXPR(mask.none())
+            {
+                return;
+            }
+            else XSIMD_IF_CONSTEXPR(mask.all())
+            {
+                src.store(mem, Mode {});
+            }
+            else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 4) // Forward to AVX2 when confined to a 256-bit half (4 lanes)
             {
                 constexpr auto mlo = mask.template lower_half<avx2>();
                 const auto lo = detail::lower_half(src);
diff --git a/test/test_load_store.cpp b/test/test_load_store.cpp
@@ -353,7 +353,7 @@ struct load_store_test
     template <class Generator, class V>
     void run_load_mask_pattern(const V& v, const std::string& name, batch_type& b, const array_type& expected, const std::string& label)
     {
-        auto mask = xsimd::make_batch_bool_constant<value_type, Generator, typename batch_type::arch_type>();
+        constexpr auto mask = xsimd::make_batch_bool_constant<value_type, Generator, typename batch_type::arch_type>();
         array_type expected_masked{0};
 
         for (std::size_t i = 0; i < size; ++i)

Original file line number	Diff line number	Diff line change
`@@ -353,7 +353,7 @@ struct load_store_test`
`353`	`353`	`template <class Generator, class V>`
`354`	`354`	`void run_load_mask_pattern(const V& v, const std::string& name, batch_type& b, const array_type& expected, const std::string& label)`
`355`	`355`	`{`
`356`		`- auto mask = xsimd::make_batch_bool_constant<value_type, Generator, typename batch_type::arch_type>();`
	`356`	`+ constexpr auto mask = xsimd::make_batch_bool_constant<value_type, Generator, typename batch_type::arch_type>();`
`357`	`357`	`array_type expected_masked{0};`
`358`	`358`
`359`	`359`	`for (std::size_t i = 0; i < size; ++i)`