@@ -36,20 +36,35 @@ namespace xsimd
 
         namespace detail
         {
-            XSIMD_INLINE void split_avx(__m256i val, __m128i& low, __m128i& high) noexcept
+            XSIMD_INLINE __m128i lower_half(__m256i self) noexcept
             {
-                low = _mm256_castsi256_si128(val);
-                high = _mm256_extractf128_si256(val, 1);
+                return _mm256_castsi256_si128(self);
             }
-            XSIMD_INLINE void split_avx(__m256 val, __m128& low, __m128& high) noexcept
+            XSIMD_INLINE __m128 lower_half(__m256 self) noexcept
             {
-                low = _mm256_castps256_ps128(val);
-                high = _mm256_extractf128_ps(val, 1);
+                return _mm256_castps256_ps128(self);
             }
-            XSIMD_INLINE void split_avx(__m256d val, __m128d& low, __m128d& high) noexcept
+            XSIMD_INLINE __m128d lower_half(__m256d self) noexcept
             {
-                low = _mm256_castpd256_pd128(val);
-                high = _mm256_extractf128_pd(val, 1);
+                return _mm256_castpd256_pd128(self);
+            }
+            XSIMD_INLINE __m128i upper_half(__m256i self) noexcept
+            {
+                return _mm256_extractf128_si256(self, 1);
+            }
+            XSIMD_INLINE __m128 upper_half(__m256 self) noexcept
+            {
+                return _mm256_extractf128_ps(self, 1);
+            }
+            XSIMD_INLINE __m128d upper_half(__m256d self) noexcept
+            {
+                return _mm256_extractf128_pd(self, 1);
+            }
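+            // generic split of a 256-bit register into its two 128-bit halves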
+            template <class Full, class Half>
+            XSIMD_INLINE void split_avx(Full val, Half& low, Half& high) noexcept
+            {
+                low = lower_half(val);
+                high = upper_half(val);
             }
             XSIMD_INLINE __m256i merge_sse(__m128i low, __m128i high) noexcept
             {
@@ -63,6 +78,17 @@ namespace xsimd
             {
                 return _mm256_insertf128_pd(_mm256_castpd128_pd256(low), high, 1);
             }
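+            // batch-level views of the lower/upper 128-bit half of an AVX batch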
+            template <class T>
+            XSIMD_INLINE batch<T, sse4_2> lower_half(batch<T, avx> const& self) noexcept
+            {
+                return lower_half(self.data);
+            }
+            template <class T>
+            XSIMD_INLINE batch<T, sse4_2> upper_half(batch<T, avx> const& self) noexcept
+            {
+                return upper_half(self.data);
+            }
+
             template <class F>
             XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self) noexcept
             {
@@ -865,6 +891,146 @@ namespace xsimd
             return _mm256_loadu_pd(mem);
         }
 
+        // load_masked
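+        // The branch below is chosen entirely at compile time from the constant mask:
+        // with 8 float lanes, e.g. {1,1,1,0,0,0,0,0} has countl_zero() == 5 >= 4, so only
+        // the lower 128-bit half is active; {0,0,0,0,1,1,0,0} has countr_zero() == 4, so
+        // only the upper half is active and the SSE load is done from mem + 4.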
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE batch<float, A> load_masked(float const* mem, batch_bool_constant<float, A, Values...> mask, convert<float>, Mode, requires_arch<avx>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR (mask.none())
+            {
+                return _mm256_setzero_ps();
+            }
+            else XSIMD_IF_CONSTEXPR (mask.all())
+            {
+                return load<A>(mem, Mode {});
+            }
+            // confined to lower 128-bit half (4 lanes) → forward to SSE2
+            else XSIMD_IF_CONSTEXPR (mask.countl_zero() >= 4)
+            {
+                constexpr auto mlo = mask.template lower_half<sse4_2>();
+                const auto lo = load_masked(mem, mlo, convert<float> {}, Mode {}, sse4_2 {});
+                return batch<float, A>(detail::merge_sse(lo, batch<float, sse4_2>(0.f)));
+            }
+            // confined to upper 128-bit half (4 lanes) → forward to SSE2
+            else XSIMD_IF_CONSTEXPR (mask.countr_zero() >= 4)
+            {
+                constexpr auto mhi = mask.template upper_half<sse4_2>();
+                const auto hi = load_masked(mem + 4, mhi, convert<float> {}, Mode {}, sse4_2 {});
+                return batch<float, A>(detail::merge_sse(batch<float, sse4_2>(0.f), hi));
+            }
+            else
+            {
+                // crossing 128-bit boundary → use 256-bit masked load
+                return _mm256_maskload_ps(mem, mask.as_batch());
+            }
+        }
+
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE batch<double, A> load_masked(double const* mem,
+                                                  batch_bool_constant<double, A, Values...> mask,
+                                                  convert<double>,
+                                                  Mode,
+                                                  requires_arch<avx>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR (mask.none())
+            {
+                return _mm256_setzero_pd();
+            }
+            else XSIMD_IF_CONSTEXPR (mask.all())
+            {
+                return load<A>(mem, Mode {});
+            }
+            // confined to lower 128-bit half (2 lanes) → forward to SSE2
+            else XSIMD_IF_CONSTEXPR (mask.countl_zero() >= 2)
+            {
+                constexpr auto mlo = mask.template lower_half<sse4_2>();
+                const auto lo = load_masked(mem, mlo, convert<double> {}, Mode {}, sse4_2 {});
+                return batch<double, A>(detail::merge_sse(lo, batch<double, sse4_2>(0.0)));
+            }
+            // confined to upper 128-bit half (2 lanes) → forward to SSE2
+            else XSIMD_IF_CONSTEXPR (mask.countr_zero() >= 2)
+            {
+                constexpr auto mhi = mask.template upper_half<sse4_2>();
+                const auto hi = load_masked(mem + 2, mhi, convert<double> {}, Mode {}, sse4_2 {});
+                return batch<double, A>(detail::merge_sse(batch<double, sse4_2>(0.0), hi));
+            }
+            else
+            {
+                // crossing 128-bit boundary → use 256-bit masked load
+                return _mm256_maskload_pd(mem, mask.as_batch());
+            }
+        }
+
+        // store_masked
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(float* mem,
+                                       batch<float, A> const& src,
+                                       batch_bool_constant<float, A, Values...> mask,
+                                       Mode,
+                                       requires_arch<avx>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR (mask.none())
+            {
+                return;
+            }
+            else XSIMD_IF_CONSTEXPR (mask.all())
+            {
+                src.store(mem, Mode {});
+            }
+            // confined to lower 128-bit half (4 lanes) → forward to SSE2
+            else XSIMD_IF_CONSTEXPR (mask.countl_zero() >= 4)
+            {
+                constexpr auto mlo = mask.template lower_half<sse4_2>();
+                const batch<float, sse4_2> lo(_mm256_castps256_ps128(src));
+                store_masked<sse4_2>(mem, lo, mlo, Mode {}, sse4_2 {});
+            }
+            // confined to upper 128-bit half (4 lanes) → forward to SSE2
+            else XSIMD_IF_CONSTEXPR (mask.countr_zero() >= 4)
+            {
+                constexpr auto mhi = mask.template upper_half<sse4_2>();
+                const batch<float, sse4_2> hi(_mm256_extractf128_ps(src, 1));
+                store_masked<sse4_2>(mem + 4, hi, mhi, Mode {}, sse4_2 {});
+            }
+            else
+            {
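+                // _mm256_maskstore_ps only writes the lanes selected by the mask,
+                // so memory behind unselected lanes is never touched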
+                _mm256_maskstore_ps(mem, mask.as_batch(), src);
+            }
+        }
+
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(double* mem,
+                                       batch<double, A> const& src,
+                                       batch_bool_constant<double, A, Values...> mask,
+                                       Mode,
+                                       requires_arch<avx>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR (mask.none())
+            {
+                return;
+            }
+            else XSIMD_IF_CONSTEXPR (mask.all())
+            {
+                src.store(mem, Mode {});
+            }
+            // confined to lower 128-bit half (2 lanes) → forward to SSE2
+            else XSIMD_IF_CONSTEXPR (mask.countl_zero() >= 2)
+            {
+                constexpr auto mlo = mask.template lower_half<sse4_2>();
+                const batch<double, sse4_2> lo(_mm256_castpd256_pd128(src));
+                store_masked<sse4_2>(mem, lo, mlo, Mode {}, sse4_2 {});
+            }
+            // confined to upper 128-bit half (2 lanes) → forward to SSE2
+            else XSIMD_IF_CONSTEXPR (mask.countr_zero() >= 2)
+            {
+                constexpr auto mhi = mask.template upper_half<sse4_2>();
+                const batch<double, sse4_2> hi(_mm256_extractf128_pd(src, 1));
+                store_masked<sse4_2>(mem + 2, hi, mhi, Mode {}, sse4_2 {});
+            }
+            else
+            {
+                _mm256_maskstore_pd(mem, mask.as_batch(), src);
+            }
+        }
+
         // lt
         template <class A>
         XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
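
For orientation, here is a minimal caller-side sketch of the kernels added above. It is an illustration only and rests on assumptions not contained in this diff: that the kernels sit in the usual xsimd::kernel namespace next to the other AVX kernels, that xsimd::unaligned_mode is an acceptable Mode tag, and that the mask can be spelled directly as a batch_bool_constant; the sketch() wrapper and the buffer contents are made up.

#include <xsimd/xsimd.hpp>

void sketch()
{
    using A = xsimd::avx;
    // lanes {1,1,1,0,0,0,0,0}: only the lower 128-bit half is active,
    // so the AVX kernels above forward to the SSE path
    using mask_t = xsimd::batch_bool_constant<float, A, true, true, true,
                                              false, false, false, false, false>;

    float buf[8] = { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f };

    // call shape mirrors the kernel signatures added in this diff
    auto v = xsimd::kernel::load_masked(buf, mask_t {},
                                        xsimd::kernel::convert<float> {},
                                        xsimd::unaligned_mode {}, A {});
    xsimd::kernel::store_masked(buf, v, mask_t {}, xsimd::unaligned_mode {}, A {});
}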