4 changes: 2 additions & 2 deletions docs/source/api/data_transfer.rst
@@ -10,7 +10,7 @@ Data transfer
From memory:

+---------------------------------------+----------------------------------------------------+
- | :cpp:func:`load` | load values from memory |
+ | :cpp:func:`load` | load values from memory (optionally masked) |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`load_aligned` | load values from aligned memory |
+---------------------------------------+----------------------------------------------------+
@@ -30,7 +30,7 @@ From a scalar:
To memory:

+---------------------------------------+----------------------------------------------------+
- | :cpp:func:`store` | store values to memory |
+ | :cpp:func:`store` | store values to memory (optionally masked) |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`store_aligned` | store values to aligned memory |
+---------------------------------------+----------------------------------------------------+
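The masked variants above take a compile-time lane mask. Below is a minimal usage sketch written against the kernel-level signatures added in this PR; the public convenience entry point is not part of this excerpt, so the call goes through `xsimd::kernel` directly, and the target arch (`sse4_2`), the `convert<>`/mode namespaces and the mask values are illustrative assumptions.

```cpp
#include <xsimd/xsimd.hpp>

// Hypothetical example: copy only the first three of four float lanes.
void copy_first_three(float const* in, float* out)
{
    using A = xsimd::sse4_2; // assumed 128-bit target: 4 float lanes
    // compile-time mask, lanes 0..2 active, lane 3 inactive (same template form as in this diff)
    constexpr xsimd::batch_bool_constant<float, A, true, true, true, false> mask {};

    // inactive lane 3 of `v` is zero-filled; the store leaves out[3] untouched
    auto v = xsimd::kernel::load_masked<A>(in, mask, xsimd::kernel::convert<float> {},
                                           xsimd::unaligned_mode {}, A {});
    xsimd::kernel::store_masked<A>(out, v, mask, xsimd::unaligned_mode {}, A {});
}
```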
1 change: 1 addition & 0 deletions include/xsimd/arch/common/xsimd_common_arithmetic.hpp
@@ -16,6 +16,7 @@
#include <limits>
#include <type_traits>

#include "../../types/xsimd_batch_constant.hpp"
#include "./xsimd_common_details.hpp"

namespace xsimd
97 changes: 97 additions & 0 deletions include/xsimd/arch/common/xsimd_common_memory.hpp
@@ -13,6 +13,7 @@
#define XSIMD_COMMON_MEMORY_HPP

#include <algorithm>
#include <array>
#include <complex>
#include <stdexcept>

@@ -348,6 +349,102 @@ namespace xsimd
return detail::load_unaligned<A>(mem, cvt, common {}, detail::conversion_type<A, T_in, T_out> {});
}

template <class A, class T>
XSIMD_INLINE batch<T, A> load(T const* mem, aligned_mode, requires_arch<A>) noexcept
{
return load_aligned<A>(mem, convert<T> {}, A {});
}

template <class A, class T>
XSIMD_INLINE batch<T, A> load(T const* mem, unaligned_mode, requires_arch<A>) noexcept
{
return load_unaligned<A>(mem, convert<T> {}, A {});
}

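// load_masked / store_masked (common fallback): inactive lanes are zero-filled on load and skipped on store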
template <class A, class T_in, class T_out, bool... Values, class alignment>
XSIMD_INLINE batch<T_out, A>
load_masked(T_in const* mem, batch_bool_constant<T_out, A, Values...>, convert<T_out>, alignment, requires_arch<common>) noexcept
{
constexpr std::size_t size = batch<T_out, A>::size;
alignas(A::alignment()) std::array<T_out, size> buffer {};
constexpr std::array<bool, size> mask { Values... };

for (std::size_t i = 0; i < size; ++i)
buffer[i] = mask[i] ? static_cast<T_out>(mem[i]) : T_out(0);

return batch<T_out, A>::load(buffer.data(), aligned_mode {});
}

template <class A, class T_in, class T_out, bool... Values, class alignment>
XSIMD_INLINE void
store_masked(T_out* mem, batch<T_in, A> const& src, batch_bool_constant<T_in, A, Values...>, alignment, requires_arch<common>) noexcept
{
constexpr std::size_t size = batch<T_in, A>::size;
constexpr std::array<bool, size> mask { Values... };

for (std::size_t i = 0; i < size; ++i)
if (mask[i])
{
mem[i] = static_cast<T_out>(src.get(i));
}
}

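// 32/64-bit integer overloads reuse the same-width floating-point kernels via bitwise_cast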
template <class A, bool... Values, class Mode>
XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...>, convert<int32_t>, Mode, requires_arch<A>) noexcept
{
const auto f = load_masked<A>(reinterpret_cast<const float*>(mem), batch_bool_constant<float, A, Values...> {}, convert<float> {}, Mode {}, A {});
return bitwise_cast<int32_t>(f);
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE batch<uint32_t, A> load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...>, convert<uint32_t>, Mode, requires_arch<A>) noexcept
{
const auto f = load_masked<A>(reinterpret_cast<const float*>(mem), batch_bool_constant<float, A, Values...> {}, convert<float> {}, Mode {}, A {});
return bitwise_cast<uint32_t>(f);
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE typename std::enable_if<has_simd_register<double, A>::value, batch<int64_t, A>>::type
load_masked(int64_t const* mem, batch_bool_constant<int64_t, A, Values...>, convert<int64_t>, Mode, requires_arch<A>) noexcept
{
const auto d = load_masked<A>(reinterpret_cast<const double*>(mem), batch_bool_constant<double, A, Values...> {}, convert<double> {}, Mode {}, A {});
return bitwise_cast<int64_t>(d);
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE typename std::enable_if<has_simd_register<double, A>::value, batch<uint64_t, A>>::type
load_masked(uint64_t const* mem, batch_bool_constant<uint64_t, A, Values...>, convert<uint64_t>, Mode, requires_arch<A>) noexcept
{
const auto d = load_masked<A>(reinterpret_cast<const double*>(mem), batch_bool_constant<double, A, Values...> {}, convert<double> {}, Mode {}, A {});
return bitwise_cast<uint64_t>(d);
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE void store_masked(int32_t* mem, batch<int32_t, A> const& src, batch_bool_constant<int32_t, A, Values...>, Mode, requires_arch<A>) noexcept
{
store_masked<A>(reinterpret_cast<float*>(mem), bitwise_cast<float>(src), batch_bool_constant<float, A, Values...> {}, Mode {}, A {});
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...>, Mode, requires_arch<A>) noexcept
{
store_masked<A>(reinterpret_cast<float*>(mem), bitwise_cast<float>(src), batch_bool_constant<float, A, Values...> {}, Mode {}, A {});
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE typename std::enable_if<has_simd_register<double, A>::value, void>::type
store_masked(int64_t* mem, batch<int64_t, A> const& src, batch_bool_constant<int64_t, A, Values...>, Mode, requires_arch<A>) noexcept
{
store_masked<A>(reinterpret_cast<double*>(mem), bitwise_cast<double>(src), batch_bool_constant<double, A, Values...> {}, Mode {}, A {});
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE typename std::enable_if<has_simd_register<double, A>::value, void>::type
store_masked(uint64_t* mem, batch<uint64_t, A> const& src, batch_bool_constant<uint64_t, A, Values...>, Mode, requires_arch<A>) noexcept
{
store_masked<A>(reinterpret_cast<double*>(mem), bitwise_cast<double>(src), batch_bool_constant<double, A, Values...> {}, Mode {}, A {});
}

// rotate_right
template <size_t N, class A, class T>
XSIMD_INLINE batch<T, A> rotate_right(batch<T, A> const& self, requires_arch<common>) noexcept
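For readers skimming the new fallback in xsimd_common_memory.hpp: its contract is that inactive lanes come back as zero from a masked load and are never written by a masked store. A scalar model of that behaviour, for illustration only (names and lane count are not part of the diff):

```cpp
#include <cstddef>

// Scalar model of the common-arch fallback: with mask = {1, 0, 1, 0},
// the load yields {in[0], 0, in[2], 0} and the store writes only out[0] and out[2].
template <std::size_t N>
void masked_copy_model(float const (&in)[N], float (&out)[N], bool const (&mask)[N])
{
    float lanes[N] = {}; // zero-initialized, like the aligned buffer in load_masked
    for (std::size_t i = 0; i < N; ++i)
        if (mask[i])
            lanes[i] = in[i]; // inactive lanes keep their zero
    for (std::size_t i = 0; i < N; ++i)
        if (mask[i])
            out[i] = lanes[i]; // inactive destination lanes are left untouched
}
```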
172 changes: 163 additions & 9 deletions include/xsimd/arch/xsimd_avx.hpp
@@ -36,20 +36,35 @@ namespace xsimd

namespace detail
{
- XSIMD_INLINE void split_avx(__m256i val, __m128i& low, __m128i& high) noexcept
+ XSIMD_INLINE __m128i lower_half(__m256i self) noexcept
{
- low = _mm256_castsi256_si128(val);
- high = _mm256_extractf128_si256(val, 1);
+ return _mm256_castsi256_si128(self);
}
- XSIMD_INLINE void split_avx(__m256 val, __m128& low, __m128& high) noexcept
+ XSIMD_INLINE __m128 lower_half(__m256 self) noexcept
{
- low = _mm256_castps256_ps128(val);
- high = _mm256_extractf128_ps(val, 1);
+ return _mm256_castps256_ps128(self);
}
- XSIMD_INLINE void split_avx(__m256d val, __m128d& low, __m128d& high) noexcept
+ XSIMD_INLINE __m128d lower_half(__m256d self) noexcept
{
- low = _mm256_castpd256_pd128(val);
- high = _mm256_extractf128_pd(val, 1);
+ return _mm256_castpd256_pd128(self);
}
XSIMD_INLINE __m128i upper_half(__m256i self) noexcept
{
return _mm256_extractf128_si256(self, 1);
}
XSIMD_INLINE __m128 upper_half(__m256 self) noexcept
{
return _mm256_extractf128_ps(self, 1);
}
XSIMD_INLINE __m128d upper_half(__m256d self) noexcept
{
return _mm256_extractf128_pd(self, 1);
}
template <class Full, class Half>
XSIMD_INLINE void split_avx(Full val, Half& low, Half& high) noexcept
{
low = lower_half(val);
high = upper_half(val);
}
XSIMD_INLINE __m256i merge_sse(__m128i low, __m128i high) noexcept
{
@@ -63,6 +78,17 @@
{
return _mm256_insertf128_pd(_mm256_castpd128_pd256(low), high, 1);
}
template <class T>
XSIMD_INLINE batch<T, sse4_2> lower_half(batch<T, avx> const& self) noexcept
{
return lower_half(self.data); // pass the raw register so the intrinsic overload is picked instead of recursing
}
template <class T>
XSIMD_INLINE batch<T, sse4_2> upper_half(batch<T, avx> const& self) noexcept
{
return upper_half(self.data); // pass the raw register so the intrinsic overload is picked instead of recursing
}

template <class F>
XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self) noexcept
{
@@ -865,6 +891,134 @@ namespace xsimd
return _mm256_loadu_pd(mem);
}

// load_masked
template <class A, bool... Values, class Mode>
XSIMD_INLINE batch<float, A> load_masked(float const* mem, batch_bool_constant<float, A, Values...> mask, convert<float>, Mode, requires_arch<avx>) noexcept
{
XSIMD_IF_CONSTEXPR(mask.none())
{
return _mm256_setzero_ps();
}
else XSIMD_IF_CONSTEXPR(mask.all())
{
return load<A>(mem, Mode {});
}
// confined to lower 128-bit half (4 lanes) → forward to SSE4.2
else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 4)
{
constexpr auto mlo = mask.template lower_half<sse4_2>();
const auto lo = load_masked(mem, mlo, convert<float> {}, Mode {}, sse4_2 {});
return batch<float, A>(detail::merge_sse(lo, batch<float, sse4_2>(0.f)));
}
// confined to upper 128-bit half (4 lanes) → forward to SSE4.2
else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= 4)
{
constexpr auto mhi = mask.template upper_half<sse4_2>();
const auto hi = load_masked(mem + 4, mhi, convert<float> {}, Mode {}, sse4_2 {});
return batch<float, A>(detail::merge_sse(batch<float, sse4_2>(0.f), hi));
}
else
{
// crossing 128-bit boundary → use 256-bit masked load
return _mm256_maskload_ps(mem, mask.as_batch());
}
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE batch<double, A> load_masked(double const* mem, batch_bool_constant<double, A, Values...> mask, convert<double>, Mode, requires_arch<avx>) noexcept
{
XSIMD_IF_CONSTEXPR(mask.none())
{
return _mm256_setzero_pd();
}
else XSIMD_IF_CONSTEXPR(mask.all())
{
return load<A>(mem, Mode {});
}
// confined to lower 128-bit half (2 lanes) → forward to SSE4.2
else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 2)
{
constexpr auto mlo = mask.template lower_half<sse4_2>();
const auto lo = load_masked(mem, mlo, convert<double> {}, Mode {}, sse4_2 {});
return batch<double, A>(detail::merge_sse(lo, batch<double, sse4_2>(0.0)));
}
// confined to upper 128-bit half (2 lanes) → forward to SSE4.2
else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= 2)
{
constexpr auto mhi = mask.template upper_half<sse4_2>();
const auto hi = load_masked(mem + 2, mhi, convert<double> {}, Mode {}, sse4_2 {});
return batch<double, A>(detail::merge_sse(batch<double, sse4_2>(0.0), hi));
}
else
{
// crossing 128-bit boundary → use 256-bit masked load
return _mm256_maskload_pd(mem, mask.as_batch());
}
}

// store_masked
template <class A, bool... Values, class Mode>
XSIMD_INLINE void store_masked(float* mem, batch<float, A> const& src, batch_bool_constant<float, A, Values...> mask, Mode, requires_arch<avx>) noexcept
{
XSIMD_IF_CONSTEXPR(mask.none())
{
return;
}
else XSIMD_IF_CONSTEXPR(mask.all())
{
src.store(mem, Mode {});
}
// confined to lower 128-bit half (4 lanes) → forward to SSE4.2
else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 4)
{
constexpr auto mlo = mask.template lower_half<sse4_2>();
const batch<float, sse4_2> lo(_mm256_castps256_ps128(src));
store_masked<sse4_2>(mem, lo, mlo, Mode {}, sse4_2 {});
}
// confined to upper 128-bit half (4 lanes) → forward to SSE4.2
else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= 4)
{
constexpr auto mhi = mask.template upper_half<sse4_2>();
const batch<float, sse4_2> hi(_mm256_extractf128_ps(src, 1));
store_masked<sse4_2>(mem + 4, hi, mhi, Mode {}, sse4_2 {});
}
else
{
_mm256_maskstore_ps(mem, mask.as_batch(), src);
}
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE void store_masked(double* mem, batch<double, A> const& src, batch_bool_constant<double, A, Values...> mask, Mode, requires_arch<avx>) noexcept
{
XSIMD_IF_CONSTEXPR(mask.none())
{
return;
}
else XSIMD_IF_CONSTEXPR(mask.all())
{
src.store(mem, Mode {});
}
// confined to lower 128-bit half (2 lanes) → forward to SSE4.2
else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 2)
{
constexpr auto mlo = mask.template lower_half<sse4_2>();
const batch<double, sse4_2> lo(_mm256_castpd256_pd128(src));
store_masked<sse4_2>(mem, lo, mlo, Mode {}, sse4_2 {});
}
// confined to upper 128-bit half (2 lanes) → forward to SSE4.2
else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= 2)
{
constexpr auto mhi = mask.template upper_half<sse4_2>();
const batch<double, sse4_2> hi(_mm256_extractf128_pd(src, 1));
store_masked<sse4_2>(mem + 2, hi, mhi, Mode {}, sse4_2 {});
}
}
else
{
_mm256_maskstore_pd(mem, mask.as_batch(), src);
}
}

// lt
template <class A>
XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
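The AVX kernels above pick one of four strategies at compile time: all-false and all-true masks short-circuit, a mask confined to a single 128-bit half forwards to the SSE-width implementation, and anything straddling the boundary uses `_mm256_maskload_ps`/`_mm256_maskstore_ps` (or the `pd` equivalents). A small constexpr sketch of the confinement test, assuming lane 0 is the least-significant position of `batch_bool_constant`; the helper names and mask values are illustrative and not part of the PR:

```cpp
#include <array>
#include <cstddef>

// 8 float lanes on AVX, 4 per 128-bit half.
constexpr std::size_t count_trailing_false(std::array<bool, 8> const& m)
{
    std::size_t n = 0;
    while (n < 8 && !m[n])
        ++n;
    return n; // number of inactive lanes counted from lane 0
}

constexpr std::size_t count_leading_false(std::array<bool, 8> const& m)
{
    std::size_t n = 0;
    while (n < 8 && !m[7 - n])
        ++n;
    return n; // number of inactive lanes counted from lane 7
}

// Lanes 0 and 1 active only: at least 4 leading falses, so the access
// fits in the lower 128-bit half and can be forwarded to the SSE kernel.
static_assert(count_leading_false({ { true, true, false, false, false, false, false, false } }) >= 4, "lower half");
// Lanes 5 and 7 active only: at least 4 trailing falses, the upper half suffices.
static_assert(count_trailing_false({ { false, false, false, false, false, true, false, true } }) >= 4, "upper half");
```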