Commit 8bf44e4

Reformat store bool to match other functions' style
1 parent 62a008c commit 8bf44e4

File tree

2 files changed, +80 -82 lines

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 40 additions & 41 deletions
@@ -978,59 +978,58 @@ namespace xsimd
         }
 
         // store<batch_bool>
-        namespace detail
+        template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<avx2>) noexcept
         {
-            template <class T>
-            XSIMD_INLINE void store_bool_avx2(__m256i b, bool* mem, T) noexcept
+            // GCC <12 have missing or buggy unaligned store intrinsics; use memcpy to work around this.
+            // GCC/Clang/MSVC will turn it into the correct store.
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                // negate mask to convert to 0 or 1
+                auto val = _mm256_sub_epi8(_mm256_set1_epi8(0), b);
+                memcpy(mem, &val, sizeof(val));
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
             {
-                // GCC <12 have missing or buggy unaligned store intrinsics; use memcpy to work around this.
-                // GCC/Clang/MSVC will turn it into the correct store.
-                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
-                {
-                    // negate mask to convert to 0 or 1
-                    auto val = _mm256_sub_epi8(_mm256_set1_epi8(0), b);
-                    memcpy(mem, &val, sizeof(val));
-                    return;
-                }
-
                 auto b_hi = _mm256_extractf128_si256(b, 1);
                 auto b_lo = _mm256_castsi256_si128(b);
-                XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
-                {
-                    auto val = _mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(b_lo, b_hi));
-                    memcpy(mem, &val, sizeof(val));
-                }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
-                {
-                    auto pack_16 = _mm_packs_epi32(b_lo, b_hi);
-                    auto val = _mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(pack_16, pack_16));
+                auto val = _mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(b_lo, b_hi));
+                memcpy(mem, &val, sizeof(val));
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                auto b_hi = _mm256_extractf128_si256(b, 1);
+                auto b_lo = _mm256_castsi256_si128(b);
+                auto pack_16 = _mm_packs_epi32(b_lo, b_hi);
+                auto val = _mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(pack_16, pack_16));
 #if defined(__x86_64__)
-                    auto val_lo = _mm_cvtsi128_si64(val);
-                    memcpy(mem, &val_lo, sizeof(val_lo));
+                auto val_lo = _mm_cvtsi128_si64(val);
+                memcpy(mem, &val_lo, sizeof(val_lo));
 #else
-                    memcpy(mem, &val, sizeof(uint64_t));
+                memcpy(mem, &val, sizeof(uint64_t));
 #endif
-                }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
-                {
-                    uint32_t mask = _mm256_movemask_epi8(_mm256_srli_epi64(b, 56));
-                    memcpy(mem, &mask, sizeof(mask));
-                }
-                else
-                {
-                    assert(false && "unsupported arch/op combination");
-                }
             }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                uint32_t mask = _mm256_movemask_epi8(_mm256_srli_epi64(b, 56));
+                memcpy(mem, &mask, sizeof(mask));
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+            }
+        }
 
-            XSIMD_INLINE __m256i avx_to_i(__m256 x) { return _mm256_castps_si256(x); }
-            XSIMD_INLINE __m256i avx_to_i(__m256d x) { return _mm256_castpd_si256(x); }
-            XSIMD_INLINE __m256i avx_to_i(__m256i x) { return x; }
+        template <class A>
+        XSIMD_INLINE void store(batch_bool<float, A> b, bool* mem, requires_arch<avx2>) noexcept
+        {
+            return store(batch_bool<uint32_t, A>(b.data), mem, A {});
         }
 
-        template <class T, class A>
-        XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<avx2>) noexcept
+        template <class A>
+        XSIMD_INLINE void store(batch_bool<double, A> b, bool* mem, requires_arch<avx2>) noexcept
         {
-            detail::store_bool_avx2(detail::avx_to_i(b), mem, T {});
+            return store(batch_bool<uint64_t, A>(b.data), mem, A {});
        }
 
         // ssub
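Both branches of this function lean on the same trick: a SIMD comparison mask holds 0 or -1 in every lane, so subtracting it from zero yields exactly the 0/1 bytes a bool array expects, and the memcpy (the GCC <12 workaround the comments describe) lowers to a single unaligned store. Below is a minimal standalone sketch of the sizeof(T) == 4 AVX2 branch; the helper name and driver are hypothetical, not part of xsimd, and it assumes sizeof(bool) == 1, as the commit's own memcpy stores do. Compile with -mavx2.

#include <immintrin.h>
#include <cstdio>
#include <cstring>

// Hypothetical helper (not from the commit): store the 8 lane masks of a
// 32-bit AVX2 comparison as 8 bools, mirroring the sizeof(T) == 4 branch.
void store_mask32_as_bool(__m256i mask, bool* mem)
{
    __m128i b_hi = _mm256_extractf128_si256(mask, 1);
    __m128i b_lo = _mm256_castsi256_si128(mask);
    __m128i pack_16 = _mm_packs_epi32(b_lo, b_hi);        // 8 x 16-bit masks (0 or -1)
    __m128i pack_8 = _mm_packs_epi16(pack_16, pack_16);   // 8 x 8-bit masks in the low half
    __m128i val = _mm_sub_epi8(_mm_set1_epi8(0), pack_8); // 0 - (-1) = 1, 0 - 0 = 0
    std::memcpy(mem, &val, 8);                            // low 8 bytes are the 8 bools
}

int main()
{
    __m256i a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
    bool out[8];
    store_mask32_as_bool(_mm256_cmpgt_epi32(a, _mm256_set1_epi32(4)), out);
    for (bool v : out)
        std::printf("%d ", v); // prints: 0 0 0 0 1 1 1 1
    std::printf("\n");
}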

include/xsimd/arch/xsimd_sse2.hpp

Lines changed: 40 additions & 41 deletions
@@ -1718,57 +1718,56 @@ namespace xsimd
         }
 
         // store<batch_bool>
-        namespace detail
+        template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<sse2>) noexcept
         {
-            template <class T>
-            XSIMD_INLINE void store_bool_sse2(__m128i b, bool* mem, T) noexcept
+            // GCC <12 have missing or buggy unaligned store intrinsics; use memcpy to work around this.
+            // GCC/Clang/MSVC will turn it into the correct store.
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
             {
-                // GCC <12 have missing or buggy unaligned store intrinsics; use memcpy to work around this.
-                // GCC/Clang/MSVC will turn it into the correct store.
-                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
-                {
-                    // negate mask to convert to 0 or 1
-                    auto val = _mm_sub_epi8(_mm_set1_epi8(0), b);
-                    memcpy(mem, &val, sizeof(val));
-                }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
-                {
-                    auto val = _mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(b, b));
+                // negate mask to convert to 0 or 1
+                auto val = _mm_sub_epi8(_mm_set1_epi8(0), b);
+                memcpy(mem, &val, sizeof(val));
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                auto val = _mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(b, b));
 #if defined(__x86_64__)
-                    auto val_lo = _mm_cvtsi128_si64(val);
-                    memcpy(mem, &val_lo, sizeof(val_lo));
+                auto val_lo = _mm_cvtsi128_si64(val);
+                memcpy(mem, &val_lo, sizeof(val_lo));
 #else
-                    memcpy(mem, &val, sizeof(uint64_t));
+                memcpy(mem, &val, sizeof(uint64_t));
 #endif
-                }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
-                {
-                    auto pack_16 = _mm_packs_epi32(b, b);
-                    uint32_t val = _mm_cvtsi128_si32(_mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(pack_16, pack_16)));
-                    memcpy(mem, &val, sizeof(val));
-                }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
-                {
-                    auto pack_32 = _mm_packs_epi32(b, b);
-                    auto pack_16 = _mm_packs_epi32(pack_32, pack_32);
-                    uint16_t val = _mm_cvtsi128_si32(_mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(pack_16, pack_16)));
-                    memcpy(mem, &val, sizeof(val));
-                }
-                else
-                {
-                    assert(false && "unsupported arch/op combination");
-                }
             }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                auto pack_16 = _mm_packs_epi32(b, b);
+                uint32_t val = _mm_cvtsi128_si32(_mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(pack_16, pack_16)));
+                memcpy(mem, &val, sizeof(val));
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                auto pack_32 = _mm_packs_epi32(b, b);
+                auto pack_16 = _mm_packs_epi32(pack_32, pack_32);
+                uint16_t val = _mm_cvtsi128_si32(_mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(pack_16, pack_16)));
+                memcpy(mem, &val, sizeof(val));
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+            }
+        }
 
-            XSIMD_INLINE __m128i sse_to_i(__m128 x) { return _mm_castps_si128(x); }
-            XSIMD_INLINE __m128i sse_to_i(__m128d x) { return _mm_castpd_si128(x); }
-            XSIMD_INLINE __m128i sse_to_i(__m128i x) { return x; }
+        template <class A>
+        XSIMD_INLINE void store(batch_bool<float, A> b, bool* mem, requires_arch<sse2>) noexcept
+        {
+            store(batch_bool<uint32_t, A>(b.data), mem, A {});
         }
 
-        template <class T, class A>
-        XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<sse2>) noexcept
+        template <class A>
+        XSIMD_INLINE void store(batch_bool<double, A> b, bool* mem, requires_arch<sse2>) noexcept
         {
-            detail::store_bool_sse2(detail::sse_to_i(b), mem, T {});
+            store(batch_bool<uint64_t, A>(b.data), mem, A {});
        }
 
         // store_aligned
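The SSE2 variant narrows within a single 128-bit register instead of first splitting a 256-bit one. A similar standalone sketch of its sizeof(T) == 4 branch, under the same assumptions as above (hypothetical helper name and driver, sizeof(bool) == 1):

#include <emmintrin.h>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Hypothetical helper (not from the commit): store the 4 lane masks of a
// 32-bit SSE2 comparison as 4 bools, mirroring the sizeof(T) == 4 branch.
void store_mask32_as_bool_sse2(__m128i mask, bool* mem)
{
    __m128i pack_16 = _mm_packs_epi32(mask, mask);      // 4 x 16-bit masks, duplicated
    __m128i pack_8 = _mm_packs_epi16(pack_16, pack_16); // 4 x 8-bit masks in the low bytes
    std::uint32_t val = _mm_cvtsi128_si32(_mm_sub_epi8(_mm_set1_epi8(0), pack_8));
    std::memcpy(mem, &val, sizeof(val));                // 4 bytes = 4 bools
}

int main()
{
    __m128i a = _mm_setr_epi32(10, 20, 30, 40);
    bool out[4];
    store_mask32_as_bool_sse2(_mm_cmplt_epi32(a, _mm_set1_epi32(25)), out);
    for (bool v : out)
        std::printf("%d ", v); // prints: 1 1 0 0
    std::printf("\n");
}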
