Commit d897cf9

fix some compatibility
1 parent 9428917 commit d897cf9

4 files changed: +121 −66 lines changed

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 43 additions & 13 deletions
@@ -574,36 +574,58 @@ namespace xsimd
         // load_unaligned<batch_bool>
         namespace detail {
             template <class T>
-            XSIMD_INLINE __m256i load_bool_avx2(bool const* mem, T) noexcept {
-                XSIMD_IF_CONSTEXPR(sizeof(T) == 1) {
+            XSIMD_INLINE __m256i load_bool_avx2(bool const* mem) noexcept
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
                     auto maskz = _mm256_cmpeq_epi8(_mm256_loadu_si256((__m256i const*)mem), _mm256_set1_epi8(0));
                     return _mm256_xor_si256(maskz, _mm256_set1_epi8(-1));
                 }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) {
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
                     auto bpack = _mm_loadu_si128((__m128i const*)mem);
                     return _mm256_cmpgt_epi16(_mm256_cvtepu8_epi16(bpack), _mm256_set1_epi16(0));
                 }
                 // GCC <12 have missing or buggy unaligned load intrinsics; use memcpy to work around this.
                 // GCC/Clang/MSVC will turn it into the correct load.
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) {
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
                     uint64_t tmp;
                     memcpy(&tmp, mem, sizeof(tmp));
                     auto bpack = _mm_cvtsi64_si128(tmp);
                     return _mm256_cmpgt_epi32(_mm256_cvtepu8_epi32(bpack), _mm256_set1_epi32(0));
                 }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) {
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
                     uint32_t tmp;
                     memcpy(&tmp, mem, sizeof(tmp));
                     auto bpack = _mm_cvtsi32_si128(tmp);
                     return _mm256_cmpgt_epi64(_mm256_cvtepu8_epi64(bpack), _mm256_set1_epi64x(0));
                 }
+                else
+                {
+                    assert(false && "unsupported arch/op combination");
+                    return __m256i{};
+                }
             }
         }
 
-        template <class T, class A>
+        template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<avx2>) noexcept
         {
-            return batch_bool_cast<T, A>(detail::load_bool_avx2(mem, T{}), avx2{});
+            return batch_bool<T, A>(detail::load_bool_avx2<T>(mem));
+        }
+
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> load_unaligned(bool const* mem, batch_bool<float, A>, requires_arch<avx2>) noexcept
+        {
+            return batch_bool<float, A>(_mm256_castsi256_ps(detail::load_bool_avx2<float>(mem)));
+        }
+
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> load_unaligned(bool const* mem, batch_bool<double, A>, requires_arch<avx2>) noexcept
+        {
+            return batch_bool<double, A>(_mm256_castsi256_pd(detail::load_bool_avx2<double>(mem)));
         }
 
         // mask
@@ -965,36 +987,44 @@ namespace xsimd
             XSIMD_INLINE void store_bool_avx2(__m256i b, bool* mem, T) noexcept {
                 // GCC <12 have missing or buggy unaligned store intrinsics; use memcpy to work around this.
                 // GCC/Clang/MSVC will turn it into the correct store.
-                XSIMD_IF_CONSTEXPR(sizeof(T) == 1) {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
                     // negate mask to convert to 0 or 1
                     auto val = _mm256_sub_epi8(_mm256_set1_epi8(0), b);
                     memcpy(mem, &val, sizeof(val));
                 }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) {
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
                     auto packed = _mm256_castsi256_si128(_mm256_packs_epi16(b, b));
                     auto val = _mm_sub_epi8(_mm_set1_epi8(0), packed);
                     memcpy(mem, &val, sizeof(val));
                 }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) {
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
                     auto bmask = _mm256_set_epi8(
                         -1, -1, -1, -1, -1, -1, -1, -1,
                         -1, -1, -1, -1, -1, -1, -1, -1,
                         -1, -1, -1, -1, -1, -1, -1, -1,
                         28, 24, 20, 16, 12, 8, 4, 0);
                     auto packed = _mm256_castsi256_si128(_mm256_shuffle_epi8(b, bmask));
-                    auto val = _mm_extract_epi64(_mm_sub_epi8(_mm_set1_epi8(0), packed), 0);
+                    auto val = _mm_cvtsi128_si64(_mm_sub_epi8(_mm_set1_epi8(0), packed));
                     memcpy(mem, &val, sizeof(val));
                 }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) {
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
                     auto bmask = _mm256_set_epi8(
                         -1, -1, -1, -1, -1, -1, -1, -1,
                         -1, -1, -1, -1, -1, -1, -1, -1,
                         -1, -1, -1, -1, -1, -1, -1, -1,
                         -1, -1, -1, -1, 24, 16, 8, 0);
                     auto packed = _mm256_castsi256_si128(_mm256_shuffle_epi8(b, bmask));
-                    uint32_t val = _mm_extract_epi32(_mm_sub_epi8(_mm_set1_epi8(0), packed), 0);
+                    uint32_t val = _mm_cvtsi128_si32(_mm_sub_epi8(_mm_set1_epi8(0), packed));
                     memcpy(mem, &val, sizeof(val));
                 }
+                else
+                {
+                    assert(false && "unsupported arch/op combination");
+                }
             }
 
             XSIMD_INLINE __m256i avx_to_i(__m256 x) { return _mm256_castps_si256(x); }
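Taken together, these kernels back the public batch_bool load/store entry points for bool arrays. A minimal usage sketch, assuming xsimd's documented batch_bool::load_unaligned / store_unaligned members and an AVX2 build (the float element type is an arbitrary choice):

#include <xsimd/xsimd.hpp>

int main()
{
    using mask_t = xsimd::batch_bool<float>; // 8 lanes under AVX2
    bool in[mask_t::size] = {};
    in[0] = in[3] = true;

    // Dispatches to load_bool_avx2<float> via the float overload above.
    auto m = mask_t::load_unaligned(in);

    bool out[mask_t::size] = {};
    m.store_unaligned(out); // round-trips through the store kernel

    return (out[0] && !out[1] && out[3]) ? 0 : 1;
}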

include/xsimd/arch/xsimd_sse3.hpp

Lines changed: 0 additions & 46 deletions
@@ -59,52 +59,6 @@ namespace xsimd
             __m128 tmp2 = _mm_mul_ps(tmp1, _mm_movehdup_ps(tmp1));
             return _mm_cvtss_f32(tmp2);
         }
-
-        // store<batch_bool>
-        namespace detail {
-            template <class T>
-            XSIMD_INLINE void store_bool_sse3(__m128i b, bool* mem, T) noexcept {
-                // GCC <12 have missing or buggy unaligned store intrinsics; use memcpy to work around this.
-                // GCC/Clang/MSVC will turn it into the correct store.
-                XSIMD_IF_CONSTEXPR(sizeof(T) == 1) {
-                    // negate mask to convert to 0 or 1
-                    auto val = _mm_sub_epi8(_mm_set1_epi8(0), b);
-                    memcpy(mem, &val, sizeof(val));
-                }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) {
-                    auto packed = _mm_packs_epi16(b, b);
-                    uint64_t val = _mm_extract_epi64(_mm_sub_epi8(_mm_set1_epi8(0), packed), 0);
-                    memcpy(mem, &val, sizeof(val));
-                }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) {
-                    const auto bmask = _mm_set_epi8(
-                        -1, -1, -1, -1, -1, -1, -1, -1,
-                        -1, -1, -1, -1, 12, 8, 4, 0);
-                    auto packed = _mm_shuffle_epi8(b, bmask);
-                    uint32_t val = _mm_extract_epi32(_mm_sub_epi8(_mm_set1_epi8(0), packed), 0);
-                    memcpy(mem, &val, sizeof(val));
-                }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) {
-                    const auto bmask = _mm_set_epi8(
-                        -1, -1, -1, -1, -1, -1, -1, -1,
-                        -1, -1, -1, -1, -1, -1, 8, 0);
-                    auto packed = _mm_shuffle_epi8(b, bmask);
-                    uint16_t val = _mm_extract_epi16(_mm_sub_epi8(_mm_set1_epi8(0), packed), 0);
-                    memcpy(mem, &val, sizeof(val));
-                }
-            }
-
-            XSIMD_INLINE __m128i sse_to_i(__m128 x) { return _mm_castps_si128(x); }
-            XSIMD_INLINE __m128i sse_to_i(__m128d x) { return _mm_castpd_si128(x); }
-            XSIMD_INLINE __m128i sse_to_i(__m128i x) { return x; }
-        }
-
-        template <class T, class A>
-        XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<sse3>) noexcept
-        {
-            detail::store_bool_sse3(detail::sse_to_i(b), mem, T{});
-        }
-
     }
 
 }
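Note: the deleted kernel appears to be the compatibility problem the commit message refers to. It lived in the SSE3 header yet called _mm_shuffle_epi8 (an SSSE3 intrinsic) and _mm_extract_epi64 / _mm_extract_epi32 (SSE4.1 intrinsics). The commit relocates it to xsimd_ssse3.hpp (see the last file below) and downgrades the lane-0 extracts to SSE2-level moves. A small sketch of that replacement, with hypothetical helper names, assuming an x86-64 target:

#include <emmintrin.h> // SSE2
#include <cstdint>

// Reading lane 0 of an XMM register: _mm_extract_epi64(v, 0) and
// _mm_extract_epi32(v, 0) require SSE4.1, while these SSE2 moves
// return the same lane-0 value.
inline std::int64_t lane0_i64(__m128i v) { return _mm_cvtsi128_si64(v); } // movq
inline std::int32_t lane0_i32(__m128i v) { return _mm_cvtsi128_si32(v); } // movd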

include/xsimd/arch/xsimd_sse4_1.hpp

Lines changed: 24 additions & 7 deletions
@@ -125,26 +125,31 @@ namespace xsimd
         // load_unaligned<batch_bool>
         namespace detail {
             template <class T>
-            XSIMD_INLINE __m128i load_bool_sse4(bool const* mem, T) noexcept {
-                XSIMD_IF_CONSTEXPR(sizeof(T) == 1) {
+            XSIMD_INLINE __m128i load_bool_sse4_1(bool const* mem) noexcept
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
                     auto maskz = _mm_cmpeq_epi8(_mm_loadu_si128((__m128i const*)mem), _mm_set1_epi8(0));
                     return _mm_xor_si128(maskz, _mm_set1_epi8(-1));
                 }
                 // GCC <12 have missing or buggy unaligned load intrinsics; use memcpy to work around this.
                 // GCC/Clang/MSVC will turn it into the correct load.
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) {
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
                     uint64_t tmp;
                     memcpy(&tmp, mem, sizeof(tmp));
                     auto bpack = _mm_cvtsi64_si128(tmp);
                     return _mm_cmpgt_epi16(_mm_cvtepu8_epi16(bpack), _mm_set1_epi16(0));
                 }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) {
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
                     uint32_t tmp;
                     memcpy(&tmp, mem, sizeof(tmp));
                     auto bpack = _mm_cvtsi32_si128(tmp);
                     return _mm_cmpgt_epi32(_mm_cvtepu8_epi32(bpack), _mm_set1_epi32(0));
                 }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) {
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
                     uint16_t tmp;
                     memcpy(&tmp, mem, sizeof(tmp));
                     auto bpack = _mm_cvtsi32_si128((uint32_t)tmp);
@@ -153,10 +158,22 @@ namespace xsimd
             }
         }
 
-        template <class T, class A>
+        template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<sse4_1>) noexcept
         {
-            return batch_bool_cast<T, A>(detail::load_bool_sse4(mem, T{}), sse4_1{});
+            return batch_bool<T, A>(detail::load_bool_sse4_1<T>(mem));
+        }
+
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> load_unaligned(bool const* mem, batch_bool<float, A>, requires_arch<sse4_1>) noexcept
+        {
+            return batch_bool<float, A>(_mm_castsi128_ps(detail::load_bool_sse4_1<float>(mem)));
+        }
+
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> load_unaligned(bool const* mem, batch_bool<double, A>, requires_arch<sse4_1>) noexcept
+        {
+            return batch_bool<double, A>(_mm_castsi128_pd(detail::load_bool_sse4_1<double>(mem)));
         }
 
         // max
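The new default template argument is what keeps the three load_unaligned overloads from colliding: it removes the generic template from overload resolution for non-integral T, leaving the dedicated float and double overloads as the unique matches. A minimal, self-contained sketch of the pattern (hypothetical names, not xsimd's):

#include <type_traits>
#include <cstdio>

// The enable_if default argument SFINAEs the generic overload away
// for non-integral T, so the float overload below is the only match.
template <class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
void load_kernel(T) { std::puts("integral kernel"); }

void load_kernel(float) { std::puts("float kernel"); }

int main()
{
    load_kernel(42);   // integral path (exact template match)
    load_kernel(1.0f); // float path; the template is removed from the set
}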

include/xsimd/arch/xsimd_ssse3.hpp

Lines changed: 54 additions & 0 deletions
@@ -128,6 +128,60 @@ namespace xsimd
             return bitwise_cast<int16_t>(rotate_left<N, A>(bitwise_cast<uint16_t>(self), ssse3 {}));
         }
 
+        // store<batch_bool>
+        namespace detail {
+            template <class T>
+            XSIMD_INLINE void store_bool_ssse3(__m128i b, bool* mem, T) noexcept
+            {
+                // GCC <12 have missing or buggy unaligned store intrinsics; use memcpy to work around this.
+                // GCC/Clang/MSVC will turn it into the correct store.
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    // negate mask to convert to 0 or 1
+                    auto val = _mm_sub_epi8(_mm_set1_epi8(0), b);
+                    memcpy(mem, &val, sizeof(val));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    auto packed = _mm_packs_epi16(b, b);
+                    uint64_t val = _mm_cvtsi128_si64(_mm_sub_epi8(_mm_set1_epi8(0), packed));
+                    memcpy(mem, &val, sizeof(val));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    const auto bmask = _mm_set_epi8(
+                        -1, -1, -1, -1, -1, -1, -1, -1,
+                        -1, -1, -1, -1, 12, 8, 4, 0);
+                    auto packed = _mm_shuffle_epi8(b, bmask);
+                    uint32_t val = _mm_cvtsi128_si32(_mm_sub_epi8(_mm_set1_epi8(0), packed));
+                    memcpy(mem, &val, sizeof(val));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    const auto bmask = _mm_set_epi8(
+                        -1, -1, -1, -1, -1, -1, -1, -1,
+                        -1, -1, -1, -1, -1, -1, 8, 0);
+                    auto packed = _mm_shuffle_epi8(b, bmask);
+                    uint16_t val = _mm_cvtsi128_si32(_mm_sub_epi8(_mm_set1_epi8(0), packed));
+                    memcpy(mem, &val, sizeof(val));
+                }
+                else
+                {
+                    assert(false && "unsupported arch/op combination");
+                }
+            }
+
+            XSIMD_INLINE __m128i sse_to_i(__m128 x) { return _mm_castps_si128(x); }
+            XSIMD_INLINE __m128i sse_to_i(__m128d x) { return _mm_castpd_si128(x); }
+            XSIMD_INLINE __m128i sse_to_i(__m128i x) { return x; }
+        }
+
+        template <class T, class A>
+        XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<sse3>) noexcept
+        {
+            detail::store_bool_ssse3(detail::sse_to_i(b), mem, T{});
+        }
+
         // swizzle (dynamic mask)
         template <class A>
         XSIMD_INLINE batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch<uint8_t, A> mask, requires_arch<ssse3>) noexcept
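For the sizeof(T) == 4 branch above, _mm_shuffle_epi8 gathers byte 0 of each 32-bit mask lane (source indices 0, 4, 8, 12; a -1 control byte writes 0), and subtracting each byte from zero turns 0xFF into 0x01 before the packed 4-byte store. A standalone sketch of that compaction under the same assumption the kernel makes (bool occupies one byte); the helper name is hypothetical:

#include <tmmintrin.h> // SSSE3
#include <cstdint>
#include <cstring>

// Compact a 4-lane 32-bit mask, where each lane is all-ones or
// all-zeros, into four bool bytes.
inline void store_mask32_as_bools(__m128i mask, bool* mem)
{
    const __m128i bmask = _mm_set_epi8(
        -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, 12, 8, 4, 0);   // pick byte 0 of each lane
    __m128i packed = _mm_shuffle_epi8(mask, bmask);
    // Per byte, 0 - 0xFF == 0x01, so true lanes store as 1, false as 0.
    std::uint32_t val = static_cast<std::uint32_t>(
        _mm_cvtsi128_si32(_mm_sub_epi8(_mm_setzero_si128(), packed)));
    std::memcpy(mem, &val, sizeof(val)); // writes the four bool bytes
}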
