Commit 4a672af

Add {load,store}_unaligned for batch_bool
These work around GCC not being able to optimize the baseline implementations into SIMD operations, and compilers in general not being able to prove that the vector representation of a batch_bool element is all-zero or all-one bits, i.e. integer 0 or -1. Also adds more robust tests for bool load/store that check bitwise correctness (every stored byte is exactly 0 or 1).
1 parent d6150b8 commit 4a672af
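For context, the kernels in this commit back the public batch_bool load/store API. A minimal usage sketch, not part of the commit (assuming the default architecture and float elements; batch_bool<float>::size gives the lane count):

    #include <xsimd/xsimd.hpp>
    #include <cstddef>
    #include <cstdio>

    int main()
    {
        using bb = xsimd::batch_bool<float>;
        bool in[bb::size] = {}; // one plain 0/1 byte per SIMD lane
        in[0] = true;
        in[2] = true;

        bb mask = bb::load_unaligned(in); // each byte becomes an all-0/all-1 lane
        bool out[bb::size];
        mask.store_unaligned(out);        // each lane collapses back to a 0/1 byte

        for (std::size_t i = 0; i < bb::size; ++i)
            std::printf("%d", out[i] ? 1 : 0);
        std::printf("\n");
        return 0;
    }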

File tree

4 files changed: +238 -26 lines changed


include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 85 additions & 0 deletions
@@ -570,6 +570,42 @@ namespace xsimd
             batch_type imag = _mm256_permute4x64_pd(_mm256_unpackhi_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0));
             return { real, imag };
         }
+
+        // load_unaligned<batch_bool>
+        namespace detail {
+            template <class T>
+            XSIMD_INLINE __m256i load_bool_avx2(bool const* mem, T) noexcept {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1) {
+                    auto maskz = _mm256_cmpeq_epi8(_mm256_loadu_si256((__m256i const*)mem), _mm256_set1_epi8(0));
+                    return _mm256_xor_si256(maskz, _mm256_set1_epi8(-1));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) {
+                    auto bpack = _mm_loadu_si128((__m128i const*)mem);
+                    return _mm256_cmpgt_epi16(_mm256_cvtepu8_epi16(bpack), _mm256_set1_epi16(0));
+                }
+                // GCC < 12 has missing or buggy unaligned load intrinsics; use memcpy to work around this.
+                // GCC/Clang/MSVC will turn it into the correct load.
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) {
+                    uint64_t tmp;
+                    memcpy(&tmp, mem, sizeof(tmp));
+                    auto bpack = _mm_cvtsi64_si128(tmp);
+                    return _mm256_cmpgt_epi32(_mm256_cvtepu8_epi32(bpack), _mm256_set1_epi32(0));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) {
+                    uint32_t tmp;
+                    memcpy(&tmp, mem, sizeof(tmp));
+                    auto bpack = _mm_cvtsi32_si128(tmp);
+                    return _mm256_cmpgt_epi64(_mm256_cvtepu8_epi64(bpack), _mm256_set1_epi64x(0));
+                }
+            }
+        }
+
+        template <class T, class A>
+        XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<avx2>) noexcept
+        {
+            return batch_bool_cast<T, A>(detail::load_bool_avx2(mem, T{}), avx2{});
+        }
+
         // mask
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx2>) noexcept
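In scalar terms the load kernel only has to expand each bool byte into a full-width lane mask; a reference sketch of that contract, not part of the commit (shown for 32-bit elements):

    #include <cstdint>

    // Reference model of load_bool_avx2 for sizeof(T) == 4: each input bool byte
    // becomes an all-zeros or all-ones 32-bit lane (integer 0 or -1).
    void load_bool_reference(bool const* mem, std::uint32_t out[8])
    {
        for (int i = 0; i < 8; ++i)
            out[i] = mem[i] ? ~std::uint32_t(0) : std::uint32_t(0);
    }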
@@ -923,6 +959,55 @@ namespace xsimd
             return _mm256_or_si256(y, w);
         }

+        // store<batch_bool>
+        namespace detail {
+            template <class T>
+            XSIMD_INLINE void store_bool_avx2(__m256i b, bool* mem, T) noexcept {
+                // GCC < 12 has missing or buggy unaligned store intrinsics; use memcpy to work around this.
+                // GCC/Clang/MSVC will turn it into the correct store.
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1) {
+                    // negate the 0/-1 mask to convert it to 0/1 bytes
+                    auto val = _mm256_sub_epi8(_mm256_set1_epi8(0), b);
+                    memcpy(mem, &val, sizeof(val));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) {
+                    // _mm256_packs_epi16 packs within each 128-bit lane, so reorder the 64-bit halves afterwards
+                    auto packed = _mm256_castsi256_si128(
+                        _mm256_permute4x64_epi64(_mm256_packs_epi16(b, b), _MM_SHUFFLE(3, 1, 2, 0)));
+                    auto val = _mm_sub_epi8(_mm_set1_epi8(0), packed);
+                    memcpy(mem, &val, sizeof(val));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) {
+                    // _mm256_shuffle_epi8 also shuffles within each 128-bit lane: gather one byte
+                    // per 32-bit element in each lane, then merge the two lane results
+                    auto bmask = _mm256_set_epi8(
+                        -1, -1, -1, -1, -1, -1, -1, -1,
+                        -1, -1, -1, -1, 12, 8, 4, 0,
+                        -1, -1, -1, -1, -1, -1, -1, -1,
+                        -1, -1, -1, -1, 12, 8, 4, 0);
+                    auto shuffled = _mm256_shuffle_epi8(b, bmask);
+                    auto packed = _mm_unpacklo_epi32(_mm256_castsi256_si128(shuffled), _mm256_extracti128_si256(shuffled, 1));
+                    auto val = _mm_extract_epi64(_mm_sub_epi8(_mm_set1_epi8(0), packed), 0);
+                    memcpy(mem, &val, sizeof(val));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) {
+                    auto bmask = _mm256_set_epi8(
+                        -1, -1, -1, -1, -1, -1, -1, -1,
+                        -1, -1, -1, -1, -1, -1, 8, 0,
+                        -1, -1, -1, -1, -1, -1, -1, -1,
+                        -1, -1, -1, -1, -1, -1, 8, 0);
+                    auto shuffled = _mm256_shuffle_epi8(b, bmask);
+                    auto packed = _mm_unpacklo_epi16(_mm256_castsi256_si128(shuffled), _mm256_extracti128_si256(shuffled, 1));
+                    uint32_t val = _mm_extract_epi32(_mm_sub_epi8(_mm_set1_epi8(0), packed), 0);
+                    memcpy(mem, &val, sizeof(val));
+                }
+            }
+
+            XSIMD_INLINE __m256i avx_to_i(__m256 x) { return _mm256_castps_si256(x); }
+            XSIMD_INLINE __m256i avx_to_i(__m256d x) { return _mm256_castpd_si256(x); }
+            XSIMD_INLINE __m256i avx_to_i(__m256i x) { return x; }
+        }
+
+        template <class T, class A>
+        XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<avx2>) noexcept
+        {
+            detail::store_bool_avx2(detail::avx_to_i(b), mem, T{});
+        }
+
         // ssub
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
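The _mm_sub_epi8 / _mm256_sub_epi8 step above relies on two's-complement negation turning an all-ones mask byte into 1; a reference sketch of that conversion, not part of the commit:

    #include <cstdint>

    // Reference model of the mask-to-bool step used by the store kernels:
    // a lane's mask byte is either 0x00 or 0xFF, and 0 - 0xFF wraps around to 0x01.
    std::uint8_t mask_byte_to_bool_byte(std::uint8_t mask_byte)
    {
        return static_cast<std::uint8_t>(0u - mask_byte); // 0x00 -> 0, 0xFF -> 1
    }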

include/xsimd/arch/xsimd_sse3.hpp

Lines changed: 45 additions & 0 deletions
@@ -60,6 +60,51 @@ namespace xsimd
             return _mm_cvtss_f32(tmp2);
         }

+        // store<batch_bool>
+        namespace detail {
+            template <class T>
+            XSIMD_INLINE void store_bool_sse3(__m128i b, bool* mem, T) noexcept {
+                // GCC < 12 has missing or buggy unaligned store intrinsics; use memcpy to work around this.
+                // GCC/Clang/MSVC will turn it into the correct store.
+                // Only SSE2-level intrinsics are used below, since this kernel is selected for sse3.
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1) {
+                    // negate the 0/-1 mask to convert it to 0/1 bytes
+                    auto val = _mm_sub_epi8(_mm_set1_epi8(0), b);
+                    memcpy(mem, &val, sizeof(val));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) {
+                    auto val = _mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(b, b));
+                    // the 8 elements end up in the low 8 bytes; store only those
+                    memcpy(mem, &val, 8);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) {
+                    auto packed = _mm_packs_epi16(_mm_packs_epi32(b, b), _mm_set1_epi8(0));
+                    auto val = _mm_sub_epi8(_mm_set1_epi8(0), packed);
+                    // the 4 elements end up in the low 4 bytes; store only those
+                    memcpy(mem, &val, 4);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) {
+                    // two elements: each 64-bit lane is all zeros or all ones, so its sign bit suffices
+                    int m = _mm_movemask_pd(_mm_castsi128_pd(b));
+                    mem[0] = (m & 1) != 0;
+                    mem[1] = (m & 2) != 0;
+                }
+            }
+
+            XSIMD_INLINE __m128i sse_to_i(__m128 x) { return _mm_castps_si128(x); }
+            XSIMD_INLINE __m128i sse_to_i(__m128d x) { return _mm_castpd_si128(x); }
+            XSIMD_INLINE __m128i sse_to_i(__m128i x) { return x; }
+        }
+
+        template <class T, class A>
+        XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<sse3>) noexcept
+        {
+            detail::store_bool_sse3(detail::sse_to_i(b), mem, T{});
+        }
+
     }

 }

include/xsimd/arch/xsimd_sse4_1.hpp

Lines changed: 37 additions & 0 deletions
@@ -122,6 +122,43 @@ namespace xsimd
             }
         }

+        // load_unaligned<batch_bool>
+        namespace detail {
+            template <class T>
+            XSIMD_INLINE __m128i load_bool_sse4(bool const* mem, T) noexcept {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1) {
+                    auto maskz = _mm_cmpeq_epi8(_mm_loadu_si128((__m128i const*)mem), _mm_set1_epi8(0));
+                    return _mm_xor_si128(maskz, _mm_set1_epi8(-1));
+                }
+                // GCC < 12 has missing or buggy unaligned load intrinsics; use memcpy to work around this.
+                // GCC/Clang/MSVC will turn it into the correct load.
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) {
+                    uint64_t tmp;
+                    memcpy(&tmp, mem, sizeof(tmp));
+                    auto bpack = _mm_cvtsi64_si128(tmp);
+                    return _mm_cmpgt_epi16(_mm_cvtepu8_epi16(bpack), _mm_set1_epi16(0));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) {
+                    uint32_t tmp;
+                    memcpy(&tmp, mem, sizeof(tmp));
+                    auto bpack = _mm_cvtsi32_si128(tmp);
+                    return _mm_cmpgt_epi32(_mm_cvtepu8_epi32(bpack), _mm_set1_epi32(0));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) {
+                    uint16_t tmp;
+                    memcpy(&tmp, mem, sizeof(tmp));
+                    auto bpack = _mm_cvtsi32_si128(tmp);
+                    // _mm_cmpgt_epi64 needs SSE4.2, so build the mask with a SSE4.1-level equality test
+                    auto maskz = _mm_cmpeq_epi64(_mm_cvtepu8_epi64(bpack), _mm_set1_epi64x(0));
+                    return _mm_xor_si128(maskz, _mm_set1_epi8(-1));
+                }
+            }
+        }
+
+        template <class T, class A>
+        XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<sse4_1>) noexcept
+        {
+            return batch_bool_cast<T, A>(detail::load_bool_sse4(mem, T{}), sse4_1{});
+        }
+
         // max
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
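The memcpy idiom referenced in the comments is the usual portable way to spell a narrow unaligned load when older GCC lacks intrinsics such as _mm_loadu_si32; a standalone sketch, not part of the commit:

    #include <cstdint>
    #include <cstring>
    #include <emmintrin.h>

    // Portable unaligned 32-bit load into the low lane of an SSE register.
    // Mainstream compilers fold the memcpy into a single scalar or vector mov.
    static inline __m128i load_low_u32(void const* p)
    {
        std::uint32_t tmp;
        std::memcpy(&tmp, p, sizeof(tmp));
        return _mm_cvtsi32_si128(static_cast<int>(tmp));
    }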

test/test_batch_bool.cpp

Lines changed: 71 additions & 26 deletions
@@ -185,26 +185,22 @@ struct batch_bool_test
     {
     };

-    template <size_t... Values>
-    void check_constructor_from_sequence(std::integral_constant<size_t, 0>, pack<Values...>) const
+    template <typename F, size_t... Values>
+    static batch_bool_type make_batch_impl(F&& f, std::integral_constant<size_t, 0>, pack<Values...>)
     {
-        bool_array_type res = { bool(Values % 3)... };
-        bool_array_type tmp;
-        batch_bool_type b0(bool(Values % 3)...);
-        b0.store_unaligned(tmp.data());
-        INFO("batch_bool(values...)");
-        CHECK_EQ(tmp, res);
-
-        batch_bool_type b1 { bool(Values % 3)... };
-        b1.store_unaligned(tmp.data());
-        INFO("batch_bool{values...}");
-        CHECK_EQ(tmp, res);
+        return batch_bool_type(bool(f(Values))...);
     }

-    template <size_t I, size_t... Values>
-    void check_constructor_from_sequence(std::integral_constant<size_t, I>, pack<Values...>) const
+    template <typename F, size_t I, size_t... Values>
+    static batch_bool_type make_batch_impl(F&& f, std::integral_constant<size_t, I>, pack<Values...>)
     {
-        return check_constructor_from_sequence(std::integral_constant<size_t, I - 1>(), pack<Values..., I>());
+        return make_batch_impl(std::forward<F>(f), std::integral_constant<size_t, I - 1>(), pack<I - 1, Values...>());
+    }
+
+    template <typename F>
+    static batch_bool_type make_batch(F&& f)
+    {
+        return make_batch_impl(std::forward<F>(f), std::integral_constant<size_t, size>(), pack<>{});
     }

     void test_constructors() const
@@ -213,18 +209,38 @@ struct batch_bool_test
         // value uninitialized, cannot test it.
        (void)a;

-        bool_array_type res;
-        batch_bool_type b(true);
-        b.store_unaligned(res.data());
-        INFO("batch_bool{value}");
-        CHECK_EQ(res, all_true);
+        {
+            bool_array_type res;
+            batch_bool_type b(true);
+            b.store_unaligned(res.data());
+            INFO("batch_bool{value}");
+            CHECK_EQ(res, all_true);
+
+            batch_bool_type c { true };
+            c.store_unaligned(res.data());
+            INFO("batch_bool{value}");
+            CHECK_EQ(res, all_true);
+        }
+
+        {
+            auto f_bool = [](size_t i) { return bool(i % 3); };

-        batch_bool_type c { true };
-        c.store_unaligned(res.data());
-        INFO("batch_bool{value}");
-        CHECK_EQ(res, all_true);
+            bool_array_type res;
+            for (size_t i = 0; i < res.size(); i++) {
+                res[i] = f_bool(i);
+            }
+
+            bool_array_type tmp;
+            batch_bool_type b0 = make_batch(f_bool);
+            b0.store_unaligned(tmp.data());
+            INFO("batch_bool(values...)");
+            CHECK_EQ(tmp, res);

-        check_constructor_from_sequence(std::integral_constant<size_t, size>(), pack<>());
+            batch_bool_type b1 = make_batch(f_bool);
+            b1.store_unaligned(tmp.data());
+            INFO("batch_bool{values...}");
+            CHECK_EQ(tmp, res);
+        }
     }

     void test_load_store() const
@@ -239,6 +255,35 @@
         b = batch_bool_type::load_aligned(arhs.data());
         b.store_aligned(ares.data());
         CHECK_EQ(ares, arhs);
+
+        auto bool_g = xsimd::get_bool<batch_bool_type> {};
+        // load/store, almost all false
+        {
+            size_t i = 0;
+            for (const auto& vec : bool_g.almost_all_false()) {
+                batch_bool_type b = batch_bool_type::load_unaligned(vec.data());
+                batch_bool_type expected = make_batch([i](size_t x) { return x == i; });
+                i++;
+                CHECK_UNARY(xsimd::all(b == expected));
+                b.store_unaligned(res.data());
+                // Check that the representation is bitwise exact.
+                CHECK_UNARY(memcmp(res.data(), vec.data(), sizeof(res)) == 0);
+            }
+        }
+
+        // load/store, almost all true
+        {
+            size_t i = 0;
+            for (const auto& vec : bool_g.almost_all_true()) {
+                batch_bool_type b = batch_bool_type::load_unaligned(vec.data());
+                batch_bool_type expected = make_batch([i](size_t x) { return x != i; });
+                i++;
+                CHECK_UNARY(xsimd::all(b == expected));
+                b.store_unaligned(res.data());
+                CHECK_EQ(res, vec);
+                CHECK_UNARY(memcmp(res.data(), vec.data(), sizeof(res)) == 0);
+            }
+        }
     }

     void test_any_all() const
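For reference, the new checks iterate over families of generated bool arrays; a plausible shape for the almost_all_false generator used above (a hypothetical stand-in for the test helper, which is not shown in this commit) is:

    #include <array>
    #include <cstddef>
    #include <vector>

    // Hypothetical stand-in for get_bool<...>::almost_all_false(): for an N-lane
    // batch it yields N arrays, the k-th having only element k set to true, which
    // is what expected = make_batch([i](size_t x) { return x == i; }) assumes.
    template <std::size_t N>
    std::vector<std::array<bool, N>> almost_all_false_arrays()
    {
        std::vector<std::array<bool, N>> out(N);
        for (std::size_t k = 0; k < N; ++k)
        {
            out[k].fill(false);
            out[k][k] = true;
        }
        return out;
    }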
