
Commit 176b336 (1 parent: d897cf9)

fix stupid shuffle issues, clang-format

5 files changed: +94 / -92 lines


include/xsimd/arch/xsimd_avx2.hpp (21 additions, 22 deletions)

@@ -572,7 +572,8 @@ namespace xsimd
         }

         // load_unaligned<batch_bool>
-        namespace detail {
+        namespace detail
+        {
             template <class T>
             XSIMD_INLINE __m256i load_bool_avx2(bool const* mem) noexcept
             {
@@ -605,7 +606,7 @@ namespace xsimd
                 else
                 {
                     assert(false && "unsupported arch/op combination");
-                    return __m256i{};
+                    return __m256i {};
                 }
             }
         }
@@ -982,43 +983,41 @@ namespace xsimd
         }

         // store<batch_bool>
-        namespace detail {
+        namespace detail
+        {
             template <class T>
-            XSIMD_INLINE void store_bool_avx2(__m256i b, bool* mem, T) noexcept {
+            XSIMD_INLINE void store_bool_avx2(__m256i b, bool* mem, T) noexcept
+            {
                 // GCC <12 have missing or buggy unaligned store intrinsics; use memcpy to work around this.
                 // GCC/Clang/MSVC will turn it into the correct store.
                 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                 {
                     // negate mask to convert to 0 or 1
                     auto val = _mm256_sub_epi8(_mm256_set1_epi8(0), b);
                     memcpy(mem, &val, sizeof(val));
+                    return;
                 }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+
+                auto b_hi = _mm256_extractf128_si256(b, 1);
+                auto b_lo = _mm256_castsi256_si128(b);
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                 {
-                    auto packed = _mm256_castsi256_si128(_mm256_packs_epi16(b, b));
-                    auto val = _mm_sub_epi8(_mm_set1_epi8(0), packed);
+                    auto val = _mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(b_lo, b_hi));
                     memcpy(mem, &val, sizeof(val));
                 }
                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                 {
-                    auto bmask = _mm256_set_epi8(
-                        -1, -1, -1, -1, -1, -1, -1, -1,
-                        -1, -1, -1, -1, -1, -1, -1, -1,
-                        -1, -1, -1, -1, -1, -1, -1, -1,
-                        28, 24, 20, 16, 12, 8, 4, 0);
-                    auto packed = _mm256_castsi256_si128(_mm256_shuffle_epi8(b, bmask));
-                    auto val = _mm_cvtsi128_si64(_mm_sub_epi8(_mm_set1_epi8(0), packed));
+                    auto pack_16 = _mm_packs_epi32(b_lo, b_hi);
+                    auto val = _mm_cvtsi128_si64(_mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(pack_16, pack_16)));
                     memcpy(mem, &val, sizeof(val));
                 }
                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
                 {
-                    auto bmask = _mm256_set_epi8(
-                        -1, -1, -1, -1, -1, -1, -1, -1,
-                        -1, -1, -1, -1, -1, -1, -1, -1,
+                    const auto bmask = _mm_set_epi8(
                         -1, -1, -1, -1, -1, -1, -1, -1,
-                        -1, -1, -1, -1, 24, 16, 8, 0);
-                    auto packed = _mm256_castsi256_si128(_mm256_shuffle_epi8(b, bmask));
-                    uint32_t val = _mm_cvtsi128_si32(_mm_sub_epi8(_mm_set1_epi8(0), packed));
+                        -1, -1, -1, -1, -1, -1, 8, 0);
+                    auto pack = _mm_unpacklo_epi16(_mm_shuffle_epi8(b_lo, bmask), _mm_shuffle_epi8(b_hi, bmask));
+                    uint32_t val = _mm_cvtsi128_si32(_mm_sub_epi8(_mm_set1_epi8(0), pack));
                     memcpy(mem, &val, sizeof(val));
                 }
                 else
@@ -1027,15 +1026,15 @@ namespace xsimd
                 }
             }

-            XSIMD_INLINE __m256i avx_to_i(__m256 x) { return _mm256_castps_si256(x); }
+            XSIMD_INLINE __m256i avx_to_i(__m256 x) { return _mm256_castps_si256(x); }
             XSIMD_INLINE __m256i avx_to_i(__m256d x) { return _mm256_castpd_si256(x); }
             XSIMD_INLINE __m256i avx_to_i(__m256i x) { return x; }
         }

         template <class T, class A>
         XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<avx2>) noexcept
         {
-            detail::store_bool_avx2(detail::avx_to_i(b), mem, T{});
+            detail::store_bool_avx2(detail::avx_to_i(b), mem, T {});
         }

         // ssub

include/xsimd/arch/xsimd_sse2.hpp (49 additions, 0 deletions)

@@ -1697,6 +1697,55 @@ namespace xsimd
             }
         }

+        // store<batch_bool>
+        namespace detail
+        {
+            template <class T>
+            XSIMD_INLINE void store_bool_sse2(__m128i b, bool* mem, T) noexcept
+            {
+                // GCC <12 have missing or buggy unaligned store intrinsics; use memcpy to work around this.
+                // GCC/Clang/MSVC will turn it into the correct store.
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                {
+                    // negate mask to convert to 0 or 1
+                    auto val = _mm_sub_epi8(_mm_set1_epi8(0), b);
+                    memcpy(mem, &val, sizeof(val));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                {
+                    uint64_t val = _mm_cvtsi128_si64(_mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(b, b)));
+                    memcpy(mem, &val, sizeof(val));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    auto pack_16 = _mm_packs_epi32(b, b);
+                    uint32_t val = _mm_cvtsi128_si32(_mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(pack_16, pack_16)));
+                    memcpy(mem, &val, sizeof(val));
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    auto pack_32 = _mm_packs_epi32(b, b);
+                    auto pack_16 = _mm_packs_epi32(pack_32, pack_32);
+                    uint16_t val = _mm_cvtsi128_si32(_mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(pack_16, pack_16)));
+                    memcpy(mem, &val, sizeof(val));
+                }
+                else
+                {
+                    assert(false && "unsupported arch/op combination");
+                }
+            }
+
+            XSIMD_INLINE __m128i sse_to_i(__m128 x) { return _mm_castps_si128(x); }
+            XSIMD_INLINE __m128i sse_to_i(__m128d x) { return _mm_castpd_si128(x); }
+            XSIMD_INLINE __m128i sse_to_i(__m128i x) { return x; }
+        }
+
+        template <class T, class A>
+        XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<sse2>) noexcept
+        {
+            detail::store_bool_sse2(detail::sse_to_i(b), mem, T {});
+        }
+
         // store_aligned
         template <class A>
         XSIMD_INLINE void store_aligned(float* mem, batch<float, A> const& self, requires_arch<sse2>) noexcept
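
With this kernel in the SSE2 header, storing a `batch_bool` to a `bool` array no longer needs SSSE3's byte shuffle; saturating packs, available in plain SSE2, are enough because mask lanes are only ever 0 or -1. The sketch below shows the public API this backend serves; the values and the explicit `xsimd::sse2` architecture parameter are chosen for illustration, and on a wider build the same call would dispatch to a wider kernel (for example the AVX2 one above) instead.

```cpp
#include <xsimd/xsimd.hpp>
#include <cstdio>

int main()
{
    // Pin the SSE2 backend for the sake of the example (requires an SSE2-capable target).
    using batch = xsimd::batch<int32_t, xsimd::sse2>;
    using batch_bool = xsimd::batch_bool<int32_t, xsimd::sse2>;

    batch a { 1, 5, 3, 7 };
    batch b { 4, 4, 4, 4 };
    batch_bool mask = a > b; // lanes: false, true, false, true

    bool out[batch::size];
    mask.store_unaligned(out); // should resolve to the new SSE2 store<batch_bool> kernel
    for (bool v : out)
        std::printf("%d ", v); // prints: 0 1 0 1
    std::printf("\n");
    return 0;
}
```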

include/xsimd/arch/xsimd_sse4_1.hpp (11 additions, 8 deletions)

@@ -123,37 +123,40 @@ namespace xsimd
         }

         // load_unaligned<batch_bool>
-        namespace detail {
+        namespace detail
+        {
             template <class T>
             XSIMD_INLINE __m128i load_bool_sse4_1(bool const* mem) noexcept
             {
                 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                 {
-                    auto maskz = _mm_cmpeq_epi8(_mm_loadu_si128((__m128i const*)mem), _mm_set1_epi8(0));
-                    return _mm_xor_si128(maskz, _mm_set1_epi8(-1));
+                    return _mm_sub_epi8(_mm_set1_epi8(0), _mm_loadu_si128((__m128i const*)mem));
                 }
                 // GCC <12 have missing or buggy unaligned load intrinsics; use memcpy to work around this.
                 // GCC/Clang/MSVC will turn it into the correct load.
                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                 {
                     uint64_t tmp;
                     memcpy(&tmp, mem, sizeof(tmp));
-                    auto bpack = _mm_cvtsi64_si128(tmp);
-                    return _mm_cmpgt_epi16(_mm_cvtepu8_epi16(bpack), _mm_set1_epi16(0));
+                    return _mm_sub_epi16(_mm_set1_epi8(0), _mm_cvtepu8_epi16(_mm_cvtsi64_si128(tmp)));
                 }
                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                 {
                     uint32_t tmp;
                     memcpy(&tmp, mem, sizeof(tmp));
                     auto bpack = _mm_cvtsi32_si128(tmp);
-                    return _mm_cmpgt_epi32(_mm_cvtepu8_epi32(bpack), _mm_set1_epi32(0));
+                    return _mm_sub_epi32(_mm_set1_epi8(0), _mm_cvtepu8_epi32(_mm_cvtsi32_si128(tmp)));
                 }
                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
                 {
                     uint16_t tmp;
                     memcpy(&tmp, mem, sizeof(tmp));
-                    auto bpack = _mm_cvtsi32_si128((uint32_t)tmp);
-                    return _mm_cmpgt_epi64(_mm_cvtepu8_epi64(bpack), _mm_set1_epi64x(0));
+                    return _mm_sub_epi64(_mm_set1_epi8(0), _mm_cvtepu8_epi64(_mm_cvtsi32_si128((uint32_t)tmp)));
+                }
+                else
+                {
+                    assert(false && "unsupported arch/op combination");
+                    return __m128i {};
                 }
             }
         }
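
The load path changes in the same spirit: instead of widening the 0/1 bytes and then comparing against zero, the new code widens with `_mm_cvtepu8_epi16/32/64` and subtracts from zero, which already yields the 0 / all-ones mask `batch_bool` expects. A scalar illustration of that identity, plain C++ with no intrinsics, just to spell out the arithmetic:

```cpp
#include <cstdint>
#include <cstdio>

int main()
{
    // A bool loaded from memory is the byte 0 or 1. After zero-extending it to
    // the element width, 0 - x is 0x0000 or 0xFFFF: the full-width lane mask,
    // with no compare instruction needed.
    const uint16_t inputs[2] = { 0, 1 };
    for (uint16_t x : inputs)
    {
        const uint16_t mask = static_cast<uint16_t>(0u - x);
        std::printf("bool %u -> mask 0x%04X\n", static_cast<unsigned>(x), static_cast<unsigned>(mask));
    }
    return 0;
}
```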

include/xsimd/arch/xsimd_ssse3.hpp (0 additions, 54 deletions)

@@ -128,60 +128,6 @@ namespace xsimd
             return bitwise_cast<int16_t>(rotate_left<N, A>(bitwise_cast<uint16_t>(self), ssse3 {}));
         }

-        // store<batch_bool>
-        namespace detail {
-            template <class T>
-            XSIMD_INLINE void store_bool_ssse3(__m128i b, bool* mem, T) noexcept
-            {
-                // GCC <12 have missing or buggy unaligned store intrinsics; use memcpy to work around this.
-                // GCC/Clang/MSVC will turn it into the correct store.
-                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
-                {
-                    // negate mask to convert to 0 or 1
-                    auto val = _mm_sub_epi8(_mm_set1_epi8(0), b);
-                    memcpy(mem, &val, sizeof(val));
-                }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
-                {
-                    auto packed = _mm_packs_epi16(b, b);
-                    uint64_t val = _mm_cvtsi128_si64(_mm_sub_epi8(_mm_set1_epi8(0), packed));
-                    memcpy(mem, &val, sizeof(val));
-                }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
-                {
-                    const auto bmask = _mm_set_epi8(
-                        -1, -1, -1, -1, -1, -1, -1, -1,
-                        -1, -1, -1, -1, 12, 8, 4, 0);
-                    auto packed = _mm_shuffle_epi8(b, bmask);
-                    uint32_t val = _mm_cvtsi128_si32(_mm_sub_epi8(_mm_set1_epi8(0), packed));
-                    memcpy(mem, &val, sizeof(val));
-                }
-                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
-                {
-                    const auto bmask = _mm_set_epi8(
-                        -1, -1, -1, -1, -1, -1, -1, -1,
-                        -1, -1, -1, -1, -1, -1, 8, 0);
-                    auto packed = _mm_shuffle_epi8(b, bmask);
-                    uint16_t val = _mm_cvtsi128_si32(_mm_sub_epi8(_mm_set1_epi8(0), packed));
-                    memcpy(mem, &val, sizeof(val));
-                }
-                else
-                {
-                    assert(false && "unsupported arch/op combination");
-                }
-            }
-
-            XSIMD_INLINE __m128i sse_to_i(__m128 x) { return _mm_castps_si128(x); }
-            XSIMD_INLINE __m128i sse_to_i(__m128d x) { return _mm_castpd_si128(x); }
-            XSIMD_INLINE __m128i sse_to_i(__m128i x) { return x; }
-        }
-
-        template <class T, class A>
-        XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<sse3>) noexcept
-        {
-            detail::store_bool_ssse3(detail::sse_to_i(b), mem, T{});
-        }
-
         // swizzle (dynamic mask)
         template <class A>
         XSIMD_INLINE batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch<uint8_t, A> mask, requires_arch<ssse3>) noexcept
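
Dropping this overload should be safe for dispatch: xsimd's architecture tags are related by inheritance, so a `store<batch_bool>` request for an SSE3 or SSSE3 target now falls through to the SSE2 kernel added in xsimd_sse2.hpp. A minimal sketch of that mechanism with stand-in tag names, not xsimd's real definitions:

```cpp
#include <cstdio>

// Stand-in tags only; xsimd's real architecture types carry more machinery.
// With tags related by inheritance, removing the ssse3 overload of a kernel
// makes calls for ssse3 resolve to the closest base that still provides one.
struct sse2_tag { };
struct sse3_tag : sse2_tag { };
struct ssse3_tag : sse3_tag { };

static void store_bool_kernel(sse2_tag) { std::printf("sse2 kernel\n"); }
// (an ssse3_tag overload used to live here; it is gone now)

int main()
{
    store_bool_kernel(ssse3_tag {}); // derived-to-base conversion: prints "sse2 kernel"
    return 0;
}
```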

test/test_batch_bool.cpp (13 additions, 8 deletions)

@@ -200,7 +200,7 @@ struct batch_bool_test
     template <typename F>
     static batch_bool_type make_batch(F&& f)
     {
-        return make_batch_impl(std::forward<F>(f), std::integral_constant<size_t, size>(), pack<>{});
+        return make_batch_impl(std::forward<F>(f), std::integral_constant<size_t, size>(), pack<> {});
     }

     void test_constructors() const
@@ -223,10 +223,12 @@ struct batch_bool_test
         }

         {
-            auto f_bool = [](size_t i) { return bool(i % 3); };
+            auto f_bool = [](size_t i)
+            { return bool(i % 3); };

             bool_array_type res;
-            for (size_t i = 0; i < res.size(); i++) {
+            for (size_t i = 0; i < res.size(); i++)
+            {
                 res[i] = f_bool(i);
             }

@@ -260,9 +262,11 @@ struct batch_bool_test
         // load/store, almost all false
         {
             size_t i = 0;
-            for (const auto& vec : bool_g.almost_all_false()) {
+            for (const auto& vec : bool_g.almost_all_false())
+            {
                 batch_bool_type b = batch_bool_type::load_unaligned(vec.data());
-                batch_bool_type expected = make_batch([i](size_t x) { return x == i; });
+                batch_bool_type expected = make_batch([i](size_t x)
+                                                      { return x == i; });
                 i++;
                 CHECK_UNARY(xsimd::all(b == expected));
                 b.store_unaligned(res.data());
@@ -274,13 +278,14 @@ struct batch_bool_test
         // load/store, almost all true
         {
             size_t i = 0;
-            for (const auto& vec : bool_g.almost_all_true()) {
+            for (const auto& vec : bool_g.almost_all_true())
+            {
                 batch_bool_type b = batch_bool_type::load_unaligned(vec.data());
-                batch_bool_type expected = make_batch([i](size_t x) { return x != i; });
+                batch_bool_type expected = make_batch([i](size_t x)
+                                                      { return x != i; });
                 i++;
                 CHECK_UNARY(xsimd::all(b == expected));
                 b.store_unaligned(res.data());
-                CHECK_EQ(res, vec);
                 CHECK_UNARY(memcmp(res.data(), vec.data(), sizeof(res)) == 0);
             }
         }
