Skip to content

Commit 6f7ee16

Browse files
Implement optimized movemasks for NEON (#1236)
Based on the implementation from https://github.com/DLTcollab/sse2neon/
1 parent fa3d5f9 commit 6f7ee16

File tree

4 files changed

+92
-16
lines changed

4 files changed

+92
-16
lines changed

include/xsimd/arch/common/xsimd_common_logical.hpp

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -72,8 +72,7 @@ namespace xsimd
7272
XSIMD_INLINE batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<common>) noexcept
7373
{
7474
alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
75-
// This is inefficient but should never be called. It's just a
76-
// temporary implementation until arm support is added.
75+
// This is inefficient and should never be called.
7776
for (size_t i = 0; i < batch_bool<T, A>::size; ++i)
7877
buffer[i] = mask & (1ull << i);
7978
return batch_bool<T, A>::load_aligned(buffer);
@@ -204,8 +203,7 @@ namespace xsimd
204203
{
205204
alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
206205
self.store_aligned(buffer);
207-
// This is inefficient but should never be called. It's just a
208-
// temporary implementation until arm support is added.
206+
// This is inefficient and should never be called.
209207
uint64_t res = 0;
210208
for (size_t i = 0; i < batch_bool<T, A>::size; ++i)
211209
if (buffer[i])

include/xsimd/arch/xsimd_neon.hpp

Lines changed: 79 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3278,6 +3278,85 @@ namespace xsimd
32783278
return { batch<widen_t<T>, A>(vaddl_u32(vget_low_u32(x), vdup_n_u32(0))), batch<widen_t<T>, A>(vaddl_u32(vget_high_u32(x), vdup_n_u32(0))) };
32793279
}
32803280

3281+
/********
3282+
* mask *
3283+
********/
3284+
namespace detail
{
    // Compile-time flag consulted by the NEON mask kernels: it is false when
    // XSIMD_LITTLE_ENDIAN is defined and true otherwise. When true, the
    // kernels reverse the elements inside each 64-bit half and read the
    // packed bits from byte lane 7 instead of byte lane 0.
    static constexpr bool do_swap =
#if defined(XSIMD_LITTLE_ENDIAN)
        false;
#else
        true;
#endif
}
3292+
3293+
template <class A, class T, detail::enable_sized_t<T, 1> = 0>
3294+
XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<neon>) noexcept
3295+
{
3296+
// From https://github.com/DLTcollab/sse2neon/blob/master/sse2neon.h
3297+
uint8x16_t msbs = vshrq_n_u8(self, 7);
3298+
XSIMD_IF_CONSTEXPR(detail::do_swap)
3299+
{
3300+
msbs = vrev64q_u8(msbs);
3301+
}
3302+
3303+
uint64x2_t bits = vreinterpretq_u64_u8(msbs);
3304+
bits = vsraq_n_u64(bits, bits, 7);
3305+
bits = vsraq_n_u64(bits, bits, 14);
3306+
bits = vsraq_n_u64(bits, bits, 28);
3307+
3308+
uint8x16_t output = vreinterpretq_u8_u64(bits);
3309+
constexpr int offset = detail::do_swap ? 7 : 0;
3310+
3311+
return vgetq_lane_u8(output, offset) | vgetq_lane_u8(output, offset + 8) << 8;
3312+
}
3313+
3314+
template <class A, class T, detail::enable_sized_t<T, 2> = 0>
3315+
XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<neon>) noexcept
3316+
{
3317+
// Adapted from https://github.com/DLTcollab/sse2neon/blob/master/sse2neon.h
3318+
uint16x8_t msbs = vshrq_n_u16(self, 15);
3319+
XSIMD_IF_CONSTEXPR(detail::do_swap)
3320+
{
3321+
msbs = vrev64q_u16(msbs);
3322+
}
3323+
3324+
uint64x2_t bits = vreinterpretq_u64_u16(msbs);
3325+
bits = vsraq_n_u64(bits, bits, 15);
3326+
bits = vsraq_n_u64(bits, bits, 30);
3327+
3328+
uint8x16_t output = vreinterpretq_u8_u64(bits);
3329+
constexpr int offset = detail::do_swap ? 7 : 0;
3330+
3331+
return vgetq_lane_u8(output, offset) | vgetq_lane_u8(output, offset + 8) << 4;
3332+
}
3333+
3334+
template <class A, class T, detail::enable_sized_t<T, 4> = 0>
3335+
XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<neon>) noexcept
3336+
{
3337+
// Adapted from https://github.com/DLTcollab/sse2neon/blob/master/sse2neon.h
3338+
uint32x4_t msbs = vshrq_n_u32(self, 31);
3339+
XSIMD_IF_CONSTEXPR(detail::do_swap)
3340+
{
3341+
msbs = vrev64q_u32(msbs);
3342+
}
3343+
3344+
uint64x2_t bits = vreinterpretq_u64_u32(msbs);
3345+
bits = vsraq_n_u64(bits, bits, 31);
3346+
3347+
uint8x16_t output = vreinterpretq_u8_u64(bits);
3348+
constexpr int offset = detail::do_swap ? 7 : 0;
3349+
3350+
return vgetq_lane_u8(output, offset) | vgetq_lane_u8(output, offset + 8) << 2;
3351+
}
3352+
3353+
template <class A, class T, detail::enable_sized_t<T, 8> = 0>
3354+
XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<neon>) noexcept
3355+
{
3356+
uint64_t mask_lo = vgetq_lane_u64(self, 0);
3357+
uint64_t mask_hi = vgetq_lane_u64(self, 1);
3358+
return ((mask_lo >> 63) | (mask_hi << 1)) & 0x3;
3359+
}
32813360
}
32823361

32833362
}

include/xsimd/config/xsimd_config.hpp

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,17 @@
1616
#define XSIMD_VERSION_MINOR 0
1717
#define XSIMD_VERSION_PATCH 0
1818

19+
// Endianness detection: XSIMD_LITTLE_ENDIAN is defined when the target is
// recognised as little-endian and is left undefined otherwise.
#if defined(__GNUC__) && defined(__BYTE_ORDER__)
// The compiler reports the byte order itself; no other heuristic is
// consulted in this case.
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define XSIMD_LITTLE_ENDIAN
#endif
#elif defined(_WIN32) || defined(i386) || defined(i486) || defined(intel) || defined(x86) || defined(i86pc) || defined(__alpha) || defined(__osf__)
// We can safely assume that Windows is always little endian; the remaining
// macros identify targets that are likewise treated as little-endian.
#define XSIMD_LITTLE_ENDIAN
#endif
29+
1930
/**
2031
* high level free functions
2132
*

include/xsimd/math/xsimd_rem_pio2.hpp

Lines changed: 0 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -47,17 +47,6 @@ namespace xsimd
4747
* ====================================================
4848
*/
4949

50-
#if defined(__GNUC__) && defined(__BYTE_ORDER__)
51-
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
52-
#define XSIMD_LITTLE_ENDIAN
53-
#endif
54-
#elif defined(_WIN32)
55-
// We can safely assume that Windows is always little endian
56-
#define XSIMD_LITTLE_ENDIAN
57-
#elif defined(i386) || defined(i486) || defined(intel) || defined(x86) || defined(i86pc) || defined(__alpha) || defined(__osf__)
58-
#define XSIMD_LITTLE_ENDIAN
59-
#endif
60-
6150
#ifdef XSIMD_LITTLE_ENDIAN
6251
#define LOW_WORD_IDX 0
6352
#define HIGH_WORD_IDX sizeof(std::uint32_t)
@@ -708,7 +697,6 @@ namespace xsimd
708697
}
709698
}
710699

711-
#undef XSIMD_LITTLE_ENDIAN
712700
#undef SET_LOW_WORD
713701
#undef SET_HIGH_WORD
714702
#undef GET_LOW_WORD

0 commit comments

Comments
 (0)