Skip to content

Commit ebf7519

Browse files
committed
Use SSE2NEON movemask kernels
1 parent d205b15 commit ebf7519

File tree

1 file changed

+32
-27
lines changed

1 file changed

+32
-27
lines changed

include/xsimd/arch/xsimd_neon.hpp

Lines changed: 32 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -3293,56 +3293,61 @@ namespace xsimd
32933293
template <class A, class T, detail::enable_sized_t<T, 1> = 0>
32943294
XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<neon>) noexcept
32953295
{
3296-
uint8x16_t inner = self;
3296+
// From https://github.com/DLTcollab/sse2neon/blob/master/sse2neon.h
// (The notes below describe the added `+` lines of this diff.)
// Collects the top bit of each of the 16 byte lanes into an integer:
//  1. vshrq_n_u8(self, 7) leaves 0 or 1 in every byte.
//  2. With the vector viewed as two 64-bit halves, each vsraq_n_u64 adds a
//     right-shifted copy of `bits` to itself. Shifting by 7, 14 and 28
//     gathers the eight per-byte flags of a half into the low 8 bits of
//     that half; the shifted copies never land on an already-set bit, so
//     the addition behaves like a bitwise OR.
//  3. One byte is read from each half; the byte from the upper half is
//     moved to bits 8..15 (`<<` binds tighter than `|` in the return).
// Without detail::do_swap, bit i of the result is the top bit of lane i.
// NOTE(review): with detail::do_swap the flags are byte-reversed inside each
// 64-bit half first and lanes 7 / 15 are read instead of 0 / 8 -- presumably
// to compensate for big-endian lane numbering; confirm on such a target.
3297+
uint8x16_t msbs = vshrq_n_u8(self, 7);
32973298
XSIMD_IF_CONSTEXPR(detail::do_swap)
32983299
{
3299-
inner = vrev16q_u8(inner);
3300+
msbs = vrev64q_u8(msbs);
33003301
}
33013302

3302-
uint16x8_t pairs = vreinterpretq_u16_u8(inner);
3303-
uint8x8_t narrowed = vshrn_n_u16(pairs, 4);
3304-
XSIMD_IF_CONSTEXPR(detail::do_swap)
3305-
{
3306-
narrowed = vrev64_u8(narrowed);
3307-
}
3303+
uint64x2_t bits = vreinterpretq_u64_u8(msbs);
3304+
bits = vsraq_n_u64(bits, bits, 7);
3305+
bits = vsraq_n_u64(bits, bits, 14);
3306+
bits = vsraq_n_u64(bits, bits, 28);
3307+
3308+
uint8x16_t output = vreinterpretq_u8_u64(bits);
3309+
constexpr int offset = detail::do_swap ? 7 : 0;
33083310

3309-
uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(narrowed), 0);
3310-
mask &= 0x1111111111111111;
3311-
mask = mask | mask >> 3;
3312-
mask = (mask | mask >> 6) & 0x000F000F000F000F;
3313-
mask = (mask | mask >> 12) & 0x000000FF000000FF;
3314-
return (mask | mask >> 24) & 0xFFFF;
3311+
return vgetq_lane_u8(output, offset) | vgetq_lane_u8(output, offset + 8) << 8;
33153312
}
33163313

33173314
template <class A, class T, detail::enable_sized_t<T, 2> = 0>
33183315
XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<neon>) noexcept
33193316
{
3320-
uint8x8_t narrowed = vmovn_u16(self);
3317+
// Adapted from https://github.com/DLTcollab/sse2neon/blob/master/sse2neon.h
// (The notes below describe the added `+` lines of this diff.)
// Collects the top bit of each of the 8 16-bit lanes into an integer:
//  1. vshrq_n_u16(self, 15) leaves 0 or 1 in every 16-bit lane.
//  2. With the vector viewed as two 64-bit halves, adding copies of `bits`
//     shifted right by 15 and then by 30 gathers the four per-lane flags of
//     a half into the low 4 bits of that half; the shifted copies never
//     land on an already-set bit, so the addition behaves like an OR.
//  3. One byte is read from each half; the byte from the upper half is
//     moved to bits 4..7 (`<<` binds tighter than `|` in the return).
// Without detail::do_swap, bit i of the result is the top bit of lane i.
// NOTE(review): with detail::do_swap the 16-bit lanes are reversed inside
// each 64-bit half first and bytes 7 / 15 are read instead of 0 / 8 --
// presumably to compensate for big-endian lane numbering; confirm.
3318+
uint16x8_t msbs = vshrq_n_u16(self, 15);
33213319
XSIMD_IF_CONSTEXPR(detail::do_swap)
33223320
{
3323-
narrowed = vrev64_u8(narrowed);
3321+
msbs = vrev64q_u16(msbs);
33243322
}
33253323

3326-
uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(narrowed), 0);
3327-
mask &= 0x0101010101010101;
3328-
mask = mask | mask >> 7;
3329-
mask = mask | mask >> 14;
3330-
return (mask | mask >> 28) & 0xFF;
3324+
uint64x2_t bits = vreinterpretq_u64_u16(msbs);
3325+
bits = vsraq_n_u64(bits, bits, 15);
3326+
bits = vsraq_n_u64(bits, bits, 30);
3327+
3328+
uint8x16_t output = vreinterpretq_u8_u64(bits);
3329+
constexpr int offset = detail::do_swap ? 7 : 0;
3330+
3331+
return vgetq_lane_u8(output, offset) | vgetq_lane_u8(output, offset + 8) << 4;
33313332
}
33323333

33333334
template <class A, class T, detail::enable_sized_t<T, 4> = 0>
33343335
XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<neon>) noexcept
33353336
{
3336-
uint16x4_t narrowed = vmovn_u32(self);
3337+
// Adapted from https://github.com/DLTcollab/sse2neon/blob/master/sse2neon.h
// (The notes below describe the added `+` lines of this diff.)
// Collects the top bit of each of the 4 32-bit lanes into an integer:
//  1. vshrq_n_u32(self, 31) leaves 0 or 1 in every 32-bit lane.
//  2. With the vector viewed as two 64-bit halves, adding a copy of `bits`
//     shifted right by 31 places the flag of the higher lane of a half at
//     bit 1, next to the flag of the lower lane at bit 0.
//  3. One byte is read from each half; the byte from the upper half is
//     moved to bits 2..3 (`<<` binds tighter than `|` in the return).
// Without detail::do_swap, bit i of the result is the top bit of lane i.
// NOTE(review): with detail::do_swap the two 32-bit lanes of each 64-bit
// half are exchanged first and bytes 7 / 15 are read instead of 0 / 8 --
// presumably to compensate for big-endian lane numbering; confirm.
3338+
uint32x4_t msbs = vshrq_n_u32(self, 31);
33373339
XSIMD_IF_CONSTEXPR(detail::do_swap)
33383340
{
3339-
narrowed = vrev64_u16(narrowed);
3341+
msbs = vrev64q_u32(msbs);
33403342
}
33413343

3342-
uint64_t mask = vget_lane_u64(vreinterpret_u64_u16(narrowed), 0);
3343-
mask &= 0x0001000100010001;
3344-
mask = mask | mask >> 15;
3345-
return (mask | mask >> 30) & 0xF;
3344+
uint64x2_t bits = vreinterpretq_u64_u32(msbs);
3345+
bits = vsraq_n_u64(bits, bits, 31);
3346+
3347+
uint8x16_t output = vreinterpretq_u8_u64(bits);
3348+
constexpr int offset = detail::do_swap ? 7 : 0;
3349+
3350+
return vgetq_lane_u8(output, offset) | vgetq_lane_u8(output, offset + 8) << 2;
33463351
}
33473352

33483353
template <class A, class T, detail::enable_sized_t<T, 8> = 0>

0 commit comments

Comments
 (0)