@@ -3293,56 +3293,61 @@ namespace xsimd
// Review note: this function is shown in diff form. Lines whose number is
// followed by '-' are the removed implementation, lines marked '+' are its
// replacement; the comments added here describe the '+' version.
//
// mask() overload for batch_bool with 1-byte elements on NEON: produces a
// uint64_t whose low 16 bits carry one bit per element, taken from the most
// significant bit of each byte of `self`.
32933293 template <class A , class T , detail::enable_sized_t <T, 1 > = 0 >
32943294 XSIMD_INLINE uint64_t mask (batch_bool<T, A> const & self, requires_arch<neon>) noexcept
32953295 {
3296- uint8x16_t inner = self;
3296+ // From https://github.com/DLTcollab/sse2neon/blob/master/sse2neon.h
// Logical shift right by 7 leaves each byte holding only its former top bit (0 or 1).
3297+ uint8x16_t msbs = vshrq_n_u8 (self, 7 );
// NOTE(review): detail::do_swap is defined outside this view; presumably it
// flags a byte-order difference on the target - confirm. When set, the bytes
// are reversed inside each 64-bit half before being packed.
32973298 XSIMD_IF_CONSTEXPR (detail::do_swap)
32983299 {
3299- inner = vrev16q_u8 (inner );
3300+ msbs = vrev64q_u8 (msbs );
33003301 }
33013302
3302- uint16x8_t pairs = vreinterpretq_u16_u8 (inner);
3303- uint8x8_t narrowed = vshrn_n_u16 (pairs, 4 );
3304- XSIMD_IF_CONSTEXPR (detail::do_swap)
3305- {
3306- narrowed = vrev64_u8 (narrowed);
3307- }
// vsraq_n_u64(a, b, n) is NEON shift-right-and-accumulate: a + (b >> n) on
// each 64-bit lane. Because every byte holds 0 or 1, the added terms never
// overlap (no carries); the steps 7, 14, 28 slide the eight flag bits of each
// 64-bit lane next to each other, so the least-significant byte of each lane
// ends up holding one bit per original byte.
3303+ uint64x2_t bits = vreinterpretq_u64_u8 (msbs);
3304+ bits = vsraq_n_u64 (bits, bits, 7 );
3305+ bits = vsraq_n_u64 (bits, bits, 14 );
3306+ bits = vsraq_n_u64 (bits, bits, 28 );
3307+
// Byte index read from each 64-bit half: 7 (and 15) when detail::do_swap is
// true, 0 (and 8) otherwise.
3308+ uint8x16_t output = vreinterpretq_u8_u64 (bits);
3309+ constexpr int offset = detail::do_swap ? 7 : 0 ;
33083310
3309- uint64_t mask = vget_lane_u64 (vreinterpret_u64_u8 (narrowed), 0 );
3310- mask &= 0x1111111111111111 ;
3311- mask = mask | mask >> 3 ;
3312- mask = (mask | mask >> 6 ) & 0x000F000F000F000F ;
3313- mask = (mask | mask >> 12 ) & 0x000000FF000000FF ;
3314- return (mask | mask >> 24 ) & 0xFFFF ;
// `<<` binds tighter than `|`: the byte at `offset` supplies result bits 0-7
// and the byte at `offset + 8` is shifted up to supply bits 8-15.
3311+ return vgetq_lane_u8 (output, offset) | vgetq_lane_u8 (output, offset + 8 ) << 8 ;
33153312 }
33163313
// Review note: this function is shown in diff form. Lines whose number is
// followed by '-' are the removed implementation, lines marked '+' are its
// replacement; the comments added here describe the '+' version.
//
// mask() overload for batch_bool with 2-byte elements on NEON: produces a
// uint64_t whose low 8 bits carry one bit per element, taken from the most
// significant bit of each 16-bit lane of `self`.
33173314 template <class A , class T , detail::enable_sized_t <T, 2 > = 0 >
33183315 XSIMD_INLINE uint64_t mask (batch_bool<T, A> const & self, requires_arch<neon>) noexcept
33193316 {
3320- uint8x8_t narrowed = vmovn_u16 (self);
3317+ // Adapted from https://github.com/DLTcollab/sse2neon/blob/master/sse2neon.h
// Logical shift right by 15 leaves each 16-bit lane holding only its former top bit (0 or 1).
3318+ uint16x8_t msbs = vshrq_n_u16 (self, 15 );
// NOTE(review): detail::do_swap is defined outside this view; presumably it
// flags a byte-order difference on the target - confirm. When set, the 16-bit
// lanes are reversed inside each 64-bit half before being packed.
33213319 XSIMD_IF_CONSTEXPR (detail::do_swap)
33223320 {
3323- narrowed = vrev64_u8 (narrowed );
3321+ msbs = vrev64q_u16 (msbs );
33243322 }
33253323
3326- uint64_t mask = vget_lane_u64 (vreinterpret_u64_u8 (narrowed), 0 );
3327- mask &= 0x0101010101010101 ;
3328- mask = mask | mask >> 7 ;
3329- mask = mask | mask >> 14 ;
3330- return (mask | mask >> 28 ) & 0xFF ;
// vsraq_n_u64(a, b, n) is NEON shift-right-and-accumulate: a + (b >> n) on
// each 64-bit lane. With every 16-bit lane holding 0 or 1 the added terms
// never overlap; the steps 15 and 30 gather the four flag bits of each 64-bit
// lane into bits 0-3 of that lane's least-significant byte.
3324+ uint64x2_t bits = vreinterpretq_u64_u16 (msbs);
3325+ bits = vsraq_n_u64 (bits, bits, 15 );
3326+ bits = vsraq_n_u64 (bits, bits, 30 );
3327+
// Byte index read from each 64-bit half: 7 (and 15) when detail::do_swap is
// true, 0 (and 8) otherwise.
3328+ uint8x16_t output = vreinterpretq_u8_u64 (bits);
3329+ constexpr int offset = detail::do_swap ? 7 : 0 ;
3330+
// `<<` binds tighter than `|`: the byte at `offset` supplies result bits 0-3
// and the byte at `offset + 8` is shifted up to supply bits 4-7.
3331+ return vgetq_lane_u8 (output, offset) | vgetq_lane_u8 (output, offset + 8 ) << 4 ;
33313332 }
33323333
// Review note: this function is shown in diff form. Lines whose number is
// followed by '-' are the removed implementation, lines marked '+' are its
// replacement; the comments added here describe the '+' version.
//
// mask() overload for batch_bool with 4-byte elements on NEON: produces a
// uint64_t whose low 4 bits carry one bit per element, taken from the most
// significant bit of each 32-bit lane of `self`.
33333334 template <class A , class T , detail::enable_sized_t <T, 4 > = 0 >
33343335 XSIMD_INLINE uint64_t mask (batch_bool<T, A> const & self, requires_arch<neon>) noexcept
33353336 {
3336- uint16x4_t narrowed = vmovn_u32 (self);
3337+ // Adapted from https://github.com/DLTcollab/sse2neon/blob/master/sse2neon.h
// Logical shift right by 31 leaves each 32-bit lane holding only its former top bit (0 or 1).
3338+ uint32x4_t msbs = vshrq_n_u32 (self, 31 );
// NOTE(review): detail::do_swap is defined outside this view; presumably it
// flags a byte-order difference on the target - confirm. When set, the two
// 32-bit lanes of each 64-bit half are exchanged before being packed.
33373339 XSIMD_IF_CONSTEXPR (detail::do_swap)
33383340 {
3339- narrowed = vrev64_u16 (narrowed );
3341+ msbs = vrev64q_u32 (msbs );
33403342 }
33413343
3342- uint64_t mask = vget_lane_u64 (vreinterpret_u64_u16 (narrowed), 0 );
3343- mask &= 0x0001000100010001 ;
3344- mask = mask | mask >> 15 ;
3345- return (mask | mask >> 30 ) & 0xF ;
// vsraq_n_u64(a, b, n) is NEON shift-right-and-accumulate: a + (b >> n) on
// each 64-bit lane. A single step of 31 places the upper 32-bit lane's flag at
// bit 1, next to the lower lane's flag at bit 0, in the least-significant byte.
3344+ uint64x2_t bits = vreinterpretq_u64_u32 (msbs);
3345+ bits = vsraq_n_u64 (bits, bits, 31 );
3346+
// Byte index read from each 64-bit half: 7 (and 15) when detail::do_swap is
// true, 0 (and 8) otherwise.
3347+ uint8x16_t output = vreinterpretq_u8_u64 (bits);
3348+ constexpr int offset = detail::do_swap ? 7 : 0 ;
3349+
// `<<` binds tighter than `|`: the byte at `offset` supplies result bits 0-1
// and the byte at `offset + 8` is shifted up to supply bits 2-3.
3350+ return vgetq_lane_u8 (output, offset) | vgetq_lane_u8 (output, offset + 8 ) << 2 ;
33463351 }
33473352
33483353 template <class A , class T , detail::enable_sized_t <T, 8 > = 0 >
0 commit comments