@@ -3281,70 +3281,76 @@ namespace xsimd
         /********
          * mask *
          ********/
-        template <class A, class T>
-        XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<neon>) noexcept
+        namespace detail
         {
 #ifdef XSIMD_LITTLE_ENDIAN
             static constexpr bool do_swap = false;
 #else
             static constexpr bool do_swap = true;
 #endif
+        }
+
+        template <class A, class T, detail::enable_sized_t<T, 1> = 0>
+        XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<neon>) noexcept
+        {
+            uint8x16_t inner = self;
+            XSIMD_IF_CONSTEXPR(detail::do_swap)
+            {
+                inner = vrev16q_u8(inner);
+            }
 
-            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            uint16x8_t pairs = vreinterpretq_u16_u8(inner);
+            uint8x8_t narrowed = vshrn_n_u16(pairs, 4);
+            XSIMD_IF_CONSTEXPR(detail::do_swap)
             {
-                uint8x16_t inner = self;
-                XSIMD_IF_CONSTEXPR(do_swap)
-                {
-                    inner = vrev16q_u8(inner);
-                }
+                narrowed = vrev64_u8(narrowed);
+            }
 
-                uint16x8_t pairs = vreinterpretq_u16_u8(inner);
-                uint8x8_t narrowed = vshrn_n_u16(pairs, 4);
-                XSIMD_IF_CONSTEXPR(do_swap)
-                {
-                    narrowed = vrev64_u8(narrowed);
-                }
+            uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(narrowed), 0);
+            mask &= 0x1111111111111111;
+            mask = mask | mask >> 3;
+            mask = (mask | mask >> 6) & 0x000F000F000F000F;
+            mask = (mask | mask >> 12) & 0x000000FF000000FF;
+            return (mask | mask >> 24) & 0xFFFF;
+        }
 
-                uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(narrowed), 0);
-                mask &= 0x1111111111111111;
-                mask = mask | mask >> 3;
-                mask = (mask | mask >> 6) & 0x000F000F000F000F;
-                mask = (mask | mask >> 12) & 0x000000FF000000FF;
-                return (mask | mask >> 24) & 0xFFFF;
-            }
-            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+        template <class A, class T, detail::enable_sized_t<T, 2> = 0>
+        XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<neon>) noexcept
+        {
+            uint8x8_t narrowed = vmovn_u16(self);
+            XSIMD_IF_CONSTEXPR(detail::do_swap)
             {
-                uint8x8_t narrowed = vmovn_u16(self);
-                XSIMD_IF_CONSTEXPR(do_swap)
-                {
-                    narrowed = vrev64_u8(narrowed);
-                }
-
-                uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(narrowed), 0);
-                mask &= 0x0101010101010101;
-                mask = mask | mask >> 7;
-                mask = mask | mask >> 14;
-                return (mask | mask >> 28) & 0xFF;
+                narrowed = vrev64_u8(narrowed);
             }
-            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
-            {
-                uint16x4_t narrowed = vmovn_u32(self);
-                XSIMD_IF_CONSTEXPR(do_swap)
-                {
-                    narrowed = vrev64_u16(narrowed);
-                }
 
-                uint64_t mask = vget_lane_u64(vreinterpret_u64_u16(narrowed), 0);
-                mask &= 0x0001000100010001;
-                mask = mask | mask >> 15;
-                return (mask | mask >> 30) & 0xF;
-            }
-            else
+            uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(narrowed), 0);
+            mask &= 0x0101010101010101;
+            mask = mask | mask >> 7;
+            mask = mask | mask >> 14;
+            return (mask | mask >> 28) & 0xFF;
+        }
+
+        template <class A, class T, detail::enable_sized_t<T, 4> = 0>
+        XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<neon>) noexcept
+        {
+            uint16x4_t narrowed = vmovn_u32(self);
+            XSIMD_IF_CONSTEXPR(detail::do_swap)
             {
-                uint64_t mask_lo = vgetq_lane_u64(self, 0);
-                uint64_t mask_hi = vgetq_lane_u64(self, 1);
-                return ((mask_lo >> 63) | (mask_hi << 1)) & 0x3;
+                narrowed = vrev64_u16(narrowed);
             }
+
+            uint64_t mask = vget_lane_u64(vreinterpret_u64_u16(narrowed), 0);
+            mask &= 0x0001000100010001;
+            mask = mask | mask >> 15;
+            return (mask | mask >> 30) & 0xF;
+        }
+
+        template <class A, class T, detail::enable_sized_t<T, 8> = 0>
+        XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<neon>) noexcept
+        {
+            uint64_t mask_lo = vgetq_lane_u64(self, 0);
+            uint64_t mask_hi = vgetq_lane_u64(self, 1);
+            return ((mask_lo >> 63) | (mask_hi << 1)) & 0x3;
         }
     }
 
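A note on the NEON movemask idiom in the sizeof(T) == 1 overload: vshrn_n_u16(pairs, 4) shifts each 16-bit element right by 4 and narrows it to a byte, so each result byte carries the mask of two adjacent 8-bit lanes, one all-ones-or-zero nibble per lane. The scalar shift-and-mask ladder then compacts one bit per nibble into a dense 16-bit mask, bit i corresponding to lane i. A minimal standalone sketch of that scalar step, with an illustrative function name and test value that are not part of the patch:

#include <cassert>
#include <cstdint>

// Models the scalar ladder above. Input: the 64 bits pulled out by
// vget_lane_u64, where nibble i holds lane i's truth value.
// Output: a dense 16-bit mask, bit i set iff lane i was true.
static uint16_t compact_nibbles(uint64_t mask) // hypothetical helper
{
    mask &= 0x1111111111111111;                      // keep 1 bit per nibble
    mask = mask | mask >> 3;                         // 2 bits per byte
    mask = (mask | mask >> 6) & 0x000F000F000F000F;  // 4 bits per 16-bit group
    mask = (mask | mask >> 12) & 0x000000FF000000FF; // 8 bits per 32-bit group
    return static_cast<uint16_t>((mask | mask >> 24) & 0xFFFF);
}

int main()
{
    // Lanes 0, 2 and 15 true, i.e. nibbles 0, 2 and 15 set.
    assert(compact_nibbles(0x1000000000000101ull) == 0x8005);
    return 0;
}

The detail::enable_sized_t<T, N> parameter is presumably an alias along the lines of std::enable_if_t<sizeof(T) == N, int> defined elsewhere in the detail namespace; splitting the former XSIMD_IF_CONSTEXPR chain into four SFINAE-selected overloads means each lane width gets its own function and only that width's intrinsics are ever instantiated.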