@@ -608,6 +608,37 @@ namespace xsimd
608608 return vmaxq_f64 (lhs, rhs);
609609 }
610610
611+ /* *******
612+ * mask *
613+ ********/
614+
615+ template <class A , class T , detail::enable_sized_t <T, 1 > = 0 >
616+ XSIMD_INLINE uint64_t mask (batch_bool<T, A> const & self, requires_arch<neon64>) noexcept
617+ {
618+ // From https://github.com/DLTcollab/sse2neon/blob/master/sse2neon.h
619+ // Extract most significant bit
620+ uint8x16_t msbs = vshrq_n_u8 (self, 7 );
621+ // Position it appropriately
622+ static constexpr int8_t shift_table[16 ] = { 0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 };
623+ int8x16_t shifts = vld1q_s8 (shift_table);
624+ uint8x16_t positioned = vshlq_u8 (msbs, shifts);
625+ // Horizontal reduction
626+ return vaddv_u8 (vget_low_u8 (positioned)) | (vaddv_u8 (vget_high_u8 (positioned)) << 8 );
627+ }
628+
629+ template <class A , class T , detail::enable_sized_t <T, 2 > = 0 >
630+ XSIMD_INLINE uint64_t mask (batch_bool<T, A> const & self, requires_arch<neon64>) noexcept
631+ {
632+ // Extract most significant bit
633+ uint16x8_t msbs = vshrq_n_u8 (self, 15 );
634+ // Position it appropriately
635+ static constexpr int8_t shift_table[8 ] = { 0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 };
636+ int16x8_t shifts = vld1q_s16 (shift_table);
637+ uint16x8_t positioned = vshlq_u16 (msbs, shifts);
638+ // Horizontal reduction
639+ return vaddvq_u8 (positioned);
640+ }
641+
611642 /* ******
612643 * abs *
613644 *******/
0 commit comments