More efficient movemask for aarch64

serge-sans-paille · serge-sans-paille · commit b5718313dd7f · 2025-12-27T16:31:35.000+01:00
diff --git a/include/xsimd/arch/xsimd_neon64.hpp b/include/xsimd/arch/xsimd_neon64.hpp
@@ -608,6 +608,24 @@ namespace xsimd
             return vmaxq_f64(lhs, rhs);
         }
 
+        /********
+         * mask *
+         ********/
+
+        template <class A, class T, detail::enable_sized_t<T, 1> = 0>
+        XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<neon64>) noexcept
+        {
+            // Adapted from https://github.com/DLTcollab/sse2neon/blob/master/sse2neon.h: bit i of the result is the most significant bit of lane i.
+            // The table is int8_t because vld1q_s8 takes an int8_t const* and vshlq_u8 expects signed per-lane shift counts.
+            static constexpr int8_t lane_shift[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 };
+            uint8x16_t const msb = vshrq_n_u8(self, 7);
+            uint8x16_t const weighted = vshlq_u8(msb, vld1q_s8(lane_shift));
+            // Within each 8-lane half every lane now holds a distinct power of two (or zero), so summing the lanes ORs them into one byte.
+            uint64_t const low_byte = vaddv_u8(vget_low_u8(weighted));
+            uint64_t const high_byte = vaddv_u8(vget_high_u8(weighted));
+            return low_byte | (high_byte << 8);
+        }
+
         /*******
          * abs *
          *******/