Skip to content

Commit b571831

Browse files
More efficient movemask for aarch64
1 parent f5e485e commit b571831

File tree

1 file changed

+18
-0
lines changed

1 file changed

+18
-0
lines changed

include/xsimd/arch/xsimd_neon64.hpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -608,6 +608,24 @@ namespace xsimd
608608
return vmaxq_f64(lhs, rhs);
609609
}
610610

611+
/********
612+
* mask *
613+
********/
614+
615+
template <class A, class T, detail::enable_sized_t<T, 1> = 0>
616+
XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<neon64>) noexcept
617+
{
618+
// From https://github.com/DLTcollab/sse2neon/blob/master/sse2neon.h
619+
// Extract most significant bit
620+
uint8x16_t msbs = vshrq_n_u8(self, 7);
621+
// Position it appropriately
622+
static constexpr uint8_t shift_table[16] = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
623+
int8x16_t shifts = vld1q_s8(shift_table);
624+
uint8x16_t positioned = vshlq_u8(msbs, shifts);
625+
// Horizontal reduction
626+
return vaddv_u8(vget_low_u8(positioned)) | (vaddv_u8(vget_high_u8(positioned)) << 8);
627+
}
628+
611629
/*******
612630
* abs *
613631
*******/

0 commit comments

Comments
 (0)