Skip to content

Commit 10bab43

Browse files
committed
Implement optimized movemasks for NEON
While the scalar post-processing required to obtain one bit per lane makes this more expensive than directly supporting variable-sized bit groups (as done in Zstandard[^1]), the result is still an improvement over the current lane-by-lane algorithm.

[^1]: See facebook/zstd#3139, namely `ZSTD_row_matchMaskGroupWidth`.
1 parent f5e485e commit 10bab43

File tree

3 files changed

+85
-12
lines changed

3 files changed

+85
-12
lines changed

include/xsimd/arch/xsimd_neon.hpp

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3278,6 +3278,80 @@ namespace xsimd
32783278
return { batch<widen_t<T>, A>(vaddl_u32(vget_low_u32(x), vdup_n_u32(0))), batch<widen_t<T>, A>(vaddl_u32(vget_high_u32(x), vdup_n_u32(0))) };
32793279
}
32803280

3281+
/********
3282+
* mask *
3283+
********/
3284+
// Packs a NEON boolean batch into an integer bitmask: one bit per lane, all
// higher bits of the returned value cleared.
//
// Every branch follows the same recipe:
//   1. narrow the 128-bit register to 64 bits so that each lane still owns a
//      fixed-size slot (a nibble for 8-bit lanes, otherwise half the lane),
//   2. move those 64 bits into a scalar and keep a single bit per slot,
//   3. fold the surviving bits together with a short shift/OR ladder.
//
// On the XSIMD_LITTLE_ENDIAN path the arithmetic below places lane i in bit i.
// NOTE(review): only one bit of each lane is inspected, so this presumably
// relies on every lane being all-ones or all-zeros -- confirm against
// batch_bool's contract.
// NOTE(review): the do_swap (non-little-endian) steps look intended to restore
// the same lane order on big-endian targets; not verifiable from this hunk.
// NOTE(review): if XSIMD_IF_CONSTEXPR degrades to a plain `if` before C++17,
// all four branches are compiled for every T -- confirm the implicit
// conversions of `self` still build in that configuration.
template <class A, class T>
3285+
XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<neon>) noexcept
3286+
{
3287+
#ifdef XSIMD_LITTLE_ENDIAN
3288+
static constexpr bool do_swap = false;
3289+
#else
3290+
// XSIMD_LITTLE_ENDIAN not defined: enable the extra vrev* reordering steps.
static constexpr bool do_swap = true;
3291+
#endif
3292+
3293+
// 8-bit lanes: 16 lanes -> 16 result bits.
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
3294+
{
3295+
uint8x16_t inner = self;
3296+
XSIMD_IF_CONSTEXPR(do_swap)
3297+
{
3298+
// Swap the two bytes inside each 16-bit pair.
inner = vrev16q_u8(inner);
3299+
}
3300+
3301+
uint16x8_t pairs = vreinterpretq_u16_u8(inner);
3302+
// Shift each 16-bit pair right by 4 and keep its low 8 bits: every output
// byte holds the high nibble of the pair's low byte and the low nibble of
// its high byte, i.e. one nibble per input lane (16 nibbles = 64 bits).
uint8x8_t narrowed = vshrn_n_u16(pairs, 4);
3303+
XSIMD_IF_CONSTEXPR(do_swap)
3304+
{
3305+
// Reverse the eight bytes of the narrowed vector.
narrowed = vrev64_u8(narrowed);
3306+
}
3307+
3308+
uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(narrowed), 0);
3309+
// Keep the lowest bit of every nibble: one bit per lane, 4 bits apart.
mask &= 0x1111111111111111;
3310+
// Bit 4 of each byte moves to bit 1: two lanes in bits 0-1 of every byte.
mask = mask | mask >> 3;
3311+
// Bits 8-9 join bits 0-1: four lanes in the low nibble of every 16-bit group.
mask = (mask | mask >> 6) & 0x000F000F000F000F;
3312+
// Bits 16-19 join bits 0-3: eight lanes in the low byte of every 32-bit half.
mask = (mask | mask >> 12) & 0x000000FF000000FF;
3313+
// Bits 32-39 join bits 0-7: all sixteen lanes in the low 16 bits.
return (mask | mask >> 24) & 0xFFFF;
3314+
}
3315+
// 16-bit lanes: 8 lanes -> 8 result bits.
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
3316+
{
3317+
// Keep the low byte of every 16-bit lane: one byte per lane.
uint8x8_t narrowed = vmovn_u16(self);
3318+
XSIMD_IF_CONSTEXPR(do_swap)
3319+
{
3320+
narrowed = vrev64_u8(narrowed);
3321+
}
3322+
3323+
uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(narrowed), 0);
3324+
// Keep bit 0 of every byte: one bit per lane, 8 bits apart.
mask &= 0x0101010101010101;
3325+
// Bit 8 moves to bit 1: two lanes in bits 0-1 of every 16-bit group.
mask = mask | mask >> 7;
3326+
// Bits 16-17 join bits 0-1: four lanes in bits 0-3 of every 32-bit half.
mask = mask | mask >> 14;
3327+
// Bits 32-35 land on bits 4-7; the AND drops everything above bit 7.
return (mask | mask >> 28) & 0xFF;
3328+
}
3329+
// 32-bit lanes: 4 lanes -> 4 result bits.
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
3330+
{
3331+
// Keep the low 16 bits of every 32-bit lane.
uint16x4_t narrowed = vmovn_u32(self);
3332+
XSIMD_IF_CONSTEXPR(do_swap)
3333+
{
3334+
narrowed = vrev64_u16(narrowed);
3335+
}
3336+
3337+
uint64_t mask = vget_lane_u64(vreinterpret_u64_u16(narrowed), 0);
3338+
// Keep bit 0 of every 16-bit element: one bit per lane, 16 bits apart.
mask &= 0x0001000100010001;
3339+
// Bit 16 moves to bit 1: two lanes in bits 0-1 of every 32-bit half.
mask = mask | mask >> 15;
3340+
// Bits 32-33 land on bits 2-3; the AND drops everything above bit 3.
return (mask | mask >> 30) & 0xF;
3341+
}
3342+
// Remaining case (reached for any other sizeof(T); the code below handles
// the value as two 64-bit lanes): 2 lanes -> 2 result bits.
else
3343+
{
3344+
// Keep the low 32 bits of every 64-bit lane.
uint32x2_t narrowed = vmovn_u64(self);
3345+
XSIMD_IF_CONSTEXPR(do_swap)
3346+
{
3347+
narrowed = vrev64_u32(narrowed);
3348+
}
3349+
3350+
uint64_t mask32 = vget_lane_u64(vreinterpret_u64_u32(narrowed), 0);
3351+
// Keep bit 0 of every 32-bit element.
mask32 &= 0x0000000100000001;
3352+
// Bit 32 moves to bit 1; the AND drops everything above bit 1.
return (mask32 | mask32 >> 31) & 0x3;
3353+
}
3354+
}
32813355
}
32823356

32833357
}

include/xsimd/config/xsimd_config.hpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,17 @@
1616
#define XSIMD_VERSION_MINOR 0
1717
#define XSIMD_VERSION_PATCH 0
1818

19+
// Endianness detection: XSIMD_LITTLE_ENDIAN is defined when one of the checks
// below identifies a little-endian target, and left undefined otherwise
// (big-endian or unrecognised toolchain).
// First choice: compilers defining __GNUC__ that also expose __BYTE_ORDER__.
#if defined(__GNUC__) && defined(__BYTE_ORDER__)
20+
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
21+
#define XSIMD_LITTLE_ENDIAN
22+
#endif
23+
#elif defined(_WIN32)
24+
// We can safely assume that Windows is always little endian
25+
#define XSIMD_LITTLE_ENDIAN
26+
// Fallback: platform macros the original author treats as little-endian.
#elif defined(i386) || defined(i486) || defined(intel) || defined(x86) || defined(i86pc) || defined(__alpha) || defined(__osf__)
27+
#define XSIMD_LITTLE_ENDIAN
28+
#endif
29+
1930
/**
2031
* high level free functions
2132
*

include/xsimd/math/xsimd_rem_pio2.hpp

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -47,17 +47,6 @@ namespace xsimd
4747
* ====================================================
4848
*/
4949

50-
#if defined(__GNUC__) && defined(__BYTE_ORDER__)
51-
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
52-
#define XSIMD_LITTLE_ENDIAN
53-
#endif
54-
#elif defined(_WIN32)
55-
// We can safely assume that Windows is always little endian
56-
#define XSIMD_LITTLE_ENDIAN
57-
#elif defined(i386) || defined(i486) || defined(intel) || defined(x86) || defined(i86pc) || defined(__alpha) || defined(__osf__)
58-
#define XSIMD_LITTLE_ENDIAN
59-
#endif
60-
6150
#ifdef XSIMD_LITTLE_ENDIAN
6251
#define LOW_WORD_IDX 0
6352
#define HIGH_WORD_IDX sizeof(std::uint32_t)
@@ -708,7 +697,6 @@ namespace xsimd
708697
}
709698
}
710699

711-
#undef XSIMD_LITTLE_ENDIAN
712700
#undef SET_LOW_WORD
713701
#undef SET_HIGH_WORD
714702
#undef GET_LOW_WORD

0 commit comments

Comments
 (0)