Skip to content

Commit 0e776ef

Browse files
Improve swizzle for arm32 - [u]int32
1 parent 9505dc4 commit 0e776ef

File tree

1 file changed

+51
-0
lines changed

1 file changed

+51
-0
lines changed

include/xsimd/arch/xsimd_neon.hpp

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#define XSIMD_NEON_HPP
1414

1515
#include <algorithm>
16+
#include <array>
1617
#include <complex>
1718
#include <tuple>
1819
#include <type_traits>
@@ -2914,6 +2915,56 @@ namespace xsimd
29142915
{
29152916
return vreinterpretq_s64_u64(swizzle(vreinterpretq_u64_s64(self), mask, A {}));
29162917
}
2918+
2919+
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
2920+
XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self,
2921+
batch_constant<uint32_t, A, V0, V1, V2, V3> mask,
2922+
requires_arch<neon>) noexcept
2923+
{
2924+
constexpr bool is_identity = detail::is_identity(mask);
2925+
constexpr bool is_dup_lo = detail::is_dup_lo(mask);
2926+
constexpr bool is_dup_hi = detail::is_dup_hi(mask);
2927+
2928+
XSIMD_IF_CONSTEXPR(is_identity)
2929+
{
2930+
return self;
2931+
}
2932+
XSIMD_IF_CONSTEXPR(is_dup_lo)
2933+
{
2934+
XSIMD_IF_CONSTEXPR(V0 == 0 && V1 == 1)
2935+
{
2936+
return vreinterpretq_u32_u64(vdupq_lane_u64(vget_low_u64(vreinterpretq_u64_u32(self)), 0));
2937+
}
2938+
XSIMD_IF_CONSTEXPR(V0 == 1 && V1 == 0)
2939+
{
2940+
return vreinterpretq_u32_u64(vdupq_lane_u64(vreinterpret_u64_u32(vrev64_u32(vget_low_u32(self))), 0));
2941+
}
2942+
return vdupq_n_u32(vgetq_lane_u32(self, V0));
2943+
}
2944+
XSIMD_IF_CONSTEXPR(is_dup_hi)
2945+
{
2946+
XSIMD_IF_CONSTEXPR(V0 == 2 && V1 == 3)
2947+
{
2948+
return vreinterpretq_u32_u64(vdupq_lane_u64(vget_high_u64(vreinterpretq_u64_u32(self)), 0));
2949+
}
2950+
XSIMD_IF_CONSTEXPR(V0 == 3 && V1 == 2)
2951+
{
2952+
return vreinterpretq_u32_u64(vdupq_lane_u64(vreinterpret_u64_u32(vrev64_u32(vget_high_u32(self))), 0));
2953+
}
2954+
return vdupq_n_u32(vgetq_lane_u32(self, V0));
2955+
}
2956+
std::array<uint32_t, 4> data;
2957+
self.store_aligned(data.data());
2958+
return set(batch<uint32_t, A>(), A(), data[V0], data[V1], data[V2], data[V3]);
2959+
}
2960+
2961+
template <class A, uint32_t V0, uint32_t V1>
2962+
XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self,
2963+
batch_constant<int32_t, A, V0, V1> mask,
2964+
requires_arch<neon>) noexcept
2965+
{
2966+
return vreinterpretq_s32_u32(swizzle(vreinterpretq_u32_s32(self), mask, A {}));
2967+
}
29172968
}
29182969

29192970
}

0 commit comments

Comments
 (0)