13 | 13 | #define XSIMD_NEON_HPP |
14 | 14 |
15 | 15 | #include <algorithm> |
| 16 | +#include <array> |
16 | 17 | #include <complex> |
17 | 18 | #include <tuple> |
18 | 19 | #include <type_traits> |
@@ -2914,6 +2915,56 @@ namespace xsimd |
2914 | 2915 | { |
2915 | 2916 | return vreinterpretq_s64_u64(swizzle(vreinterpretq_u64_s64(self), mask, A {})); |
2916 | 2917 | } |
| 2918 | + |
| 2919 | + template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3> |
| 2920 | + XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, |
| 2921 | + batch_constant<uint32_t, A, V0, V1, V2, V3> mask, |
| 2922 | + requires_arch<neon>) noexcept |
| 2923 | + { |
| 2924 | + constexpr bool is_identity = detail::is_identity(mask); |
| 2925 | + constexpr bool is_dup_lo = detail::is_dup_lo(mask); |
| 2926 | + constexpr bool is_dup_hi = detail::is_dup_hi(mask); |
| 2927 | + |
| 2928 | + XSIMD_IF_CONSTEXPR(is_identity) |
| 2929 | + { |
| 2930 | + return self; |
| 2931 | + } |
| 2932 | + XSIMD_IF_CONSTEXPR(is_dup_lo) |
| 2933 | + { |
| 2934 | + XSIMD_IF_CONSTEXPR(V0 == 0 && V1 == 1) |
| 2935 | + { |
| 2936 | + return vreinterpretq_u32_u64(vdupq_lane_u64(vget_low_u64(vreinterpretq_u64_u32(self)), 0)); |
| 2937 | + } |
| 2938 | + XSIMD_IF_CONSTEXPR(V0 == 1 && V1 == 0) |
| 2939 | + { |
| 2940 | + return vreinterpretq_u32_u64(vdupq_lane_u64(vreinterpret_u64_u32(vrev64_u32(vget_low_u32(self))), 0)); |
| 2941 | + } |
| 2942 | + return vdupq_n_u32(vgetq_lane_u32(self, V0)); |
| 2943 | + } |
| 2944 | + XSIMD_IF_CONSTEXPR(is_dup_hi) |
| 2945 | + { |
| 2946 | + XSIMD_IF_CONSTEXPR(V0 == 2 && V1 == 3) |
| 2947 | + { |
| 2948 | + return vreinterpretq_u32_u64(vdupq_lane_u64(vget_high_u64(vreinterpretq_u64_u32(self)), 0)); |
| 2949 | + } |
| 2950 | + XSIMD_IF_CONSTEXPR(V0 == 3 && V1 == 2) |
| 2951 | + { |
| 2952 | + return vreinterpretq_u32_u64(vdupq_lane_u64(vreinterpret_u64_u32(vrev64_u32(vget_high_u32(self))), 0)); |
| 2953 | + } |
| 2954 | + return vdupq_n_u32(vgetq_lane_u32(self, V0)); |
| 2955 | + } |
| 2956 | + alignas(A::alignment()) std::array<uint32_t, 4> data; |
| 2957 | + self.store_aligned(data.data()); |
| 2958 | + return set(batch<uint32_t, A>(), A(), data[V0], data[V1], data[V2], data[V3]); |
| 2959 | + } |
| 2960 | + |
| 2961 | + template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3> |
| 2962 | + XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self, |
| 2963 | + batch_constant<uint32_t, A, V0, V1, V2, V3> mask, |
| 2964 | + requires_arch<neon>) noexcept |
| 2965 | + { |
| 2966 | + return vreinterpretq_s32_u32(swizzle(vreinterpretq_u32_s32(self), mask, A {})); |
| 2967 | + } |
2917 | 2968 | } |
2918 | 2969 |
2919 | 2970 | } |
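
For context, here is a minimal usage sketch of how these overloads are reached through the public `xsimd::swizzle` API. It is not part of the patch: it assumes an ARM/NEON build where `xsimd::batch<uint32_t>` holds four lanes, and the input values are purely illustrative.

```cpp
#include <cstdint>
#include <cstdio>

#include <xsimd/xsimd.hpp>

int main()
{
    // Assumes a NEON target where the default batch is 128 bits wide (4 x uint32_t).
    using batch = xsimd::batch<uint32_t>;
    using arch = batch::arch_type;

    batch v(10u, 11u, 12u, 13u); // lanes {10, 11, 12, 13}

    // Mask {0, 1, 0, 1} duplicates the low 64-bit half; it should take the vdupq_lane_u64 fast path.
    auto dup_lo = xsimd::swizzle(v, xsimd::batch_constant<uint32_t, arch, 0, 1, 0, 1> {});

    // Mask {3, 2, 1, 0} reverses all lanes and falls through to the store-and-gather fallback.
    auto rev = xsimd::swizzle(v, xsimd::batch_constant<uint32_t, arch, 3, 2, 1, 0> {});

    alignas(arch::alignment()) uint32_t out[4];
    dup_lo.store_aligned(out);
    std::printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]); // expected: 10 11 10 11

    rev.store_aligned(out);
    std::printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]); // expected: 13 12 11 10
}
```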