|
23 | 23 | // Wrap intrinsics so we can pass them as function pointers |
24 | 24 | // - OP: intrinsics name prefix, e.g., vorrq |
25 | 25 | // - RT: type traits to deduce intrinsics return types |
26 | | -#define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ |
| 26 | +#define WRAP_BINARY_UINT_EXCLUDING_64(OP, RT) \ |
27 | 27 | namespace wrap \ |
28 | 28 | { \ |
29 | 29 | inline RT<uint8x16_t> OP##_u8(uint8x16_t a, uint8x16_t b) noexcept \ |
30 | 30 | { \ |
31 | 31 | return ::OP##_u8(a, b); \ |
32 | 32 | } \ |
33 | | - inline RT<int8x16_t> OP##_s8(int8x16_t a, int8x16_t b) noexcept \ |
34 | | - { \ |
35 | | - return ::OP##_s8(a, b); \ |
36 | | - } \ |
37 | 33 | inline RT<uint16x8_t> OP##_u16(uint16x8_t a, uint16x8_t b) noexcept \ |
38 | 34 | { \ |
39 | 35 | return ::OP##_u16(a, b); \ |
40 | 36 | } \ |
41 | | - inline RT<int16x8_t> OP##_s16(int16x8_t a, int16x8_t b) noexcept \ |
42 | | - { \ |
43 | | - return ::OP##_s16(a, b); \ |
44 | | - } \ |
45 | 37 | inline RT<uint32x4_t> OP##_u32(uint32x4_t a, uint32x4_t b) noexcept \ |
46 | 38 | { \ |
47 | 39 | return ::OP##_u32(a, b); \ |
48 | 40 | } \ |
49 | | - inline RT<int32x4_t> OP##_s32(int32x4_t a, int32x4_t b) noexcept \ |
50 | | - { \ |
51 | | - return ::OP##_s32(a, b); \ |
52 | | - } \ |
| 41 | + } |
| 42 | + |
| 43 | +#define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ |
| 44 | + WRAP_BINARY_UINT_EXCLUDING_64(OP, RT) \ |
| 45 | + namespace wrap \ |
| 46 | + { \ |
| 47 | + inline RT<int8x16_t> OP##_s8(int8x16_t a, int8x16_t b) noexcept \ |
| 48 | + { \ |
| 49 | + return ::OP##_s8(a, b); \ |
| 50 | + } \ |
| 51 | + inline RT<int16x8_t> OP##_s16(int16x8_t a, int16x8_t b) noexcept \ |
| 52 | + { \ |
| 53 | + return ::OP##_s16(a, b); \ |
| 54 | + } \ |
| 55 | + inline RT<int32x4_t> OP##_s32(int32x4_t a, int32x4_t b) noexcept \ |
| 56 | + { \ |
| 57 | + return ::OP##_s32(a, b); \ |
| 58 | + } \ |
53 | 59 | } |
54 | 60 |
|
55 | 61 | #define WRAP_BINARY_INT(OP, RT) \ |
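
The wrappers exist because NEON intrinsics may be implemented as compiler builtins or macros, so their address cannot always be taken directly; the macro generates ordinary inline functions whose pointers can be stored in a dispatcher tuple. As a rough illustration, the `vhaddq` instantiation used further down would expand to something like the following (assuming `detail::identity_return_type<T>` resolves to `T` itself, which is not shown in this hunk):

```cpp
// Approximate expansion of WRAP_BINARY_UINT_EXCLUDING_64(vhaddq, detail::identity_return_type),
// assuming identity_return_type<T> is just T. The result is a set of real, addressable
// functions wrapping the ::vhaddq_* intrinsics so they can be passed as function pointers.
#include <arm_neon.h>

namespace wrap
{
    inline uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b) noexcept { return ::vhaddq_u8(a, b); }
    inline uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b) noexcept { return ::vhaddq_u16(a, b); }
    inline uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b) noexcept { return ::vhaddq_u32(a, b); }
}
```
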
@@ -204,6 +210,10 @@ namespace xsimd |
204 | 210 | uint32x4_t, int32x4_t, |
205 | 211 | float32x4_t>; |
206 | 212 |
|
| 213 | + using excluding_int64f32_dispatcher = neon_dispatcher_impl<uint8x16_t, int8x16_t, |
| 214 | + uint16x8_t, int16x8_t, |
| 215 | + uint32x4_t, int32x4_t>; |
| 216 | + |
207 | 217 | /************************** |
208 | 218 | * comparison dispatchers * |
209 | 219 | **************************/ |
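
The new `excluding_int64f32_dispatcher` alias covers all the 8/16/32-bit signed and unsigned register types, matching the wrapper set produced by `WRAP_BINARY_INT_EXCLUDING_64`. The exact definition of `neon_dispatcher_impl` is not part of this hunk, but its `binary` member is used below as an aggregate holding one function pointer per register type, selected by the argument type at `apply` time. A minimal sketch of that idea (not the actual xsimd implementation) could look like:

```cpp
// Minimal sketch of a type-directed binary dispatcher, assuming the same shape as
// neon_dispatcher_impl<...>::binary used below: a tuple of function pointers, one per
// register type, where apply() picks the entry whose signature matches the arguments.
#include <tuple>

template <class... Regs>
struct binary_dispatcher_sketch
{
    std::tuple<Regs (*)(Regs, Regs)...> fns; // e.g. { wrap::vhaddq_u8, wrap::vhaddq_u16, ... }

    template <class Reg>
    Reg apply(Reg lhs, Reg rhs) const noexcept
    {
        // std::get by type: each register type appears exactly once, so the lookup is unambiguous
        return std::get<Reg (*)(Reg, Reg)>(fns)(lhs, rhs);
    }
};
```
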
@@ -744,6 +754,22 @@ namespace xsimd |
744 | 754 | return dispatcher.apply(register_type(lhs), register_type(rhs)); |
745 | 755 | } |
746 | 756 |
|
| 757 | + /******* |
| 758 | + * avg * |
| 759 | + *******/ |
| 760 | + |
| 761 | + WRAP_BINARY_UINT_EXCLUDING_64(vhaddq, detail::identity_return_type) |
| 762 | + |
| 762 | + |
| 763 | + template <class A, class T, class = typename std::enable_if<(std::is_unsigned<T>::value && sizeof(T) != 8), void>::type> |
| 764 | + inline batch<T, A> avg(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
| 765 | + { |
| 766 | + using register_type = typename batch<T, A>::register_type; |
| 767 | + const detail::neon_dispatcher_impl<uint8x16_t, uint16x8_t, uint32x4_t>::binary dispatcher = { |
| 768 | + std::make_tuple(wrap::vhaddq_u8, wrap::vhaddq_u16, wrap::vhaddq_u32) |
| 769 | + }; |
| 770 | + return dispatcher.apply(register_type(lhs), register_type(rhs)); |
| 771 | + } |
| 772 | + |
747 | 773 | /******** |
748 | 774 | * sadd * |
749 | 775 | ********/ |
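
The unsigned `avg` kernel maps directly onto the NEON halving-add intrinsics: `vhaddq_*` computes `(a + b) >> 1` per lane using a widened intermediate sum, so the addition cannot overflow the lane width. A small standalone check of that property (assumes an ARM target with NEON available):

```cpp
// vhaddq_u8 averages each pair of lanes without overflowing uint8_t:
// 200 + 101 = 301 does not fit in 8 bits, yet the halving add still yields 150.
#include <arm_neon.h>
#include <cstdio>

int main()
{
    uint8x16_t a = vdupq_n_u8(200);
    uint8x16_t b = vdupq_n_u8(101);
    uint8x16_t avg = vhaddq_u8(a, b);                      // truncating average per lane
    std::printf("%u\n", (unsigned)vgetq_lane_u8(avg, 0));  // prints 150
    return 0;
}
```
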
|