Skip to content

Commit 24162c5

Browse files
WIP
1 parent fdcc012 commit 24162c5

File tree

1 file changed

+51
-39
lines changed

1 file changed

+51
-39
lines changed

include/xsimd/arch/xsimd_neon.hpp

Lines changed: 51 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -973,6 +973,42 @@ namespace xsimd
973973

974974
}
975975

976+
/*
977+
* bitwise not generic utility
978+
*/
979+
980+
WRAP_UNARY_INT_EXCLUDING_64(vmvnq)
981+
982+
namespace detail
983+
{
984+
XSIMD_INLINE int64x2_t bitwise_not_s64(int64x2_t arg) noexcept
985+
{
986+
return vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(arg)));
987+
}
988+
989+
XSIMD_INLINE uint64x2_t bitwise_not_u64(uint64x2_t arg) noexcept
990+
{
991+
return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(arg)));
992+
}
993+
994+
XSIMD_INLINE float32x4_t bitwise_not_f32(float32x4_t arg) noexcept
995+
{
996+
return vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(arg)));
997+
}
998+
999+
template <class V>
1000+
XSIMD_INLINE V bitwise_not_neon(V const& arg) noexcept
1001+
{
1002+
const neon_dispatcher::unary dispatcher = {
1003+
std::make_tuple(wrap::vmvnq_u8, wrap::vmvnq_s8, wrap::vmvnq_u16, wrap::vmvnq_s16,
1004+
wrap::vmvnq_u32, wrap::vmvnq_s32,
1005+
bitwise_not_u64, bitwise_not_s64,
1006+
bitwise_not_f32)
1007+
};
1008+
return dispatcher.apply(arg);
1009+
}
1010+
}
1011+
9761012
/******
9771013
* lt *
9781014
******/
@@ -991,10 +1027,19 @@ namespace xsimd
9911027
return dispatcher.apply(register_type(lhs), register_type(rhs));
9921028
}
9931029

994-
template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
1030+
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
1031+
XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1032+
{
1033+
using register_type = typename batch<T, A>::register_type;
1034+
return batch_bool<T, A>(vshrq_n_s64(vqsubq_s64(register_type(rhs), register_type(lhs)), 63));
1035+
}
1036+
1037+
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
9951038
XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
9961039
{
997-
return batch_bool<T, A>({ lhs.get(0) < rhs.get(0), lhs.get(1) < rhs.get(1) });
1040+
using register_type = typename batch<T, A>::register_type;
1041+
register_type acc = { 0x8FFFFFFFFFFFFFFFull, 0x8FFFFFFFFFFFFFFFull };
1042+
return batch_bool<T, A>(vreinterpretq_u64_s64(detail::bitwise_not_s64(vshrq_n_s64(vreinterpretq_s64_u64(vqaddq_u64(vqsubq_u64(register_type(lhs), register_type(rhs)), acc)), 63))));
9981043
}
9991044

10001045
/******
@@ -1024,18 +1069,6 @@ namespace xsimd
10241069
/******
10251070
* gt *
10261071
******/
1027-
namespace detail
1028-
{
1029-
XSIMD_INLINE int64x2_t bitwise_not_s64(int64x2_t arg) noexcept
1030-
{
1031-
return vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(arg)));
1032-
}
1033-
1034-
XSIMD_INLINE uint64x2_t bitwise_not_u64(uint64x2_t arg) noexcept
1035-
{
1036-
return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(arg)));
1037-
}
1038-
}
10391072

10401073
WRAP_BINARY_INT_EXCLUDING_64(vcgtq, detail::comp_return_type)
10411074
WRAP_BINARY_FLOAT(vcgtq, detail::comp_return_type)
@@ -1055,14 +1088,15 @@ namespace xsimd
10551088
XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
10561089
{
10571090
using register_type = typename batch<T, A>::register_type;
1058-
return vshrq_n_s64(vqsubq_s64(register_type(lhs), register_type(rhs)), 63);
1091+
return batch_bool<T, A>(vshrq_n_s64(vqsubq_s64(register_type(lhs), register_type(rhs)), 63));
10591092
}
10601093

10611094
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
10621095
XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
10631096
{
10641097
using register_type = typename batch<T, A>::register_type;
1065-
return detail::bitwise_not_s64(vshrq_n_s64(vreinterpretq_s64_u64(vqsubq_u64(register_type(rhs), register_type(lhs))), 63));
1098+
register_type acc = { 0x8FFFFFFFFFFFFFFFull, 0x8FFFFFFFFFFFFFFFull };
1099+
return batch_bool<T, A>(vreinterpretq_u64_s64(detail::bitwise_not_s64(vshrq_n_s64(vreinterpretq_s64_u64(vqaddq_u64(vqsubq_u64(register_type(rhs), register_type(lhs)), acc)), 63))));
10661100
}
10671101

10681102
/******
@@ -1086,7 +1120,7 @@ namespace xsimd
10861120
template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
10871121
XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
10881122
{
1089-
return batch_bool<T, A>({ lhs.get(0) >= rhs.get(0), lhs.get(1) >= rhs.get(1) });
1123+
return detail::bitwise_not_neon(lt(rhs, lt, A {}));
10901124
}
10911125

10921126
/*******************
@@ -1234,28 +1268,6 @@ namespace xsimd
12341268
* bitwise_not *
12351269
***************/
12361270

1237-
WRAP_UNARY_INT_EXCLUDING_64(vmvnq)
1238-
1239-
namespace detail
1240-
{
1241-
XSIMD_INLINE float32x4_t bitwise_not_f32(float32x4_t arg) noexcept
1242-
{
1243-
return vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(arg)));
1244-
}
1245-
1246-
template <class V>
1247-
XSIMD_INLINE V bitwise_not_neon(V const& arg) noexcept
1248-
{
1249-
const neon_dispatcher::unary dispatcher = {
1250-
std::make_tuple(wrap::vmvnq_u8, wrap::vmvnq_s8, wrap::vmvnq_u16, wrap::vmvnq_s16,
1251-
wrap::vmvnq_u32, wrap::vmvnq_s32,
1252-
bitwise_not_u64, bitwise_not_s64,
1253-
bitwise_not_f32)
1254-
};
1255-
return dispatcher.apply(arg);
1256-
}
1257-
}
1258-
12591271
template <class A, class T, detail::enable_neon_type_t<T> = 0>
12601272
XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& arg, requires_arch<neon>) noexcept
12611273
{

0 commit comments

Comments
 (0)