Skip to content

Commit e9c40fb

Browse files
WIP
1 parent fdcc012 commit e9c40fb

File tree

1 file changed

+32
-25
lines changed

1 file changed

+32
-25
lines changed

include/xsimd/arch/xsimd_neon.hpp

Lines changed: 32 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -991,10 +991,19 @@ namespace xsimd
991991
return dispatcher.apply(register_type(lhs), register_type(rhs));
992992
}
993993

994-
template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
994+
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
995+
XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
996+
{
997+
using register_type = typename batch<T, A>::register_type;
998+
return batch_bool<T, A>(vshrq_n_s64(vqsubq_s64(register_type(rhs), register_type(lhs)), 63));
999+
}
1000+
1001+
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
9951002
XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
9961003
{
997-
return batch_bool<T, A>({ lhs.get(0) < rhs.get(0), lhs.get(1) < rhs.get(1) });
1004+
using register_type = typename batch<T, A>::register_type;
1005+
register_type acc = { 0x8FFFFFFFFFFFFFFFull, 0x8FFFFFFFFFFFFFFFull };
1006+
return batch_bool<T, A>(vreinterpretq_u64_s64(detail::bitwise_not_s64(vshrq_n_s64(vreinterpretq_s64_u64(vqaddq_u64(vqsubq_u64(register_type(lhs), register_type(rhs)), acc)), 63))));
9981007
}
9991008

10001009
/******
@@ -1035,6 +1044,23 @@ namespace xsimd
10351044
{
10361045
return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(arg)));
10371046
}
1047+
1048+
// Bitwise complement of a float32x4_t. The lanes are viewed as 32-bit unsigned
// integers, complemented with vmvnq_u32, then viewed as floats again.
XSIMD_INLINE float32x4_t bitwise_not_f32(float32x4_t arg) noexcept
{
    const uint32x4_t raw_bits = vreinterpretq_u32_f32(arg);
    const uint32x4_t flipped_bits = vmvnq_u32(raw_bits);
    return vreinterpretq_f32_u32(flipped_bits);
}
1052+
1053+
template <class V>
1054+
XSIMD_INLINE V bitwise_not_neon(V const& arg) noexcept
1055+
{
1056+
const neon_dispatcher::unary dispatcher = {
1057+
std::make_tuple(wrap::vmvnq_u8, wrap::vmvnq_s8, wrap::vmvnq_u16, wrap::vmvnq_s16,
1058+
wrap::vmvnq_u32, wrap::vmvnq_s32,
1059+
bitwise_not_u64, bitwise_not_s64,
1060+
bitwise_not_f32)
1061+
};
1062+
return dispatcher.apply(arg);
1063+
}
10381064
}
10391065

10401066
WRAP_BINARY_INT_EXCLUDING_64(vcgtq, detail::comp_return_type)
@@ -1055,14 +1081,15 @@ namespace xsimd
10551081
XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
{
    // Element-wise `lhs > rhs`: each lane is all ones when lhs > rhs, all zeros otherwise.
    // NOTE(review): assumes this is the signed 64-bit overload (it uses the *_s64
    // intrinsics); its template header is not part of the visible hunk.
    using register_type = typename batch<T, A>::register_type;
    // Saturating rhs - lhs is negative exactly when lhs > rhs: saturation keeps
    // the sign correct even when the exact difference does not fit in 64 bits.
    // (The operands must be in this order; lhs - rhs would test lhs < rhs.)
    const auto diff = vqsubq_s64(register_type(rhs), register_type(lhs));
    // Arithmetic shift by 63 replicates the sign bit across the whole lane.
    const auto sign_mask = vshrq_n_s64(diff, 63);
    // Hand the mask over as unsigned lanes, like the unsigned 64-bit overload does.
    // NOTE(review): assumes batch_bool<T, A> is built from a uint64x2_t — confirm.
    return batch_bool<T, A>(vreinterpretq_u64_s64(sign_mask));
}
10601086

10611087
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
10621088
XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
10631089
{
10641090
using register_type = typename batch<T, A>::register_type;
1065-
return detail::bitwise_not_s64(vshrq_n_s64(vreinterpretq_s64_u64(vqsubq_u64(register_type(rhs), register_type(lhs))), 63));
1091+
register_type acc = { 0x8FFFFFFFFFFFFFFFull, 0x8FFFFFFFFFFFFFFFull };
1092+
return batch_bool<T, A>(vreinterpretq_u64_s64(detail::bitwise_not_s64(vshrq_n_s64(vreinterpretq_s64_u64(vqaddq_u64(vqsubq_u64(register_type(rhs), register_type(lhs)), acc)), 63))));
10661093
}
10671094

10681095
/******
@@ -1086,7 +1113,7 @@ namespace xsimd
10861113
template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
10871114
XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
10881115
{
1089-
return batch_bool<T, A>({ lhs.get(0) >= rhs.get(0), lhs.get(1) >= rhs.get(1) });
1116+
return detail::bitwise_not_neon(lt(rhs, lt, A {}));
10901117
}
10911118

10921119
/*******************
@@ -1236,26 +1263,6 @@ namespace xsimd
12361263

12371264
WRAP_UNARY_INT_EXCLUDING_64(vmvnq)
12381265

1239-
// Helpers that compute a bitwise NOT directly on NEON register values.
namespace detail
1240-
{
1241-
// Bitwise complement of a float32x4_t: the lanes are reinterpreted as
// uint32x4_t, complemented with vmvnq_u32, and reinterpreted back.
XSIMD_INLINE float32x4_t bitwise_not_f32(float32x4_t arg) noexcept
1242-
{
1243-
return vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(arg)));
1244-
}
1245-
1246-
// Bitwise complement of a register of type V, routed through
// neon_dispatcher::unary to one of the routines listed in the tuple.
// NOTE(review): the tuple order presumably has to match the type order the
// dispatcher expects — confirm against neon_dispatcher.
template <class V>
1247-
XSIMD_INLINE V bitwise_not_neon(V const& arg) noexcept
1248-
{
1249-
const neon_dispatcher::unary dispatcher = {
1250-
std::make_tuple(wrap::vmvnq_u8, wrap::vmvnq_s8, wrap::vmvnq_u16, wrap::vmvnq_s16,
1251-
wrap::vmvnq_u32, wrap::vmvnq_s32,
1252-
bitwise_not_u64, bitwise_not_s64,
1253-
bitwise_not_f32)
1254-
};
1255-
return dispatcher.apply(arg);
1256-
}
1257-
}
1258-
12591266
template <class A, class T, detail::enable_neon_type_t<T> = 0>
12601267
XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& arg, requires_arch<neon>) noexcept
12611268
{

0 commit comments

Comments
 (0)