Commit bde4df7

Improve the Neon implementation of comparison operators for 64-bit integers
The previous implementation relied on per-lane accessors, which is inefficient. Rely on a combination of saturating adds/subs instead.
1 parent 9f587fc commit bde4df7

File tree: 1 file changed (+55 -29 lines)


include/xsimd/arch/xsimd_neon.hpp

Lines changed: 55 additions & 29 deletions
@@ -717,16 +717,10 @@ namespace xsimd
             return vnegq_s32(rhs);
         }

-        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
-        XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
-        {
-            return batch<T, A> { -rhs.get(0), -rhs.get(1) };
-        }
-
-        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
         XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
-            return batch<T, A> { -rhs.get(0), -rhs.get(1) };
+            return 0 - rhs;
         }

         template <class A>
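
Note: `vnegq_s64` only exists on AArch64, while ARMv7 NEON does provide a full-width 64-bit subtract, so writing the negation as `0 - rhs` keeps the operation in a single vector instruction on both targets instead of extracting and re-inserting each lane. A minimal standalone sketch of the equivalent intrinsic sequence (illustrative only; `neg_s64_sketch` is not part of the commit):

    #include <arm_neon.h>

    // Two's-complement negation as a subtraction from zero; vsubq_s64 and
    // vdupq_n_s64 are available on ARMv7 NEON as well as AArch64.
    int64x2_t neg_s64_sketch(int64x2_t x)
    {
        return vsubq_s64(vdupq_n_s64(0), x);
    }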
@@ -923,16 +917,28 @@ namespace xsimd
             return dispatcher.apply(register_type(lhs), register_type(rhs));
         }

-        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            auto eq32 = vceqq_u32(vreinterpretq_u32_u64(lhs.data), vreinterpretq_u32_u64(rhs.data));
+            auto rev32 = vrev64q_u32(eq32);
+            auto eq64 = vandq_u32(eq32, rev32);
+            return batch_bool<T, A>(vreinterpretq_u64_u32(eq64));
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
         XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
-            return batch_bool<T, A>({ lhs.get(0) == rhs.get(0), lhs.get(1) == rhs.get(1) });
+            auto eq32 = vceqq_u32(vreinterpretq_u32_s64(lhs.data), vreinterpretq_u32_s64(rhs.data));
+            auto rev32 = vrev64q_u32(eq32);
+            auto eq64 = vandq_u32(eq32, rev32);
+            return batch_bool<T, A>(vreinterpretq_u64_u32(eq64));
         }

         template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
         XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
         {
-            return batch_bool<T, A>({ lhs.get(0) == rhs.get(0), lhs.get(1) == rhs.get(1) });
+            return eq(batch<T, A> { lhs.data }, batch<T, A> { rhs.data }, A {});
         }

         /*************
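
How the new 64-bit `eq` works: ARMv7 NEON has no 64-bit `vceqq`, so the kernel compares the register as four 32-bit lanes, swaps the two halves of each 64-bit lane with `vrev64q_u32`, and ANDs the two masks; a 64-bit lane ends up all-ones exactly when both of its 32-bit halves compared equal. A scalar model of one lane (illustrative only; the function name is made up):

    #include <cstdint>

    // Models the vceqq_u32 / vrev64q_u32 / vandq_u32 sequence for one 64-bit lane.
    uint64_t eq64_via_eq32(uint64_t a, uint64_t b)
    {
        uint32_t lo = uint32_t(a) == uint32_t(b) ? ~0u : 0u; // low 32-bit lanes compared
        uint32_t hi = (a >> 32) == (b >> 32) ? ~0u : 0u;     // high 32-bit lanes compared
        uint32_t both = lo & hi;                             // AND with the lane-swapped mask
        return (uint64_t(both) << 32) | both;                // all-ones iff a == b
    }

The `batch_bool` overload then simply forwards to these kernels instead of duplicating the per-lane comparison.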
@@ -985,10 +991,19 @@ namespace xsimd
             return dispatcher.apply(register_type(lhs), register_type(rhs));
         }

-        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
         XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
-            return batch_bool<T, A>({ lhs.get(0) < rhs.get(0), lhs.get(1) < rhs.get(1) });
+            using register_type = typename batch<T, A>::register_type;
+            return batch_bool<T, A>(vreinterpretq_u64_s64(vshrq_n_s64(vqsubq_s64(register_type(lhs), register_type(rhs)), 63)));
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            using register_type = typename batch<T, A>::register_type;
+            register_type acc = { 0x7FFFFFFFFFFFFFFFull, 0x7FFFFFFFFFFFFFFFull };
+            return batch_bool<T, A>(vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_u64(vqaddq_u64(vqsubq_u64(register_type(rhs), register_type(lhs)), acc)), 63)));
         }

         /******
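
Why the signed `lt` is correct: a plain `lhs - rhs` can overflow and flip the sign bit, but the saturating subtract (`vqsubq_s64`) clamps the difference to the int64 range, so the sign of the result always matches the true ordering; the arithmetic shift right by 63 then smears that sign bit across the lane, producing the all-ones/all-zeros mask `batch_bool` expects. A scalar model of one lane, using GCC/Clang's `__int128` purely for illustration:

    #include <cstdint>
    #include <limits>

    // Models vqsubq_s64 followed by vshrq_n_s64(..., 63) for one lane.
    uint64_t lt_s64_sketch(int64_t lhs, int64_t rhs)
    {
        __int128 wide = __int128(lhs) - rhs;                       // exact difference
        int64_t diff = wide > std::numeric_limits<int64_t>::max() ? std::numeric_limits<int64_t>::max()
                     : wide < std::numeric_limits<int64_t>::min() ? std::numeric_limits<int64_t>::min()
                     : int64_t(wide);                              // saturating subtract
        return uint64_t(diff >> 63);                               // sign smear (arithmetic shift):
    }                                                              // all-ones iff lhs < rhs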
@@ -1012,12 +1027,24 @@ namespace xsimd
         template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
         XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
-            return batch_bool<T, A>({ lhs.get(0) <= rhs.get(0), lhs.get(1) <= rhs.get(1) });
+            return !(lhs > rhs);
         }

         /******
          * gt *
          ******/
+        namespace detail
+        {
+            XSIMD_INLINE int64x2_t bitwise_not_s64(int64x2_t arg) noexcept
+            {
+                return vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(arg)));
+            }
+
+            XSIMD_INLINE uint64x2_t bitwise_not_u64(uint64x2_t arg) noexcept
+            {
+                return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(arg)));
+            }
+        }

         WRAP_BINARY_INT_EXCLUDING_64(vcgtq, detail::comp_return_type)
         WRAP_BINARY_FLOAT(vcgtq, detail::comp_return_type)
@@ -1033,10 +1060,19 @@ namespace xsimd
             return dispatcher.apply(register_type(lhs), register_type(rhs));
         }

-        template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
         XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
-            return batch_bool<T, A>({ lhs.get(0) > rhs.get(0), lhs.get(1) > rhs.get(1) });
+            using register_type = typename batch<T, A>::register_type;
+            return batch_bool<T, A>(vreinterpretq_u64_s64(vshrq_n_s64(vqsubq_s64(register_type(rhs), register_type(lhs)), 63)));
+        }
+
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            using register_type = typename batch<T, A>::register_type;
+            register_type acc = { 0x7FFFFFFFFFFFFFFFull, 0x7FFFFFFFFFFFFFFFull };
+            return batch_bool<T, A>(vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_u64(vqaddq_u64(vqsubq_u64(register_type(lhs), register_type(rhs)), acc)), 63)));
         }

         /******
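
The unsigned variants need one extra step, since an unsigned difference carries no sign bit: `vqsubq_u64` saturates to zero whenever the subtrahend is the larger value, and the saturating add of `0x7FFFFFFFFFFFFFFF` turns any nonzero difference into a value with the top bit set (1 + 0x7FFF... = 0x8000..., larger differences saturate at 0xFFFF...), so the same shift-by-63 yields the mask. A scalar model of one lane, illustrative only:

    #include <cstdint>

    // Models vqsubq_u64 + vqaddq_u64 + vshrq_n_s64(..., 63) for unsigned gt.
    uint64_t gt_u64_sketch(uint64_t lhs, uint64_t rhs)
    {
        uint64_t bias = 0x7FFFFFFFFFFFFFFFull;
        uint64_t diff = lhs > rhs ? lhs - rhs : 0;                 // vqsubq_u64: floors at 0
        uint64_t sum = diff > UINT64_MAX - bias ? UINT64_MAX       // vqaddq_u64: ceilings at max;
                                                : diff + bias;     // top bit set iff diff != 0
        return uint64_t(int64_t(sum) >> 63);                       // all-ones iff lhs > rhs
    }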
@@ -1060,7 +1096,7 @@ namespace xsimd
         template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
         XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
-            return batch_bool<T, A>({ lhs.get(0) >= rhs.get(0), lhs.get(1) >= rhs.get(1) });
+            return !(lhs < rhs);
         }

         /*******************
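
`le` and `ge` now derive from the strict comparisons by logical complement, which is exact for integers since every pair of lanes is ordered (the same identity would be wrong for floats because of NaN). This is presumably also why the `bitwise_not_s64`/`bitwise_not_u64` helpers were hoisted above the `gt` section: the `!` operator on a 64-bit `batch_bool` needs them to be visible at this point in the header. A quick property check, illustrative only:

    #include <cassert>
    #include <cstdint>

    // On integers, !(a < b) is exactly a >= b; no unordered pairs exist.
    void ge_identity_check()
    {
        const int64_t vals[] = { INT64_MIN, -1, 0, 1, INT64_MAX };
        for (int64_t a : vals)
            for (int64_t b : vals)
                assert((a >= b) == !(a < b));
    }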
@@ -1212,16 +1248,6 @@ namespace xsimd

         namespace detail
         {
-            XSIMD_INLINE int64x2_t bitwise_not_s64(int64x2_t arg) noexcept
-            {
-                return vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(arg)));
-            }
-
-            XSIMD_INLINE uint64x2_t bitwise_not_u64(uint64x2_t arg) noexcept
-            {
-                return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(arg)));
-            }
-
             XSIMD_INLINE float32x4_t bitwise_not_f32(float32x4_t arg) noexcept
             {
                 return vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(arg)));
@@ -1314,7 +1340,7 @@ namespace xsimd
         template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
         XSIMD_INLINE batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
-            return { std::min(lhs.get(0), rhs.get(0)), std::min(lhs.get(1), rhs.get(1)) };
+            return select(lhs > rhs, rhs, lhs);
         }

         /*******
@@ -1338,7 +1364,7 @@ namespace xsimd
         template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
         XSIMD_INLINE batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
        {
-            return { std::max(lhs.get(0), rhs.get(0)), std::max(lhs.get(1), rhs.get(1)) };
+            return select(lhs > rhs, lhs, rhs);
         }

         /*******
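
`min` and `max` now stay in NEON registers as well: the comparison produces a full-lane mask and `select` blends the two inputs (on NEON, `select` is expected to lower to a `vbslq` bit-select). A scalar model of the select-based `min`, illustrative only:

    #include <cstdint>

    // Models select(lhs > rhs, rhs, lhs): a bitwise blend driven by the
    // all-ones/all-zeros comparison mask, which is what vbslq does per lane.
    int64_t min_via_select(int64_t lhs, int64_t rhs)
    {
        uint64_t mask = lhs > rhs ? ~uint64_t(0) : 0;              // gt mask
        uint64_t blend = (uint64_t(rhs) & mask) | (uint64_t(lhs) & ~mask);
        return int64_t(blend);
    }

`max` is the mirror image, selecting `lhs` where the mask is set.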
