@@ -717,16 +717,10 @@ namespace xsimd
717717 return vnegq_s32 (rhs);
718718 }
719719
720- template <class A , class T , detail::enable_sized_unsigned_t <T, 8 > = 0 >
721- XSIMD_INLINE batch<T, A> neg (batch<T, A> const & rhs, requires_arch<neon>) noexcept
722- {
723- return batch<T, A> { -rhs.get (0 ), -rhs.get (1 ) };
724- }
725-
726- template <class A , class T , detail::enable_sized_signed_t <T, 8 > = 0 >
720+ template <class A , class T , detail::enable_sized_integral_t <T, 8 > = 0 >
727721 XSIMD_INLINE batch<T, A> neg (batch<T, A> const & rhs, requires_arch<neon>) noexcept
728722 {
729- return batch<T, A> { -rhs. get ( 0 ), -rhs. get ( 1 ) } ;
723+ return 0 - rhs ;
730724 }
731725
732726 template <class A >
@@ -923,16 +917,28 @@ namespace xsimd
923917 return dispatcher.apply (register_type (lhs), register_type (rhs));
924918 }
925919
926- template <class A , class T , detail::enable_sized_integral_t <T, 8 > = 0 >
920+ template <class A , class T , detail::enable_sized_unsigned_t <T, 8 > = 0 >
921+ XSIMD_INLINE batch_bool<T, A> eq (batch<T, A> const & lhs, batch<T, A> const & rhs, requires_arch<neon>) noexcept
922+ {
923+ auto eq32 = vceqq_u32 (vreinterpretq_u32_u64 (lhs.data ), vreinterpretq_u32_u64 (rhs.data ));
924+ auto rev32 = vrev64q_u32 (eq32);
925+ auto eq64 = vandq_u32 (eq32, rev32);
926+ return batch_bool<T, A>(vreinterpretq_u64_u32 (eq64));
927+ }
928+
929+ template <class A , class T , detail::enable_sized_signed_t <T, 8 > = 0 >
927930 XSIMD_INLINE batch_bool<T, A> eq (batch<T, A> const & lhs, batch<T, A> const & rhs, requires_arch<neon>) noexcept
928931 {
929- return batch_bool<T, A>({ lhs.get (0 ) == rhs.get (0 ), lhs.get (1 ) == rhs.get (1 ) });
932+ auto eq32 = vceqq_u32 (vreinterpretq_u32_s64 (lhs.data ), vreinterpretq_u32_s64 (rhs.data ));
933+ auto rev32 = vrev64q_u32 (eq32);
934+ auto eq64 = vandq_u32 (eq32, rev32);
935+ return batch_bool<T, A>(vreinterpretq_u64_u32 (eq64));
930936 }
931937
932938 template <class A , class T , detail::enable_sized_integral_t <T, 8 > = 0 >
933939 XSIMD_INLINE batch_bool<T, A> eq (batch_bool<T, A> const & lhs, batch_bool<T, A> const & rhs, requires_arch<neon>) noexcept
934940 {
935- return batch_bool <T, A>( { lhs.get ( 0 ) == rhs. get ( 0 ), lhs. get ( 1 ) == rhs.get ( 1 ) });
941+ return eq (batch <T, A> { lhs.data }, batch<T, A> { rhs.data }, A { });
936942 }
937943
938944 /* ************
@@ -985,10 +991,19 @@ namespace xsimd
985991 return dispatcher.apply (register_type (lhs), register_type (rhs));
986992 }
987993
988- template <class A , class T , detail::enable_sized_integral_t <T, 8 > = 0 >
994+ template <class A , class T , detail::enable_sized_signed_t <T, 8 > = 0 >
989995 XSIMD_INLINE batch_bool<T, A> lt (batch<T, A> const & lhs, batch<T, A> const & rhs, requires_arch<neon>) noexcept
990996 {
991- return batch_bool<T, A>({ lhs.get (0 ) < rhs.get (0 ), lhs.get (1 ) < rhs.get (1 ) });
997+ using register_type = typename batch<T, A>::register_type;
998+ return batch_bool<T, A>(vreinterpretq_u64_s64 (vshrq_n_s64 (vqsubq_s64 (register_type (lhs), register_type (rhs)), 63 )));
999+ }
1000+
1001+ template <class A , class T , detail::enable_sized_unsigned_t <T, 8 > = 0 >
1002+ XSIMD_INLINE batch_bool<T, A> lt (batch<T, A> const & lhs, batch<T, A> const & rhs, requires_arch<neon>) noexcept
1003+ {
1004+ using register_type = typename batch<T, A>::register_type;
1005+ register_type acc = { 0x7FFFFFFFFFFFFFFFull , 0x7FFFFFFFFFFFFFFFull };
1006+ return batch_bool<T, A>(vreinterpretq_u64_s64 (vshrq_n_s64 (vreinterpretq_s64_u64 (vqaddq_u64 (vqsubq_u64 (register_type (rhs), register_type (lhs)), acc)), 63 )));
9921007 }
9931008
9941009 /* *****
@@ -1012,12 +1027,24 @@ namespace xsimd
10121027 template <class A , class T , detail::enable_sized_integral_t <T, 8 > = 0 >
10131028 XSIMD_INLINE batch_bool<T, A> le (batch<T, A> const & lhs, batch<T, A> const & rhs, requires_arch<neon>) noexcept
10141029 {
1015- return batch_bool<T, A>({ lhs. get ( 0 ) <= rhs. get ( 0 ), lhs. get ( 1 ) <= rhs. get ( 1 ) } );
1030+ return !( lhs > rhs);
10161031 }
10171032
10181033 /* *****
10191034 * gt *
10201035 ******/
1036+ namespace detail
1037+ {
1038+ XSIMD_INLINE int64x2_t bitwise_not_s64 (int64x2_t arg) noexcept
1039+ {
1040+ return vreinterpretq_s64_s32 (vmvnq_s32 (vreinterpretq_s32_s64 (arg)));
1041+ }
1042+
1043+ XSIMD_INLINE uint64x2_t bitwise_not_u64 (uint64x2_t arg) noexcept
1044+ {
1045+ return vreinterpretq_u64_u32 (vmvnq_u32 (vreinterpretq_u32_u64 (arg)));
1046+ }
1047+ }
10211048
10221049 WRAP_BINARY_INT_EXCLUDING_64 (vcgtq, detail::comp_return_type)
10231050 WRAP_BINARY_FLOAT (vcgtq, detail::comp_return_type)
@@ -1033,10 +1060,19 @@ namespace xsimd
10331060 return dispatcher.apply (register_type (lhs), register_type (rhs));
10341061 }
10351062
1036- template <class A , class T , detail::enable_sized_integral_t <T, 8 > = 0 >
1063+ template <class A , class T , detail::enable_sized_signed_t <T, 8 > = 0 >
10371064 XSIMD_INLINE batch_bool<T, A> gt (batch<T, A> const & lhs, batch<T, A> const & rhs, requires_arch<neon>) noexcept
10381065 {
1039- return batch_bool<T, A>({ lhs.get (0 ) > rhs.get (0 ), lhs.get (1 ) > rhs.get (1 ) });
1066+ using register_type = typename batch<T, A>::register_type;
1067+ return batch_bool<T, A>(vreinterpretq_u64_s64 (vshrq_n_s64 (vqsubq_s64 (register_type (rhs), register_type (lhs)), 63 )));
1068+ }
1069+
1070+ template <class A , class T , detail::enable_sized_unsigned_t <T, 8 > = 0 >
1071+ XSIMD_INLINE batch_bool<T, A> gt (batch<T, A> const & lhs, batch<T, A> const & rhs, requires_arch<neon>) noexcept
1072+ {
1073+ using register_type = typename batch<T, A>::register_type;
1074+ register_type acc = { 0x7FFFFFFFFFFFFFFFull , 0x7FFFFFFFFFFFFFFFull };
1075+ return batch_bool<T, A>(vreinterpretq_u64_s64 (vshrq_n_s64 (vreinterpretq_s64_u64 (vqaddq_u64 (vqsubq_u64 (register_type (lhs), register_type (rhs)), acc)), 63 )));
10401076 }
10411077
10421078 /* *****
@@ -1060,7 +1096,7 @@ namespace xsimd
10601096 template <class A , class T , detail::enable_sized_integral_t <T, 8 > = 0 >
10611097 XSIMD_INLINE batch_bool<T, A> ge (batch<T, A> const & lhs, batch<T, A> const & rhs, requires_arch<neon>) noexcept
10621098 {
1063- return batch_bool<T, A>({ lhs. get ( 0 ) >= rhs. get ( 0 ), lhs. get ( 1 ) >= rhs. get ( 1 ) } );
1099+ return !( lhs < rhs);
10641100 }
10651101
10661102 /* ******************
@@ -1212,16 +1248,6 @@ namespace xsimd
12121248
12131249 namespace detail
12141250 {
1215- XSIMD_INLINE int64x2_t bitwise_not_s64 (int64x2_t arg) noexcept
1216- {
1217- return vreinterpretq_s64_s32 (vmvnq_s32 (vreinterpretq_s32_s64 (arg)));
1218- }
1219-
1220- XSIMD_INLINE uint64x2_t bitwise_not_u64 (uint64x2_t arg) noexcept
1221- {
1222- return vreinterpretq_u64_u32 (vmvnq_u32 (vreinterpretq_u32_u64 (arg)));
1223- }
1224-
12251251 XSIMD_INLINE float32x4_t bitwise_not_f32 (float32x4_t arg) noexcept
12261252 {
12271253 return vreinterpretq_f32_u32 (vmvnq_u32 (vreinterpretq_u32_f32 (arg)));
@@ -1314,7 +1340,7 @@ namespace xsimd
13141340 template <class A , class T , detail::enable_sized_integral_t <T, 8 > = 0 >
13151341 XSIMD_INLINE batch<T, A> min (batch<T, A> const & lhs, batch<T, A> const & rhs, requires_arch<neon>) noexcept
13161342 {
1317- return { std::min (lhs. get ( 0 ) , rhs. get ( 0 )), std::min ( lhs. get ( 1 ), rhs. get ( 1 )) } ;
1343+ return select (lhs > rhs , rhs, lhs) ;
13181344 }
13191345
13201346 /* ******
@@ -1338,7 +1364,7 @@ namespace xsimd
13381364 template <class A , class T , detail::enable_sized_integral_t <T, 8 > = 0 >
13391365 XSIMD_INLINE batch<T, A> max (batch<T, A> const & lhs, batch<T, A> const & rhs, requires_arch<neon>) noexcept
13401366 {
1341- return { std::max (lhs. get ( 0 ), rhs. get ( 0 )), std::max ( lhs. get ( 1 ) , rhs. get ( 1 )) } ;
1367+ return select (lhs > rhs, lhs, rhs) ;
13421368 }
13431369
13441370 /* ******
0 commit comments