|
23 | 23 | // Wrap intrinsics so we can pass them as function pointers |
24 | 24 | // - OP: intrinsics name prefix, e.g., vorrq |
25 | 25 | // - RT: type traits to deduce intrinsics return types |
26 | | -#define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ |
| 26 | +#define WRAP_BINARY_UINT_EXCLUDING_64(OP, RT) \ |
27 | 27 | namespace wrap \ |
28 | 28 | { \ |
29 | 29 | inline RT<uint8x16_t> OP##_u8(uint8x16_t a, uint8x16_t b) noexcept \ |
30 | 30 | { \ |
31 | 31 | return ::OP##_u8(a, b); \ |
32 | 32 | } \ |
33 | | - inline RT<int8x16_t> OP##_s8(int8x16_t a, int8x16_t b) noexcept \ |
34 | | - { \ |
35 | | - return ::OP##_s8(a, b); \ |
36 | | - } \ |
37 | 33 | inline RT<uint16x8_t> OP##_u16(uint16x8_t a, uint16x8_t b) noexcept \ |
38 | 34 | { \ |
39 | 35 | return ::OP##_u16(a, b); \ |
40 | 36 | } \ |
41 | | - inline RT<int16x8_t> OP##_s16(int16x8_t a, int16x8_t b) noexcept \ |
42 | | - { \ |
43 | | - return ::OP##_s16(a, b); \ |
44 | | - } \ |
45 | 37 | inline RT<uint32x4_t> OP##_u32(uint32x4_t a, uint32x4_t b) noexcept \ |
46 | 38 | { \ |
47 | 39 | return ::OP##_u32(a, b); \ |
48 | 40 | } \ |
49 | | - inline RT<int32x4_t> OP##_s32(int32x4_t a, int32x4_t b) noexcept \ |
50 | | - { \ |
51 | | - return ::OP##_s32(a, b); \ |
52 | | - } \ |
| 41 | + } |
| 42 | + |
| 43 | +#define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ |
| 44 | + WRAP_BINARY_UINT_EXCLUDING_64(OP, RT) \ |
| 45 | + namespace wrap \ |
| 46 | + { \ |
| 47 | + inline RT<int8x16_t> OP##_s8(int8x16_t a, int8x16_t b) noexcept \ |
| 48 | + { \ |
| 49 | + return ::OP##_s8(a, b); \ |
| 50 | + } \ |
| 51 | + inline RT<int16x8_t> OP##_s16(int16x8_t a, int16x8_t b) noexcept \ |
| 52 | + { \ |
| 53 | + return ::OP##_s16(a, b); \ |
| 54 | + } \ |
| 55 | + inline RT<int32x4_t> OP##_s32(int32x4_t a, int32x4_t b) noexcept \ |
| 56 | + { \ |
| 57 | + return ::OP##_s32(a, b); \ |
| 58 | + } \ |
53 | 59 | } |
54 | 60 |
|
55 | 61 | #define WRAP_BINARY_INT(OP, RT) \ |
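
The wrappers exist because NEON intrinsics may be implemented as compiler builtins or macros, so their address cannot always be taken directly; the macro generates ordinary inline functions whose pointers can be stored in a dispatcher tuple. As a rough illustration, the `vhaddq` instantiation used further down would expand to something like the following (assuming `detail::identity_return_type<T>` resolves to `T` itself, which is not shown in this hunk):

```cpp
// Approximate expansion of WRAP_BINARY_UINT_EXCLUDING_64(vhaddq, detail::identity_return_type),
// assuming identity_return_type<T> is just T. The result is a set of real, addressable
// functions wrapping the ::vhaddq_* intrinsics so they can be passed as function pointers.
#include <arm_neon.h>

namespace wrap
{
    inline uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b) noexcept { return ::vhaddq_u8(a, b); }
    inline uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b) noexcept { return ::vhaddq_u16(a, b); }
    inline uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b) noexcept { return ::vhaddq_u32(a, b); }
}
```
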
@@ -204,6 +210,10 @@ namespace xsimd |
204 | 210 | uint32x4_t, int32x4_t, |
205 | 211 | float32x4_t>; |
206 | 212 |
|
| 213 | + using excluding_int64f32_dispatcher = neon_dispatcher_impl<uint8x16_t, int8x16_t, |
| 214 | + uint16x8_t, int16x8_t, |
| 215 | + uint32x4_t, int32x4_t>; |
| 216 | + |
207 | 217 | /************************** |
208 | 218 | * comparison dispatchers * |
209 | 219 | **************************/ |
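
The new `excluding_int64f32_dispatcher` alias covers all the 8/16/32-bit signed and unsigned register types, matching the wrapper set produced by `WRAP_BINARY_INT_EXCLUDING_64`. The exact definition of `neon_dispatcher_impl` is not part of this hunk, but its `binary` member is used below as an aggregate holding one function pointer per register type, selected by the argument type at `apply` time. A minimal sketch of that idea (not the actual xsimd implementation) could look like:

```cpp
// Minimal sketch of a type-directed binary dispatcher, assuming the same shape as
// neon_dispatcher_impl<...>::binary used below: a tuple of function pointers, one per
// register type, where apply() picks the entry whose signature matches the arguments.
#include <tuple>

template <class... Regs>
struct binary_dispatcher_sketch
{
    std::tuple<Regs (*)(Regs, Regs)...> fns; // e.g. { wrap::vhaddq_u8, wrap::vhaddq_u16, ... }

    template <class Reg>
    Reg apply(Reg lhs, Reg rhs) const noexcept
    {
        // std::get by type: each register type appears exactly once, so the lookup is unambiguous
        return std::get<Reg (*)(Reg, Reg)>(fns)(lhs, rhs);
    }
};
```
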
@@ -744,6 +754,22 @@ namespace xsimd |
744 | 754 | return dispatcher.apply(register_type(lhs), register_type(rhs)); |
745 | 755 | } |
746 | 756 |
|
| 757 | + /******* |
| 758 | + * avg * |
| 759 | + *******/ |
| 760 | + |
| 761 | + WRAP_BINARY_UINT_EXCLUDING_64(vhaddq, detail::identity_return_type) |
| 762 | + |
| 762 | + |
| 763 | + template <class A, class T, class = typename std::enable_if<(std::is_unsigned<T>::value && sizeof(T) != 8), void>::type> |
| 764 | + inline batch<T, A> avg(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept |
| 765 | + { |
| 766 | + using register_type = typename batch<T, A>::register_type; |
| 767 | + const detail::neon_dispatcher_impl<uint8x16_t, uint16x8_t, uint32x4_t>::binary dispatcher = { |
| 768 | + std::make_tuple(wrap::vhaddq_u8, wrap::vhaddq_u16, wrap::vhaddq_u32) |
| 769 | + }; |
| 770 | + return dispatcher.apply(register_type(lhs), register_type(rhs)); |
| 771 | + } |
| 772 | + |
747 | 773 | /******** |
748 | 774 | * sadd * |
749 | 775 | ********/ |
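
The unsigned `avg` kernel maps directly onto the NEON halving-add intrinsics: `vhaddq_*` computes `(a + b) >> 1` per lane using a widened intermediate sum, so the addition cannot overflow the lane width. A small standalone check of that property (assumes an ARM target with NEON available):

```cpp
// vhaddq_u8 averages each pair of lanes without overflowing uint8_t:
// 200 + 101 = 301 does not fit in 8 bits, yet the halving add still yields 150.
#include <arm_neon.h>
#include <cstdio>

int main()
{
    uint8x16_t a = vdupq_n_u8(200);
    uint8x16_t b = vdupq_n_u8(101);
    uint8x16_t avg = vhaddq_u8(a, b);                      // truncating average per lane
    std::printf("%u\n", (unsigned)vgetq_lane_u8(avg, 0));  // prints 150
    return 0;
}
```
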
|