KernelTuner
diff --git a/‎include/kernel_float/binops.h‎
Lines changed: 34 additions & 3 deletions b/‎include/kernel_float/binops.h‎
Lines changed: 34 additions & 3 deletions
diff --git a/‎include/kernel_float/fp16.h‎
Lines changed: 0 additions & 1 deletion b/‎include/kernel_float/fp16.h‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎include/kernel_float/reduce.h‎
Lines changed: 13 additions & 6 deletions b/‎include/kernel_float/reduce.h‎
Lines changed: 13 additions & 6 deletions
@@ -172,15 +172,46 @@ KERNEL_FLOAT_DEFINE_BINARY_ASSIGN_OP(bit_xor, ^=)
 KERNEL_FLOAT_DEFINE_BINARY_FUN(min)
 KERNEL_FLOAT_DEFINE_BINARY_FUN(max)
 KERNEL_FLOAT_DEFINE_BINARY_FUN(copysign)
-KERNEL_FLOAT_DEFINE_BINARY_FUN(hypot)
 KERNEL_FLOAT_DEFINE_BINARY_FUN(modf)
 KERNEL_FLOAT_DEFINE_BINARY_FUN(nextafter)
 KERNEL_FLOAT_DEFINE_BINARY_FUN(pow)
 KERNEL_FLOAT_DEFINE_BINARY_FUN(remainder)
 
-#if KERNEL_FLOAT_CUDA_DEVICE
-KERNEL_FLOAT_DEFINE_BINARY_FUN(rhypot)
+KERNEL_FLOAT_DEFINE_BINARY(hypot, (ops::sqrt<T>()(left * left + right * right)))  // generic form: sqrt(left^2 + right^2)
+KERNEL_FLOAT_DEFINE_BINARY(rhypot, (T(1) / ops::hypot<T>()(left, right)))  // generic form: reciprocal of hypot(left, right)
+
+namespace ops {
+template<>
+struct hypot<double> {  // double precision: forward to the two-argument ::hypot
+    KERNEL_FLOAT_INLINE double operator()(double left, double right) {
+        return ::hypot(left, right);
+    }
+};
+
+template<>
+struct hypot<float> {  // single precision: forward to ::hypotf
+    KERNEL_FLOAT_INLINE float operator()(float left, float right) {
+        return ::hypotf(left, right);
+    }
+};
+
+// ::rhypot and ::rhypotf are only supported on the GPU
+#if KERNEL_FLOAT_IS_DEVICE
+template<>
+struct rhypot<double> {  // double precision: forward to ::rhypot
+    KERNEL_FLOAT_INLINE double operator()(double left, double right) {
+        return ::rhypot(left, right);
+    }
+};
+
+template<>
+struct rhypot<float> {  // single precision: forward to ::rhypotf
+    KERNEL_FLOAT_INLINE float operator()(float left, float right) {
+        return ::rhypotf(left, right);
+    }
+};
 #endif
+}  // namespace ops
 
 #if KERNEL_FLOAT_IS_DEVICE
 #define KERNEL_FLOAT_DEFINE_BINARY_FAST(FUN_NAME, OP_NAME, FLOAT_FUN)     \
 
@@ -199,7 +199,6 @@ KERNEL_FLOAT_FP16_BINARY_FUN(multiply, __hmul, __hmul2)
 KERNEL_FLOAT_FP16_BINARY_FUN(divide, __hdiv, __h2div)
 KERNEL_FLOAT_FP16_BINARY_FUN(min, __hmin, __hmin2)
 KERNEL_FLOAT_FP16_BINARY_FUN(max, __hmax, __hmax2)
-
 KERNEL_FLOAT_FP16_BINARY_FUN(fast_div, __hdiv, __h2div)
 
 KERNEL_FLOAT_FP16_BINARY_FUN(equal_to, __heq, __heq2)
 
@@ -144,7 +144,14 @@ template<typename T, size_t N>
 struct dot_impl {
     KERNEL_FLOAT_INLINE
     static T call(const vector_storage<T, N>& left, const vector_storage<T, N>& right) {
-        return sum(zip(ops::multiply<T> {}, left, right));
+        // Multiply element-wise into a temporary, then add up the products.
+        vector_storage<T, N> products;
+        detail::apply_impl<ops::multiply<T>, N, T, T, T>::call(
+            ops::multiply<T>(),
+            products.data(),
+            left.data(),
+            right.data());
+        return detail::reduce_impl<ops::add<T>, N, T>::call(ops::add<T>(), products.data());
     }
 };
 }  // namespace detail
@@ -197,25 +204,25 @@ template<typename T>
 struct magnitude_impl<T, 2> {
     KERNEL_FLOAT_INLINE
     static T call(const vector_storage<T, 2>& input) {
-        return ops::hypot<T> {}(input.data()[0], input.data()[1]);
+        return ops::hypot<T>()(input.data()[0], input.data()[1]);  // two elements: apply ops::hypot to element 0 and element 1
     }
 };
 
-// The 3-argument overload of hypot is only available from C++17
-#ifdef __cpp_lib_hypot
+// The 3-argument overload of hypot is only available on host from C++17
+#if defined(__cpp_lib_hypot) && KERNEL_FLOAT_IS_HOST
 template<>
 struct magnitude_impl<float, 3> {
     KERNEL_FLOAT_INLINE
     static float call(const vector_storage<float, 3>& input) {
-        return std::hypot(input.data()[0], input.data()[1], input.data()[2]);
+        return ::hypot(input.data()[0], input.data()[1], input.data()[2]);  // NOTE(review): C++17 specifies the 3-argument overload as std::hypot in <cmath>; confirm ::hypot finds it on every host toolchain
     }
 };
 
 template<>
 struct magnitude_impl<double, 3> {
     KERNEL_FLOAT_INLINE
-    static float call(const vector_storage<double, 3>& input) {
-        return std::hypot(input.data()[0], input.data()[1], input.data()[2]);
+    static double call(const vector_storage<double, 3>& input) {  // returns double so the 3-element magnitude is not narrowed to float
+        return ::hypot(input.data()[0], input.data()[1], input.data()[2]);
     }
 };
 #endif