
Commit 4278106

Generalize implementation of dot_impl to remove fp16 specialization
1 parent 3b349be commit 4278106
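
In short: the hand-written `dot_impl` specializations for `__half` (fp16.h) and `__nv_bfloat16` (bf16.h) are deleted, and the generic `dot_impl` in reduce.h is rewritten to work in chunks of `preferred_vector_size<T>::value` elements, accumulating full chunks with `ops::fma<T>` and folding in any leftover tail elements one at a time. A minimal sketch of calling the rewritten entry point directly, assuming the usual single-include header name and device-side compilation:

#include <cuda_fp16.h>
#include "kernel_float.h"  // assumed single-include header name

// Illustration only: detail::dot_impl<T, N>::call(left, right) is the interface
// shown in this diff. Before this commit, dot_impl<__half, 4> was a dedicated
// __hmul2/__hfma2 specialization in fp16.h; afterwards the generic version in
// reduce.h is expected to produce equivalent pairwise code.
__global__ void dot4_fp16(const __half* x, const __half* y, __half* out) {
    *out = kernel_float::detail::dot_impl<__half, 4>::call(x, y);
}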

File tree

5 files changed, +244 -362 lines changed

include/kernel_float/bf16.h

Lines changed: 0 additions & 49 deletions

@@ -226,55 +226,6 @@ using bfloat16 = __nv_bfloat16;
 //KERNEL_FLOAT_TYPE_ALIAS(float16x, __nv_bfloat16)
 //KERNEL_FLOAT_TYPE_ALIAS(f16x, __nv_bfloat16)
 
-#if KERNEL_FLOAT_CUDA_ARCH >= 800
-namespace detail {
-template<>
-struct dot_impl<__nv_bfloat16, 0> {
-    KERNEL_FLOAT_INLINE
-    static __nv_bfloat16 call(const __nv_bfloat16* left, const __nv_bfloat16* right) {
-        return __nv_bfloat16(0);
-    }
-};
-
-template<>
-struct dot_impl<__nv_bfloat16, 1> {
-    KERNEL_FLOAT_INLINE
-    static __nv_bfloat16 call(const __nv_bfloat16* left, const __nv_bfloat16* right) {
-        return __hmul(left[0], right[0]);
-    }
-};
-
-template<size_t N>
-struct dot_impl<__nv_bfloat16, N> {
-    static_assert(N >= 2, "internal error");
-
-    KERNEL_FLOAT_INLINE
-    static __nv_bfloat16 call(const __nv_bfloat16* left, const __nv_bfloat16* right) {
-        __nv_bfloat162 first_a = {left[0], left[1]};
-        __nv_bfloat162 first_b = {right[0], right[1]};
-        __nv_bfloat162 accum = __hmul2(first_a, first_b);
-
-#pragma unroll
-        for (size_t i = 2; i + 1 < N; i += 2) {
-            __nv_bfloat162 a = {left[i], left[i + 1]};
-            __nv_bfloat162 b = {right[i], right[i + 1]};
-            accum = __hfma2(a, b, accum);
-        }
-
-        __nv_bfloat16 result = __hadd(accum.x, accum.y);
-
-        if (N % 2 != 0) {
-            __nv_bfloat16 a = left[N - 1];
-            __nv_bfloat16 b = right[N - 1];
-            result = __hfma(a, b, result);
-        }
-
-        return result;
-    }
-};
-} // namespace detail
-#endif
-
 } // namespace kernel_float
 
 #if KERNEL_FLOAT_FP16_AVAILABLE

include/kernel_float/fp16.h

Lines changed: 0 additions & 49 deletions

@@ -174,55 +174,6 @@ using half = __half;
 //KERNEL_FLOAT_TYPE_ALIAS(float16x, __half)
 //KERNEL_FLOAT_TYPE_ALIAS(f16x, __half)
 
-#if KERNEL_FLOAT_IS_DEVICE
-namespace detail {
-template<>
-struct dot_impl<__half, 0> {
-    KERNEL_FLOAT_INLINE
-    static __half call(const __half* left, const __half* right) {
-        return __half(0);
-    }
-};
-
-template<>
-struct dot_impl<__half, 1> {
-    KERNEL_FLOAT_INLINE
-    static __half call(const __half* left, const __half* right) {
-        return __hmul(left[0], right[0]);
-    }
-};
-
-template<size_t N>
-struct dot_impl<__half, N> {
-    static_assert(N >= 2, "internal error");
-
-    KERNEL_FLOAT_INLINE
-    static __half call(const __half* left, const __half* right) {
-        __half2 first_a = {left[0], left[1]};
-        __half2 first_b = {right[0], right[1]};
-        __half2 accum = __hmul2(first_a, first_b);
-
-#pragma unroll
-        for (size_t i = 2; i + 2 <= N; i += 2) {
-            __half2 a = {left[i], left[i + 1]};
-            __half2 b = {right[i], right[i + 1]};
-            accum = __hfma2(a, b, accum);
-        }
-
-        __half result = __hadd(accum.x, accum.y);
-
-        if (N % 2 != 0) {
-            __half a = left[N - 1];
-            __half b = right[N - 1];
-            result = __hfma(a, b, result);
-        }
-
-        return result;
-    }
-};
-} // namespace detail
-#endif
-
 } // namespace kernel_float
 
 #endif

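Removing these specializations only preserves the old pairwise `__half2` strategy if the generic path is told to work two elements at a time. The mechanism is a trait read by the new `dot_impl` as `preferred_vector_size<T>::value`; the sketch below is a standalone illustration of that trait (the names mirror the library, but this is not the library's code, and the value 2 for `__half` is an assumption):

#include <cuda_fp16.h>
#include <cstddef>

// Illustrative only: a trait advertising how many lanes of T to process per step.
template<typename T>
struct preferred_vector_size {
    static constexpr std::size_t value = 1;  // scalar fallback
};

template<>
struct preferred_vector_size<__half> {
    static constexpr std::size_t value = 2;  // pairs map onto __hmul2 / __hfma2 elsewhere
};

static_assert(preferred_vector_size<__half>::value == 2, "fp16 is processed in pairs");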
include/kernel_float/reduce.h

Lines changed: 33 additions & 8 deletions

@@ -2,6 +2,7 @@
 #define KERNEL_FLOAT_REDUCE_H
 
 #include "binops.h"
+#include "triops.h"
 
 namespace kernel_float {
 namespace detail {
@@ -177,14 +178,38 @@ template<typename T, size_t N>
 struct dot_impl {
     KERNEL_FLOAT_INLINE
     static T call(const T* left, const T* right) {
-        vector_storage<T, N> intermediate;
-        detail::map_impl<ops::multiply<T>, N, T, T, T>::call(
-            ops::multiply<T>(),
-            intermediate.data(),
-            left,
-            right);
-
-        return detail::reduce_impl<ops::add<T>, N, T>::call(ops::add<T>(), intermediate.data());
+        static constexpr size_t K = preferred_vector_size<T>::value;
+        T result = {};
+
+        if constexpr (N / K > 0) {
+            T accum[K] = {T {}};
+            apply_impl<ops::multiply<T>, K, T, T, T>::call({}, accum, left, right);
+
+#pragma unroll
+            for (size_t i = 1; i < N / K; i++) {
+                apply_impl<ops::fma<T>, K, T, T, T, T>::call(
+                    ops::fma<T> {},
+                    accum,
+                    left + i * K,
+                    right + i * K,
+                    accum);
+            }
+
+            result = reduce_impl<ops::add<T>, K, T>::call({}, accum);
+        }
+
+        if constexpr (N % K > 0) {
+            for (size_t i = N - N % K; i < N; i++) {
+                apply_impl<ops::fma<T>, 1, T, T, T, T>::call(
+                    {},
+                    &result,
+                    left + i,
+                    right + i,
+                    &result);
+            }
+        }
+
+        return result;
     }
 };
 } // namespace detail

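The new generic `dot_impl` splits the N inputs into chunks of K = `preferred_vector_size<T>::value`: the first full chunk is a plain elementwise multiply into a K-wide accumulator, each further full chunk is folded in with `ops::fma<T>`, the K lanes are then reduced with `ops::add<T>`, and the N % K tail elements are folded in one by one. The following standalone sketch (plain floats, explicit loops instead of `apply_impl`/`reduce_impl`, a fixed K so it runs on the host) mirrors that control flow:

#include <cstddef>

// Sketch of the chunk-plus-tail strategy used by the rewritten dot_impl.
// K stands in for preferred_vector_size<T>::value.
template<typename T, std::size_t N, std::size_t K = 4>
T dot_sketch(const T* left, const T* right) {
    T result = {};

    if constexpr (N / K > 0) {
        // K-wide accumulator: lane j holds the partial sum of elements j, j+K, j+2K, ...
        T accum[K] = {};
        for (std::size_t j = 0; j < K; ++j) {
            accum[j] = left[j] * right[j];              // first chunk: plain multiply
        }
        for (std::size_t i = 1; i < N / K; ++i) {
            for (std::size_t j = 0; j < K; ++j) {
                accum[j] += left[i * K + j] * right[i * K + j];  // later chunks: fma
            }
        }
        for (std::size_t j = 0; j < K; ++j) {
            result += accum[j];                         // horizontal reduction of the K lanes
        }
    }

    if constexpr (N % K > 0) {
        for (std::size_t i = N - N % K; i < N; ++i) {
            result += left[i] * right[i];               // scalar tail
        }
    }

    return result;
}

For example, with N = 7 and K = 4 this computes one 4-wide chunk plus a 3-element tail, which is exactly the shape of work the library version delegates to `apply_impl` and `reduce_impl`.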
include/kernel_float/triops.h

Lines changed: 16 additions & 2 deletions

@@ -92,11 +92,25 @@ namespace ops {
 template<typename T>
 struct fma {
     KERNEL_FLOAT_INLINE T operator()(T a, T b, T c) {
-        return a * b + c;
+        return ops::add<T> {}(ops::multiply<T> {}(a, b), c);
    }
 };
+} // namespace ops
+
+namespace detail {
+template<typename T, size_t N>
+struct apply_impl<ops::fma<T>, N, T, T, T, T> {
+    KERNEL_FLOAT_INLINE
+    static void call(ops::fma<T>, T* output, const T* a, const T* b, const T* c) {
+        T temp[N];
+        apply_impl<ops::multiply<T>, N, T, T, T>::call({}, temp, a, b);
+        apply_impl<ops::add<T>, N, T, T, T>::call({}, output, temp, c);
+    }
+};
+} // namespace detail
 
 #if KERNEL_FLOAT_IS_DEVICE
+namespace ops {
 template<>
 struct fma<float> {
     KERNEL_FLOAT_INLINE float operator()(float a, float b, float c) {
@@ -110,8 +124,8 @@ struct fma<double> {
         return __fma_rn(a, b, c);
     }
 };
-#endif
 } // namespace ops
+#endif
 
 /**
  * Computes the result of `a * b + c`. This is done in a single operation if possible for the given vector type.

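The triops.h change gives `ops::fma<T>` an N-wide fallback `apply_impl` that decomposes into an elementwise multiply into a temporary followed by an elementwise add, and routes the scalar `fma<T>` through `ops::multiply<T>` and `ops::add<T>` (presumably so element types whose arithmetic is only defined through those functors keep working). Backends that have a true vectorized fma can, it appears, still supply a more specific specialization that takes precedence. A standalone sketch of the two-stage fallback, with plain functors standing in for the library's `apply_impl` dispatch:

#include <cstddef>

// Sketch of the generic fma fallback: multiply into a temporary, then add.
// Mul and Add stand in for ops::multiply<T> and ops::add<T>.
template<typename T, std::size_t N, typename Mul, typename Add>
void fma_fallback(T* output, const T* a, const T* b, const T* c, Mul mul, Add add) {
    T temp[N];
    for (std::size_t i = 0; i < N; ++i) {
        temp[i] = mul(a[i], b[i]);       // stage 1: elementwise multiply
    }
    for (std::size_t i = 0; i < N; ++i) {
        output[i] = add(temp[i], c[i]);  // stage 2: elementwise add
    }
}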