KernelTuner
diff --git a/include/kernel_float/bf16.h b/include/kernel_float/bf16.h
Lines changed: 39 additions & 23 deletions
diff --git a/include/kernel_float/binops.h b/include/kernel_float/binops.h
Lines changed: 8 additions & 8 deletions
diff --git a/include/kernel_float/conversion.h b/include/kernel_float/conversion.h
Lines changed: 14 additions & 7 deletions
diff --git a/include/kernel_float/fp16.h b/include/kernel_float/fp16.h
Lines changed: 37 additions & 23 deletions
@@ -94,7 +94,7 @@ struct apply_impl<F, N, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16> {
 };
 
 template<typename F, size_t N>
-struct reduce_helper<F, N, __nv_bfloat16, enabled_t<(N >= 2)>> {
+struct reduce_impl<F, N, __nv_bfloat16, enable_if_t<(N >= 2)>> {
     KERNEL_FLOAT_INLINE static __nv_bfloat16
     call(F fun, const vector_storage<__nv_bfloat16, N>& input) {
         __nv_bfloat162 accum = {input.data()[0], input.data()[1]};
@@ -276,38 +276,54 @@ using bfloat16 = __nv_bfloat16;
 
 #if KERNEL_FLOAT_IS_DEVICE
 namespace detail {
+template<>
+struct dot_impl<__nv_bfloat16, 0> {  // dot product of two zero-length bfloat16 vectors
+    KERNEL_FLOAT_INLINE
+    static __nv_bfloat16 call(
+        const vector_storage<__nv_bfloat16, 0>& left,  // not read: there are no elements
+        const vector_storage<__nv_bfloat16, 0>& right) {  // not read either
+        return __nv_bfloat16(0);  // empty sum, so the result is zero
+    }
+};
+
+template<>
+struct dot_impl<__nv_bfloat16, 1> {  // dot product of two single-element bfloat16 vectors
+    KERNEL_FLOAT_INLINE
+    static __nv_bfloat16 call(
+        const vector_storage<__nv_bfloat16, 1>& left,
+        const vector_storage<__nv_bfloat16, 1>& right) {
+        return __hmul(left.data()[0], right.data()[0]);  // a single product, nothing to accumulate
+    }
+};
+
 template<size_t N>
-struct dot_helper<__nv_bfloat16, N> {
+struct dot_impl<__nv_bfloat16, N> {  // bfloat16 dot product for N >= 2, consuming two elements per step
+    static_assert(N >= 2, "internal error");  // N == 0 and N == 1 have their own explicit specializations
+
     KERNEL_FLOAT_INLINE
     static __nv_bfloat16 call(
         const vector_storage<__nv_bfloat16, N>& left,
         const vector_storage<__nv_bfloat16, N>& right) {
-        if (N == 0) {
-            return __nv_bfloat16(0);
-        } else if (N == 1) {
-            return __hmul(left.data()[0], right.data()[0]);
-        } else {
-            __nv_bfloat162 first_a = {left.data()[0], left.data()[1]};
-            __nv_bfloat162 first_b = {right.data()[0], right.data()[1]};
-            __nv_bfloat162 accum = __hmul2(first_a, first_b);
+        __nv_bfloat162 first_a = {left.data()[0], left.data()[1]};  // pack elements 0 and 1 of the left operand
+        __nv_bfloat162 first_b = {right.data()[0], right.data()[1]};  // pack elements 0 and 1 of the right operand
+        __nv_bfloat162 accum = __hmul2(first_a, first_b);  // seed the two-lane accumulator with the first pair of products
 
 #pragma unroll
-            for (size_t i = 2; i + 2 <= N; i += 2) {
-                __nv_bfloat162 a = {left.data()[i], left.data()[i + 1]};
-                __nv_bfloat162 b = {right.data()[i], right.data()[i + 1]};
-                accum = __hfma2(a, b, accum);
-            }
-
-            __nv_bfloat16 result = __hadd(accum.x, accum.y);
+        for (size_t i = 2; i + 2 <= N; i += 2) {  // remaining complete pairs; an odd last element is handled below
+            __nv_bfloat162 a = {left.data()[i], left.data()[i + 1]};
+            __nv_bfloat162 b = {right.data()[i], right.data()[i + 1]};
+            accum = __hfma2(a, b, accum);  // accum = a * b + accum, applied to both lanes
+        }
 
-            if (N % 2 != 0) {
-                __nv_bfloat16 a = left.data()[N - 1];
-                __nv_bfloat16 b = right.data()[N - 1];
-                result = __hfma(a, b, result);
-            }
+        __nv_bfloat16 result = __hadd(accum.x, accum.y);  // combine the two accumulator lanes into one scalar
 
-            return result;
+        if (N % 2 != 0) {  // odd N: element N - 1 was not covered by any pair
+            __nv_bfloat16 a = left.data()[N - 1];
+            __nv_bfloat16 b = right.data()[N - 1];
+            result = __hfma(a, b, result);  // fold the last product into the scalar result
         }
+
+        return result;
     }
 };
 }  // namespace detail
 
@@ -49,9 +49,9 @@ using zip_common_type = vector<
  * Example
  * =======
  * ```
- * vec<int, 3> a = {1.0f, 2.0f, 3.0f};
+ * vec<float, 3> a = {1.0f, 2.0f, 3.0f};
  * vec<int, 3> b = {4, 5, 6};
- * vec<int, 3> c = zip_common([](float x, float y){ return x + y; }, a, b); // returns [5.0f, 7.0f, 9.0f]
+ * vec<float, 3> c = zip_common([](float x, float y){ return x + y; }, a, b); // returns [5.0f, 7.0f, 9.0f]
  * ```
  */
 template<typename F, typename L, typename R>
@@ -62,9 +62,9 @@ KERNEL_FLOAT_INLINE zip_common_type<F, L, R> zip_common(F fun, const L& left, co
 
     return detail::apply_impl<F, E::value, O, T, T>::call(
         fun,
-        detail::convert_helper<vector_value_type<L>, vector_extent_type<L>, T, E>::call(
+        detail::convert_impl<vector_value_type<L>, vector_extent_type<L>, T, E>::call(
             into_vector_storage(left)),
-        detail::convert_helper<vector_value_type<R>, vector_extent_type<R>, T, E>::call(
+        detail::convert_impl<vector_value_type<R>, vector_extent_type<R>, T, E>::call(
             into_vector_storage(right)));
 }
 
@@ -139,7 +139,7 @@ static constexpr bool is_vector_assign_allowed =
         typename T,                                                                  \
         typename E,                                                                  \
         typename R,                                                                  \
-        typename = enabled_t<is_vector_assign_allowed<ops::NAME, T, E, R>>>          \
+        typename = enable_if_t<is_vector_assign_allowed<ops::NAME, T, E, R>>>        \
     KERNEL_FLOAT_INLINE vector<T, E>& operator OP(vector<T, E>& lhs, const R& rhs) { \
         using F = ops::NAME<T>;                                                      \
         lhs = zip_common(F {}, lhs, rhs);                                            \
@@ -249,7 +249,7 @@ struct bit_xor<double> {
 
 namespace detail {
 template<typename T>
-struct cross_helper {
+struct cross_impl {
     KERNEL_FLOAT_INLINE
     static vector<T, extent<3>>
     call(const vector_storage<T, 3>& av, const vector_storage<T, 3>& bv) {
@@ -275,9 +275,9 @@ template<
     typename R,
     typename T = promoted_vector_value_type<L, R>,
     typename =
-        enabled_t<is_vector_broadcastable<L, extent<3>> && is_vector_broadcastable<R, extent<3>>>>
+        enable_if_t<is_vector_broadcastable<L, extent<3>> && is_vector_broadcastable<R, extent<3>>>>
 KERNEL_FLOAT_INLINE vector<T, extent<3>> cross(const L& left, const R& right) {
-    return detail::cross_helper<T>::call(convert_storage<T, 3>(left), convert_storage<T, 3>(right));
+    return detail::cross_impl<T>::call(convert_storage<T, 3>(left), convert_storage<T, 3>(right));
 }
 
 }  // namespace kernel_float
 
@@ -99,7 +99,7 @@ template<typename... Vs>
 using broadcast_vector_extent_type = broadcast_extent<vector_extent_type<Vs>...>;
 
 template<typename From, typename To>
-static constexpr bool is_same<broadcast_extent<From, To>, To>;
+static constexpr bool is_broadcastable = is_same_type<broadcast_extent<From, To>, To>;  // true when broadcast_extent<From, To> is exactly To
 
 template<typename V, typename To>
 static constexpr bool is_vector_broadcastable = is_broadcastable<vector_extent_type<V>, To>;
@@ -169,8 +169,12 @@ broadcast_like(const V& input, const R& other) {
 }
 
 namespace detail {
+/**
+ * Convert a vector of element type `T` and extent type `E` to a vector of element type `T2` and extent type `E2`.
+ * Specializations exist for the cases where `T == T2` and/or `E == E2`.
+ */
 template<typename T, typename E, typename T2, typename E2, RoundingMode M = RoundingMode::ANY>
-struct convert_helper {
+struct convert_impl {
     KERNEL_FLOAT_INLINE
     static vector_storage<T2, E2::value> call(vector_storage<T, E::value> input) {
         using F = ops::cast<T, T2, M>;
@@ -180,24 +184,27 @@ struct convert_helper {
     }
 };
 
+// T == T2, E == E2: element type and extent already match, so the input storage is returned unchanged.
 template<typename T, typename E, RoundingMode M>
-struct convert_helper<T, E, T, E, M> {
+struct convert_impl<T, E, T, E, M> {
     KERNEL_FLOAT_INLINE
     static vector_storage<T, E::value> call(vector_storage<T, E::value> input) {
         return input;
     }
 };
 
+// T == T2, E != E2: only the extent differs, so the input is passed through broadcast_impl<T, E, E2> with no cast.
 template<typename T, typename E, typename E2, RoundingMode M>
-struct convert_helper<T, E, T, E2, M> {
+struct convert_impl<T, E, T, E2, M> {
     KERNEL_FLOAT_INLINE
     static vector_storage<T, E2::value> call(vector_storage<T, E::value> input) {
         return detail::broadcast_impl<T, E, E2>::call(input);
     }
 };
 
+// T != T2, E == E2
 template<typename T, typename E, typename T2, RoundingMode M>
-struct convert_helper<T, E, T2, E, M> {
+struct convert_impl<T, E, T2, E, M> {
     KERNEL_FLOAT_INLINE
     static vector_storage<T2, E::value> call(vector_storage<T, E::value> input) {
         using F = ops::cast<T, T2, M>;
@@ -208,8 +215,8 @@ struct convert_helper<T, E, T2, E, M> {
 
 template<typename R, size_t N, RoundingMode M = RoundingMode::ANY, typename V>
 KERNEL_FLOAT_INLINE vector_storage<R, N> convert_storage(const V& input, extent<N> new_size = {}) {
-    return detail::convert_helper<vector_value_type<V>, vector_extent_type<V>, R, extent<N>, M>::
-        call(into_vector_storage(input));
+    return detail::convert_impl<vector_value_type<V>, vector_extent_type<V>, R, extent<N>, M>::call(  // convert to element type R and extent<N>
+        into_vector_storage(input));  // `new_size` is not read in this body
 }
 
 /**
 
@@ -90,7 +90,7 @@ struct apply_impl<F, N, __half, __half, __half> {
 };
 
 template<typename F, size_t N>
-struct reduce_helper<F, N, __half, enabled_t<(N >= 2)>> {
+struct reduce_impl<F, N, __half, enable_if_t<(N >= 2)>> {
     KERNEL_FLOAT_INLINE static __half call(F fun, const vector_storage<__half, N>& input) {
         __half2 accum = {input.data()[0], input.data()[1]};
 
@@ -256,37 +256,51 @@ using half = __half;
 
 #if KERNEL_FLOAT_IS_DEVICE
 namespace detail {
+template<>
+struct dot_impl<__half, 0> {  // dot product of two zero-length half vectors
+    KERNEL_FLOAT_INLINE
+    static __half
+    call(const vector_storage<__half, 0>& left, const vector_storage<__half, 0>& right) {  // neither argument is read
+        return __half(0);  // empty sum, so the result is zero
+    }
+};
+
+template<>
+struct dot_impl<__half, 1> {  // dot product of two single-element half vectors
+    KERNEL_FLOAT_INLINE
+    static __half
+    call(const vector_storage<__half, 1>& left, const vector_storage<__half, 1>& right) {
+        return __hmul(left.data()[0], right.data()[0]);  // a single product, nothing to accumulate
+    }
+};
+
 template<size_t N>
-struct dot_helper<__half, N> {
+struct dot_impl<__half, N> {  // half-precision dot product for N >= 2, consuming two elements per step
+    static_assert(N >= 2, "internal error");  // N == 0 and N == 1 have their own explicit specializations
+
     KERNEL_FLOAT_INLINE
     static __half
     call(const vector_storage<__half, N>& left, const vector_storage<__half, N>& right) {
-        if (N == 0) {
-            return __half(0);
-        } else if (N == 1) {
-            return __hmul(left.data()[0], right.data()[0]);
-        } else {
-            __half2 first_a = {left.data()[0], left.data()[1]};
-            __half2 first_b = {right.data()[0], right.data()[1]};
-            __half2 accum = __hmul2(first_a, first_b);
+        __half2 first_a = {left.data()[0], left.data()[1]};  // pack elements 0 and 1 of the left operand
+        __half2 first_b = {right.data()[0], right.data()[1]};  // pack elements 0 and 1 of the right operand
+        __half2 accum = __hmul2(first_a, first_b);  // seed the two-lane accumulator with the first pair of products
 
 #pragma unroll
-            for (size_t i = 2; i + 2 <= N; i += 2) {
-                __half2 a = {left.data()[i], left.data()[i + 1]};
-                __half2 b = {right.data()[i], right.data()[i + 1]};
-                accum = __hfma2(a, b, accum);
-            }
-
-            __half result = __hadd(accum.x, accum.y);
+        for (size_t i = 2; i + 2 <= N; i += 2) {  // remaining complete pairs; an odd last element is handled below
+            __half2 a = {left.data()[i], left.data()[i + 1]};
+            __half2 b = {right.data()[i], right.data()[i + 1]};
+            accum = __hfma2(a, b, accum);  // accum = a * b + accum, applied to both lanes
+        }
 
-            if (N % 2 != 0) {
-                __half a = left.data()[N - 1];
-                __half b = right.data()[N - 1];
-                result = __hfma(a, b, result);
-            }
+        __half result = __hadd(accum.x, accum.y);  // combine the two accumulator lanes into one scalar
 
-            return result;
+        if (N % 2 != 0) {  // odd N: element N - 1 was not covered by any pair
+            __half a = left.data()[N - 1];
+            __half b = right.data()[N - 1];
+            result = __hfma(a, b, result);  // fold the last product into the scalar result
         }
+
+        return result;
     }
 };
 }  // namespace detail