@@ -49,66 +49,55 @@ struct zip_bfloat16x2 {
 
 template<typename F, size_t N>
 struct apply_impl<F, N, __nv_bfloat16, __nv_bfloat16> {
-    KERNEL_FLOAT_INLINE static vector_storage<__nv_bfloat16, N>
-    call(F fun, const vector_storage<__nv_bfloat16, N>& input) {
-        vector_storage<__nv_bfloat16, N> result;
-
+    KERNEL_FLOAT_INLINE static void call(F fun, __nv_bfloat16* result, const __nv_bfloat16* input) {
 #pragma unroll
-        for (size_t i = 0; i + 2 <= N; i += 2) {
-            __nv_bfloat162 a = {input.data()[i], input.data()[i + 1]};
+        for (size_t i = 0; 2 * i + 1 < N; i++) {
+            __nv_bfloat162 a = {input[2 * i], input[2 * i + 1]};
             __nv_bfloat162 b = map_bfloat16x2<F>::call(fun, a);
-            result.data()[i + 0] = b.x;
-            result.data()[i + 1] = b.y;
+            result[2 * i + 0] = b.x;
+            result[2 * i + 1] = b.y;
         }
 
         if (N % 2 != 0) {
-            result.data()[N - 1] = fun(input.data()[N - 1]);
+            result[N - 1] = fun(input[N - 1]);
         }
-
-        return result;
     }
 };
 
 template<typename F, size_t N>
 struct apply_impl<F, N, __nv_bfloat16, __nv_bfloat16, __nv_bfloat16> {
-    KERNEL_FLOAT_INLINE static vector_storage<__nv_bfloat16, N> call(
-        F fun,
-        const vector_storage<__nv_bfloat16, N>& left,
-        const vector_storage<__nv_bfloat16, N>& right) {
-        vector_storage<__nv_bfloat16, N> result;
+    KERNEL_FLOAT_INLINE static void
+    call(F fun, __nv_bfloat16* result, const __nv_bfloat16* left, const __nv_bfloat16* right) {
 #pragma unroll
-        for (size_t i = 0; i + 2 <= N; i += 2) {
-            __nv_bfloat162 a = {left.data()[i], left.data()[i + 1]};
-            __nv_bfloat162 b = {right.data()[i], right.data()[i + 1]};
+        for (size_t i = 0; 2 * i + 1 < N; i++) {
+            __nv_bfloat162 a = {left[2 * i], left[2 * i + 1]};
+            __nv_bfloat162 b = {right[2 * i], right[2 * i + 1]};
             __nv_bfloat162 c = zip_bfloat16x2<F>::call(fun, a, b);
-            result.data()[i + 0] = c.x;
-            result.data()[i + 1] = c.y;
+            result[2 * i + 0] = c.x;
+            result[2 * i + 1] = c.y;
         }
 
         if (N % 2 != 0) {
-            result.data()[N - 1] = fun(left.data()[N - 1], right.data()[N - 1]);
+            result[N - 1] = fun(left[N - 1], right[N - 1]);
         }
-
-        return result;
     }
 };
9585
9686template <typename F, size_t N>
9787struct reduce_impl <F, N, __nv_bfloat16, enable_if_t <(N >= 2 )>> {
98- KERNEL_FLOAT_INLINE static __nv_bfloat16
99- call (F fun, const vector_storage<__nv_bfloat16, N>& input) {
100- __nv_bfloat162 accum = {input.data ()[0 ], input.data ()[1 ]};
88+ KERNEL_FLOAT_INLINE static __nv_bfloat16 call (F fun, const __nv_bfloat16* input) {
89+ __nv_bfloat162 accum = {input[0 ], input[1 ]};
10190
10291#pragma unroll
103- for (size_t i = 2 ; i + 2 <= N; i += 2 ) {
104- __nv_bfloat162 a = {input. data ()[ i], input. data ()[ i + 1 ]};
+        for (size_t i = 1; 2 * i + 1 < N; i++) {
+            __nv_bfloat162 a = {input[2 * i], input[2 * i + 1]};
             accum = zip_bfloat16x2<F>::call(fun, accum, a);
         }
 
         __nv_bfloat16 result = fun(accum.x, accum.y);
 
         if (N % 2 != 0) {
-            result = fun(result, input.data()[N - 1]);
+            result = fun(result, input[N - 1]);
         }
 
         return result;
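
For reference, the rewritten loops visit element pairs (2*i, 2*i + 1) while 2*i + 1 < N, so an odd N leaves exactly one trailing element for the scalar fallback. Note that the reduction loop must start at i = 1, since accum is already seeded with the first pair. A minimal stand-alone sketch of the indexing scheme (not part of the commit; plain float stands in for __nv_bfloat16):

// Sketch only: mirrors the pairing and odd-tail handling of the new loops.
#include <cstddef>
#include <cstdio>

template<std::size_t N, typename F>
void apply_pairwise(F fun, float* result, const float* input) {
    for (std::size_t i = 0; 2 * i + 1 < N; i++) {
        result[2 * i + 0] = fun(input[2 * i + 0]);  // pairs (0,1), (2,3), ...
        result[2 * i + 1] = fun(input[2 * i + 1]);
    }
    if (N % 2 != 0) {
        result[N - 1] = fun(input[N - 1]);  // lone trailing element when N is odd
    }
}

int main() {
    float in[5] = {1, 2, 3, 4, 5};
    float out[5];
    apply_pairwise<5>([](float x) { return 2.0f * x; }, out, in);
    for (float v : out) {
        std::printf("%g ", v);  // prints: 2 4 6 8 10
    }
    std::printf("\n");
}
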
@@ -126,6 +115,7 @@ struct reduce_impl<F, N, __nv_bfloat16, enable_if_t<(N >= 2)>> {
     }; \
     }
 
+// These operations are not implemented in bfloat16 precision, so they are forwarded to single precision
 KERNEL_FLOAT_BF16_UNARY_FORWARD(tan)
 KERNEL_FLOAT_BF16_UNARY_FORWARD(asin)
 KERNEL_FLOAT_BF16_UNARY_FORWARD(acos)
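
The body of KERNEL_FLOAT_BF16_UNARY_FORWARD lies outside this hunk, but the forwarding described by the new comment amounts to widening the bfloat16 value to float, calling the single-precision math function, and narrowing the result back. A hedged sketch of that pattern for tan (the function name is illustrative, not the macro's actual expansion):

// Assumed shape of the forwarding path; bf16_tan_forwarded is a made-up name.
#include <cuda_bf16.h>

__device__ __nv_bfloat16 bf16_tan_forwarded(__nv_bfloat16 x) {
    return __float2bfloat16(tanf(__bfloat162float(x)));  // widen, compute in float, narrow
}
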
@@ -243,32 +233,22 @@ KERNEL_FLOAT_BF16_BINARY_FUN(greater_equal, __hge, __hgt2)
 KERNEL_FLOAT_BF16_CAST(double, __double2bfloat16(input), double(__bfloat162float(input)));
 KERNEL_FLOAT_BF16_CAST(float, __float2bfloat16(input), __bfloat162float(input));
 
+// clang-format off
 // there are no official char casts. Instead, cast to int and then to char
 KERNEL_FLOAT_BF16_CAST(char, __int2bfloat16_rn(input), (char)__bfloat162int_rz(input));
-KERNEL_FLOAT_BF16_CAST(
-    signed char,
-    __int2bfloat16_rn(input),
-    (signed char)__bfloat162int_rz(input));
-KERNEL_FLOAT_BF16_CAST(
-    unsigned char,
-    __int2bfloat16_rn(input),
-    (unsigned char)__bfloat162int_rz(input));
+KERNEL_FLOAT_BF16_CAST(signed char, __int2bfloat16_rn(input), (signed char)__bfloat162int_rz(input));
+KERNEL_FLOAT_BF16_CAST(unsigned char, __int2bfloat16_rn(input), (unsigned char)__bfloat162int_rz(input));
 
 KERNEL_FLOAT_BF16_CAST(signed short, __bfloat162short_rz(input), __short2bfloat16_rn(input));
 KERNEL_FLOAT_BF16_CAST(signed int, __bfloat162int_rz(input), __int2bfloat16_rn(input));
-KERNEL_FLOAT_BF16_CAST(
-    signed long,
-    __ll2bfloat16_rn(input),
-    (signed long)(__bfloat162ll_rz(input)));
+KERNEL_FLOAT_BF16_CAST(signed long, __ll2bfloat16_rn(input), (signed long)(__bfloat162ll_rz(input)));
 KERNEL_FLOAT_BF16_CAST(signed long long, __ll2bfloat16_rn(input), __bfloat162ll_rz(input));
 
 KERNEL_FLOAT_BF16_CAST(unsigned short, __bfloat162ushort_rz(input), __ushort2bfloat16_rn(input));
 KERNEL_FLOAT_BF16_CAST(unsigned int, __bfloat162uint_rz(input), __uint2bfloat16_rn(input));
-KERNEL_FLOAT_BF16_CAST(
-    unsigned long,
-    __ull2bfloat16_rn(input),
-    (unsigned long)(__bfloat162ull_rz(input)));
+KERNEL_FLOAT_BF16_CAST(unsigned long, __ull2bfloat16_rn(input), (unsigned long)(__bfloat162ull_rz(input)));
 KERNEL_FLOAT_BF16_CAST(unsigned long long, __ull2bfloat16_rn(input), __bfloat162ull_rz(input));
+// clang-format on
 
 using bfloat16 = __nv_bfloat16;
 // KERNEL_FLOAT_TYPE_ALIAS(float16x, __nv_bfloat16)
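
As the comment in the cast block notes, cuda_bf16.h has no direct char conversions, so the char casts round-trip through the int intrinsics. A hedged stand-alone sketch of that round trip (helper names are illustrative, not library API):

// Assumed illustration of the "cast to int, then to char" workaround.
#include <cuda_bf16.h>

__device__ __nv_bfloat16 char_to_bf16(char c) {
    return __int2bfloat16_rn(static_cast<int>(c));   // widen to int, round-to-nearest into bf16
}

__device__ char bf16_to_char(__nv_bfloat16 x) {
    return static_cast<char>(__bfloat162int_rz(x));  // truncate toward zero, then narrow to char
}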