KernelTuner
diff --git a/include/kernel_float.h b/include/kernel_float.h
Lines changed: 1 addition & 0 deletions
diff --git a/include/kernel_float/bf16.h b/include/kernel_float/bf16.h
Lines changed: 68 additions & 88 deletions
diff --git a/include/kernel_float/binops.h b/include/kernel_float/binops.h
Lines changed: 63 additions & 52 deletions
@@ -3,6 +3,7 @@
 
 #include "kernel_float/bf16.h"
 #include "kernel_float/binops.h"
+#include "kernel_float/cast.h"
 #include "kernel_float/fp16.h"
 #include "kernel_float/fp8.h"
 #include "kernel_float/interface.h"
 
@@ -6,94 +6,95 @@
 #if KERNEL_FLOAT_BF16_AVAILABLE
 #include <cuda_bf16.h>
 
+#include "binops.h"
+#include "cast.h"
 #include "interface.h"
+#include "storage.h"
+#include "unops.h"
 
 namespace kernel_float {
+KERNEL_FLOAT_DEFINE_COMMON_TYPE(__nv_bfloat16, bool)
 KERNEL_FLOAT_DEFINE_COMMON_TYPE(float, __nv_bfloat16)
 KERNEL_FLOAT_DEFINE_COMMON_TYPE(double, __nv_bfloat16)
 
-struct vector_bfloat16x2 {
-    static_assert(sizeof(__nv_bfloat16) * 2 == sizeof(__nv_bfloat162), "invalid size");
-    static_assert(alignof(__nv_bfloat16) <= alignof(__nv_bfloat162), "invalid alignment");
-
-    KERNEL_FLOAT_INLINE vector_bfloat16x2(__nv_bfloat16 v = {}) noexcept : vector_ {v, v} {}
-    KERNEL_FLOAT_INLINE vector_bfloat16x2(__nv_bfloat16 x, __nv_bfloat16 y) noexcept :
-        vector_ {x, y} {}
-    KERNEL_FLOAT_INLINE vector_bfloat16x2(__nv_bfloat162 xy) noexcept : vector_ {xy} {}
-
-    KERNEL_FLOAT_INLINE operator __nv_bfloat162() const noexcept {
-        return vector_;
-    }
-
-    KERNEL_FLOAT_INLINE __nv_bfloat16 get(const_index<0>) const {
-        return vector_.x;
-    }
-
-    KERNEL_FLOAT_INLINE __nv_bfloat16 get(const_index<1>) const {
-        return vector_.y;
-    }
+template<>  // Traits describing the native __nv_bfloat162 type as a vector of two __nv_bfloat16 values.
+struct vector_traits<__nv_bfloat162> {
+    using value_type = __nv_bfloat16;  // type of a single element
+    static constexpr size_t size = 2;  // number of elements held by one __nv_bfloat162
 
-    KERNEL_FLOAT_INLINE void set(const_index<0>, __nv_bfloat16 v) {
-        *this = vector_bfloat16x2(v, get(const_index<1> {}));
+    KERNEL_FLOAT_INLINE  // fill: returns a pair whose two elements are both `value`.
+    static __nv_bfloat162 fill(__nv_bfloat16 value) {
+#if KERNEL_FLOAT_ON_DEVICE  // use the cuda_bf16 intrinsic when KERNEL_FLOAT_ON_DEVICE is set
+        return __bfloat162bfloat162(value);
+#else  // otherwise brace-initialize the pair directly
+        return {value, value};
+#endif
     }
 
-    KERNEL_FLOAT_INLINE void set(const_index<1>, __nv_bfloat16 v) {
-        *this = vector_bfloat16x2(get(const_index<0> {}), v);
+    KERNEL_FLOAT_INLINE  // create: builds a pair from two scalars, `low` first and `high` second.
+    static __nv_bfloat162 create(__nv_bfloat16 low, __nv_bfloat16 high) {
+#if KERNEL_FLOAT_ON_DEVICE  // intrinsic path, same argument order as the fallback below
+        return __halves2bfloat162(low, high);
+#else
+        return {low, high};
+#endif
     }
 
-    KERNEL_FLOAT_INLINE __nv_bfloat16 get(size_t index) const {
+    KERNEL_FLOAT_INLINE  // get: returns the first element for index == 0, the second for any other index.
+    static __nv_bfloat16 get(__nv_bfloat162 self, size_t index) {
+#if KERNEL_FLOAT_ON_DEVICE  // intrinsic path: low half for index 0, high half otherwise
         if (index == 0) {
-            return get(const_index<0> {});
+            return __low2bfloat16(self);
         } else {
-            return get(const_index<1> {});
+            return __high2bfloat16(self);
         }
+#else  // fallback path reads the .x / .y members directly
+        if (index == 0) {
+            return self.x;
+        } else {
+            return self.y;
+        }
+#endif
     }
 
-    KERNEL_FLOAT_INLINE void set(size_t index, __nv_bfloat16 value) const {
+    KERNEL_FLOAT_INLINE  // set: writes `value` into .x when index == 0, otherwise into .y (no bounds check).
+    static void set(__nv_bfloat162& self, size_t index, __nv_bfloat16 value) {
         if (index == 0) {
-            set(const_index<0> {}, value);
+            self.x = value;  // same member access regardless of KERNEL_FLOAT_ON_DEVICE
         } else {
-            set(const_index<1> {}, value);
+            self.y = value;
         }
     }
-
-  private:
-    __nv_bfloat162 vector_;
 };
 
-template<>
-struct vector_traits<vector_bfloat16x2>:
-    default_vector_traits<vector_bfloat16x2, __nv_bfloat16, 2> {};
-
-template<>
-struct vector_traits<__nv_bfloat16>: vector_traits<vector_scalar<__nv_bfloat16>> {};
-
-template<>
-struct vector_traits<__nv_bfloat162>: vector_traits<vector_bfloat16x2> {};
+template<size_t N>  // Alignment::Maximum with N >= 2: bfloat16 vectors are stored as nested_array of __nv_bfloat162.
+struct default_storage<__nv_bfloat16, N, Alignment::Maximum, enabled_t<(N >= 2)>> {
+    using type = nested_array<__nv_bfloat162, N>;  // NOTE(review): odd N is accepted here; how nested_array handles the tail is not visible — confirm
+};
 
-template<>
-struct default_vector_storage<__nv_bfloat16, 2> {
-    using type = vector_bfloat16x2;
+template<size_t N>  // Alignment::Packed: same storage type, but selected only when N is even and at least 2.
+struct default_storage<__nv_bfloat16, N, Alignment::Packed, enabled_t<(N >= 2 && N % 2 == 0)>> {
+    using type = nested_array<__nv_bfloat162, N>;  // N is the second nested_array argument; presumably the element count — confirm
 };
 
 #if KERNEL_FLOAT_ON_DEVICE
-#define KERNEL_FLOAT_BF16_UNARY_FUN(NAME, FUN1, FUN2)                                   \
-    namespace ops {                                                                     \
-    template<>                                                                          \
-    struct NAME<__nv_bfloat16> {                                                        \
-        KERNEL_FLOAT_INLINE __nv_bfloat16 operator()(__nv_bfloat16 input) {             \
-            return FUN1(input);                                                         \
-        }                                                                               \
-    };                                                                                  \
-    }                                                                                   \
-    namespace detail {                                                                  \
-    template<>                                                                          \
-    struct map_helper<ops::NAME<__nv_bfloat16>, vector_bfloat16x2, vector_bfloat16x2> { \
-        KERNEL_FLOAT_INLINE static __nv_bfloat162                                       \
-        call(ops::NAME<__nv_bfloat16>, const __nv_bfloat162& input) {                   \
-            return FUN2(input);                                                         \
-        }                                                                               \
-    };                                                                                  \
+#define KERNEL_FLOAT_BF16_UNARY_FUN(NAME, FUN1, FUN2)                             \
+    namespace ops {                                                               \
+    template<>                                                                    \
+    struct NAME<__nv_bfloat16> {                                                  \
+        KERNEL_FLOAT_INLINE __nv_bfloat16 operator()(__nv_bfloat16 input) {       \
+            return FUN1(input);                                                   \
+        }                                                                         \
+    };                                                                            \
+    }                                                                             \
+    namespace detail {                                                            \
+    template<>                                                                    \
+    struct map_helper<ops::NAME<__nv_bfloat16>, __nv_bfloat162, __nv_bfloat162> { \
+        KERNEL_FLOAT_INLINE static __nv_bfloat162                                 \
+        call(ops::NAME<__nv_bfloat16>, __nv_bfloat162 input) {                    \
+            return FUN2(input);                                                   \
+        }                                                                         \
+    };                                                                            \
     }
 
 KERNEL_FLOAT_BF16_UNARY_FUN(abs, ::__habs, ::__habs2);
@@ -123,13 +124,9 @@ KERNEL_FLOAT_BF16_UNARY_FUN(trunc, ::htrunc, ::h2trunc);
     }                                                                                             \
     namespace detail {                                                                            \
     template<>                                                                                    \
-    struct zip_helper<                                                                            \
-        ops::NAME<__nv_bfloat16>,                                                                 \
-        vector_bfloat16x2,                                                                        \
-        vector_bfloat16x2,                                                                        \
-        vector_bfloat16x2> {                                                                      \
+    struct zip_helper<ops::NAME<__nv_bfloat16>, __nv_bfloat162, __nv_bfloat162, __nv_bfloat162> { \
         KERNEL_FLOAT_INLINE static __nv_bfloat162                                                 \
-        call(ops::NAME<__nv_bfloat16>, const __nv_bfloat162& left, const __nv_bfloat162& right) { \
+        call(ops::NAME<__nv_bfloat16>, __nv_bfloat162 left, __nv_bfloat162 right) {               \
             return FUN2(left, right);                                                             \
         }                                                                                         \
     };                                                                                            \
@@ -197,27 +194,10 @@ KERNEL_FLOAT_BF16_CAST(
     (unsigned long)(__bfloat162ull_rz(input)));
 KERNEL_FLOAT_BF16_CAST(unsigned long long, __ull2bfloat16_rn(input), __bfloat162ull_rz(input));
 
-namespace detail {
-template<>
-struct map_helper<ops::cast<__nv_bfloat16, float>, vector_storage<float, 2>, vector_bfloat16x2> {
-    KERNEL_FLOAT_INLINE static vector_storage<float, 2>
-    call(ops::cast<__nv_bfloat16, float>, __nv_bfloat162 input) noexcept {
-        return __bfloat1622float2(input);
-    }
-};
-
-template<>
-struct map_helper<ops::cast<float, __nv_bfloat16>, vector_bfloat16x2, vector_storage<float, 2>> {
-    KERNEL_FLOAT_INLINE static vector_bfloat16x2
-    call(ops::cast<float, __nv_bfloat16>, const vector_storage<float, 2>& input) noexcept {
-        return __float22bfloat162_rn(input);
-    }
-};
-}  // namespace detail
-
 using bfloat16 = __nv_bfloat16;
-KERNEL_FLOAT_TYPE_ALIAS(bf16x, __nv_bfloat16)
-KERNEL_FLOAT_TYPE_ALIAS(bfloat16x, __nv_bfloat16)
+// NOTE(review): the bf16x/bfloat16x aliases are disabled here; re-enable or remove — confirm intent.
+//KERNEL_FLOAT_TYPE_ALIAS(bf16x, __nv_bfloat16)
+//KERNEL_FLOAT_TYPE_ALIAS(bfloat16x, __nv_bfloat16)
 
 }  // namespace kernel_float
 
 
@@ -15,28 +15,25 @@ struct zip_helper {
     template<size_t... Is>
     KERNEL_FLOAT_INLINE static Output
     call_with_indices(F fun, const Left& left, const Right& right, index_sequence<Is...> = {}) {
-        return Output {fun(left.get(const_index<Is> {}), right.get(const_index<Is> {}))...};
+        return vector_traits<Output>::create(fun(vector_get<Is>(left), vector_get<Is>(right))...);
     }
 };
 
-template<typename F, typename T, typename L, typename R, size_t N>
-struct zip_helper<F, vector_compound<T, N>, vector_compound<L, N>, vector_compound<R, N>> {
-    KERNEL_FLOAT_INLINE static vector_compound<T, N>
-    call(F fun, const vector_compound<L, N>& left, const vector_compound<R, N>& right) {
-        static constexpr size_t low_size = vector_compound<T, N>::low_size;
-        static constexpr size_t high_size = vector_compound<T, N>::high_size;
-
-        return {
-            zip_helper<
-                F,
-                vector_storage<T, low_size>,
-                vector_storage<L, low_size>,
-                vector_storage<R, low_size>>::call(fun, left.low(), right.low()),
-            zip_helper<
-                F,
-                vector_storage<T, high_size>,
-                vector_storage<L, high_size>,
-                vector_storage<R, high_size>>::call(fun, left.high(), right.high())};
+template<typename F, typename V, size_t N>  // Specialization used when output, left and right are all the same nested_array<V, N>.
+struct zip_helper<F, nested_array<V, N>, nested_array<V, N>, nested_array<V, N>> {
+    KERNEL_FLOAT_INLINE static nested_array<V, N>  // Entry point: applies `fun` to `left` and `right` one packet (V) at a time.
+    call(F fun, const nested_array<V, N>& left, const nested_array<V, N>& right) {
+        return call(fun, left, right, make_index_sequence<nested_array<V, N>::num_packets> {});  // one index per packet
+    }
+
+  private:
+    template<size_t... Is>
+    KERNEL_FLOAT_INLINE static nested_array<V, N> call(  // Overload that receives the packet indices as a pack.
+        F fun,
+        const nested_array<V, N>& left,
+        const nested_array<V, N>& right,
+        index_sequence<Is...>) {
+        return {zip_helper<F, V, V, V>::call(fun, left[Is], right[Is])...};  // delegates each packet to the V-level zip_helper
     }
 };
 };  // namespace detail
@@ -48,7 +45,7 @@ template<typename... Ts>
 static constexpr size_t common_vector_size = common_size<vector_size<Ts>...>;
 
 template<typename F, typename L, typename R>
-using zip_type = vector_storage<
+using zip_type = default_storage_type<  // storage for F's result type over the value types of L and R, sized by common_vector_size
     result_t<F, vector_value_type<L>, vector_value_type<R>>,
     common_vector_size<L, R>>;
 
@@ -63,16 +60,19 @@ using zip_type = vector_storage<
  * ``zip_common`` for that functionality.
  */
 template<typename F, typename Left, typename Right, typename Output = zip_type<F, Left, Right>>
-KERNEL_FLOAT_INLINE Output zip(F fun, Left&& left, Right&& right) {
+KERNEL_FLOAT_INLINE vector<Output> zip(F fun, Left&& left, Right&& right) {  // returns vector<Output>; Output is the storage type
     static constexpr size_t N = vector_size<Output>;
-    return detail::zip_helper<F, Output, into_vector_type<Left>, into_vector_type<Right>>::call(
+    using LeftInput = default_storage_type<vector_value_type<Left>, N>;  // left operand keeps its own value type, stored at size N
+    using RightInput = default_storage_type<vector_value_type<Right>, N>;  // right operand likewise; no common-type conversion here
+
+    return detail::zip_helper<F, Output, LeftInput, RightInput>::call(  // both operands are broadcast to their N-sized storage first
         fun,
-        broadcast<N>(std::forward<Left>(left)),
-        broadcast<N>(std::forward<Right>(right)));
+        broadcast<LeftInput, Left>(std::forward<Left>(left)),
+        broadcast<RightInput, Right>(std::forward<Right>(right)));
 }
 
 template<typename F, typename L, typename R>
-using zip_common_type = vector_storage<
+using zip_common_type = default_storage_type<  // storage for F's result when both arguments use the common value type of L and R
     result_t<F, common_vector_value_type<L, R>, common_vector_value_type<L, R>>,
     common_vector_size<L, R>>;
 
@@ -99,38 +99,50 @@ template<
     typename Left,
     typename Right,
     typename Output = zip_common_type<F, Left, Right>>
-KERNEL_FLOAT_INLINE Output zip_common(F fun, Left&& left, Right&& right) {
+KERNEL_FLOAT_INLINE vector<Output> zip_common(F fun, Left&& left, Right&& right) {
     static constexpr size_t N = vector_size<Output>;
     using C = common_t<vector_value_type<Left>, vector_value_type<Right>>;
+    using Input = default_storage_type<C, N>;
 
-    return detail::zip_helper<F, Output, vector_storage<C, N>, vector_storage<C, N>>::call(
+    return detail::zip_helper<F, Output, Input, Input>::call(
         fun,
-        broadcast<C, N>(std::forward<Left>(left)),
-        broadcast<C, N>(std::forward<Right>(right)));
+        broadcast<Input, Left>(std::forward<Left>(left)),
+        broadcast<Input, Right>(std::forward<Right>(right)));
 }
 
-#define KERNEL_FLOAT_DEFINE_BINARY(NAME, EXPR)                                             \
-    namespace ops {                                                                        \
-    template<typename T>                                                                   \
-    struct NAME {                                                                          \
-        KERNEL_FLOAT_INLINE T operator()(T left, T right) {                                \
-            return T(EXPR);                                                                \
-        }                                                                                  \
-    };                                                                                     \
-    }                                                                                      \
-    template<typename L, typename R, typename C = common_vector_value_type<L, R>>          \
-    KERNEL_FLOAT_INLINE zip_common_type<ops::NAME<C>, L, R> NAME(L&& left, R&& right) {    \
-        return zip_common(ops::NAME<C> {}, std::forward<L>(left), std::forward<R>(right)); \
+#define KERNEL_FLOAT_DEFINE_BINARY(NAME, EXPR)                                                  \
+    namespace ops {                                                                             \
+    template<typename T>                                                                        \
+    struct NAME {                                                                               \
+        KERNEL_FLOAT_INLINE T operator()(T left, T right) {                                     \
+            return T(EXPR);                                                                     \
+        }                                                                                       \
+    };                                                                                          \
+    }                                                                                           \
+    template<typename L, typename R, typename C = common_vector_value_type<L, R>>               \
+    KERNEL_FLOAT_INLINE vector<zip_common_type<ops::NAME<C>, L, R>> NAME(L&& left, R&& right) { \
+        return zip_common(ops::NAME<C> {}, std::forward<L>(left), std::forward<R>(right));      \
     }
 
-#define KERNEL_FLOAT_DEFINE_BINARY_OP(NAME, OP)                                                \
-    KERNEL_FLOAT_DEFINE_BINARY(NAME, left OP right)                                            \
-    template<                                                                                  \
-        typename L,                                                                            \
-        typename R,                                                                            \
-        typename C = enabled_t<is_vector<L> || is_vector<R>, common_vector_value_type<L, R>>>  \
-    KERNEL_FLOAT_INLINE zip_common_type<ops::NAME<C>, L, R> operator OP(L&& left, R&& right) { \
-        return zip_common(ops::NAME<C> {}, std::forward<L>(left), std::forward<R>(right));     \
+#define KERNEL_FLOAT_DEFINE_BINARY_OP(NAME, OP)                                   \
+    KERNEL_FLOAT_DEFINE_BINARY(NAME, left OP right)                               \
+    template<typename L, typename R, typename C = common_vector_value_type<L, R>> \
+    KERNEL_FLOAT_INLINE vector<zip_common_type<ops::NAME<C>, L, R>> operator OP(  \
+        const vector<L>& left,                                                    \
+        const vector<R>& right) {                                                 \
+        return zip_common(ops::NAME<C> {}, left, right);                          \
+    }                                                                             \
+    template<typename L, typename R, typename C = common_vector_value_type<L, R>> \
+    KERNEL_FLOAT_INLINE vector<zip_common_type<ops::NAME<C>, L, R>> operator OP(  \
+        const vector<L>& left,                                                    \
+        const R& right) {                                                         \
+        return zip_common(ops::NAME<C> {}, left, right);                          \
+    }                                                                             \
+    template<typename L, typename R, typename C = common_vector_value_type<L, R>> \
+    KERNEL_FLOAT_INLINE vector<zip_common_type<ops::NAME<C>, L, R>> operator OP(  \
+        const L& left,                                                            \
+        const vector<R>& right) {                                                 \
+        return zip_common(ops::NAME<C> {}, left, right);                          \
     }
 
 KERNEL_FLOAT_DEFINE_BINARY_OP(add, +)
@@ -153,7 +165,6 @@ KERNEL_FLOAT_DEFINE_BINARY_OP(bit_xor, ^)
 // clang-format off
 template<template<typename T> typename F, typename L, typename R>
 static constexpr bool vector_assign_allowed =
-    is_vector<L> &&
     common_vector_size<L, R> == vector_size<L> &&
     is_implicit_convertible<
         result_t<
@@ -170,9 +181,9 @@ static constexpr bool vector_assign_allowed =
         typename L,                                                                           \
         typename R,                                                                           \
         typename T = enabled_t<vector_assign_allowed<ops::NAME, L, R>, vector_value_type<L>>> \
-    KERNEL_FLOAT_INLINE L& operator OP(L& lhs, R&& rhs) {                                     \
+    KERNEL_FLOAT_INLINE vector<L>& operator OP(vector<L>& lhs, const R& rhs) {                \
         using F = ops::NAME<T>;                                                               \
-        lhs = zip_common<F, L&, R, into_vector_type<L>>(F {}, lhs, std::forward<R>(rhs));     \
+        lhs = zip_common<F, const L&, const R&, L>(F {}, lhs.storage(), rhs);                 \
         return lhs;                                                                           \
     }