Commit 79449e3

Refactor functions/types in memory.h

1 parent b1d8f9c

File tree

6 files changed: +407 -248 lines

examples/vector_add/main.cu

Lines changed: 11 additions & 5 deletions

@@ -13,11 +13,13 @@ void cuda_check(cudaError_t code) {
 }
 
 template<int N>
-__global__ void my_kernel(int length, const khalf<N>* input, double constant, kfloat<N>* output) {
+__global__ void my_kernel(int length, const __half* input, double constant, float* output) {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (i * N < length) {
-        kf::cast_to(output[i]) = (input[i] * input[i]) * constant;
+        auto a = kf::read_aligned<N>(input + i * N);
+        auto b = (a * a) * constant;
+        kf::write_aligned(output + i * N, b);
     }
 }

@@ -35,8 +37,8 @@ void run_kernel(int n) {
     }
 
     // Allocate device memory
-    khalf<items_per_thread>* input_dev;
-    kfloat<items_per_thread>* output_dev;
+    __half* input_dev;
+    float* output_dev;
     cuda_check(cudaMalloc(&input_dev, sizeof(half) * n));
     cuda_check(cudaMalloc(&output_dev, sizeof(float) * n));

@@ -47,7 +49,11 @@ void run_kernel(int n) {
     int block_size = 256;
     int items_per_block = block_size * items_per_thread;
     int grid_size = (n + items_per_block - 1) / items_per_block;
-    my_kernel<items_per_thread><<<grid_size, block_size>>>(n, input_dev, constant, output_dev);
+    my_kernel<items_per_thread><<<grid_size, block_size>>>(
+        n,
+        kf::aligned_ptr(input_dev),
+        constant,
+        kf::aligned_ptr(output_dev));
 
     // Copy results back
     cuda_check(cudaMemcpy(output_dev, output_result.data(), sizeof(float) * n, cudaMemcpyDefault));
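
For readers unfamiliar with the new memory API, the pattern above can be summarized as a small self-contained sketch. It only restates the calls visible in this diff (kf::read_aligned<N>, kf::write_aligned, and kf::aligned_ptr at the launch site); the kernel_float.h include and the kf namespace alias are assumed from the example sources, and the kernel name and launch parameters are illustrative.

#include <cuda_fp16.h>
#include "kernel_float.h"     // assumed single-header include, as used by the examples
namespace kf = kernel_float;  // assumed alias matching the kf:: calls in the diff

// Each thread handles N consecutive elements starting at offset i * N.
template<int N>
__global__ void square_and_scale(int length, const __half* input, double constant, float* output) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i * N < length) {
        auto a = kf::read_aligned<N>(input + i * N);  // vectorized load of N half values
        auto b = (a * a) * constant;                  // element-wise square and scale
        kf::write_aligned(output + i * N, b);         // vectorized store of N float values
    }
}

// Host-side launch, mirroring the diff: the raw device pointers from cudaMalloc are
// wrapped in kf::aligned_ptr(...) at the call site.
// square_and_scale<4><<<grid_size, 256>>>(n, kf::aligned_ptr(input_dev), 2.0, kf::aligned_ptr(output_dev));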

examples/vector_add_tiling/main.cu

Lines changed: 2 additions & 2 deletions

@@ -27,9 +27,9 @@ __global__ void my_kernel(
     auto points = int(blockIdx.x * tiling.tile_size(0)) + tiling.local_points(0);
     auto mask = tiling.local_mask();
 
-    auto a = kf::load(input.get(), points, mask);
+    auto a = input.read(points, mask);
     auto b = (a * a) * constant;
-    kf::store(b, output.get(), points, mask);
+    output.write(points, b, mask);
 }
 
 template<int items_per_thread, int block_size = 256>

include/kernel_float/macros.h

Lines changed: 19 additions & 9 deletions

@@ -9,35 +9,35 @@
 #define KERNEL_FLOAT_IS_DEVICE (1)
 #define KERNEL_FLOAT_IS_HOST (0)
 #define KERNEL_FLOAT_CUDA_ARCH (__CUDA_ARCH__)
-#else
+#else // __CUDA_ARCH__
 #define KERNEL_FLOAT_INLINE __forceinline__ __host__
 #define KERNEL_FLOAT_IS_DEVICE (0)
 #define KERNEL_FLOAT_IS_HOST (1)
 #define KERNEL_FLOAT_CUDA_ARCH (0)
-#endif
-#else
+#endif // __CUDA_ARCH__
+#else // __CUDACC__
 #define KERNEL_FLOAT_INLINE inline
 #define KERNEL_FLOAT_CUDA (0)
 #define KERNEL_FLOAT_IS_HOST (1)
 #define KERNEL_FLOAT_IS_DEVICE (0)
 #define KERNEL_FLOAT_CUDA_ARCH (0)
-#endif
+#endif // __CUDACC__
 
 #ifndef KERNEL_FLOAT_FP16_AVAILABLE
 #define KERNEL_FLOAT_FP16_AVAILABLE (1)
-#endif
+#endif // KERNEL_FLOAT_FP16_AVAILABLE
 
 #ifndef KERNEL_FLOAT_BF16_AVAILABLE
 #define KERNEL_FLOAT_BF16_AVAILABLE (1)
-#endif
+#endif // KERNEL_FLOAT_BF16_AVAILABLE
 
 #ifndef KERNEL_FLOAT_FP8_AVAILABLE
 #ifdef __CUDACC_VER_MAJOR__
 #define KERNEL_FLOAT_FP8_AVAILABLE (__CUDACC_VER_MAJOR__ >= 12)
-#else
+#else // __CUDACC_VER_MAJOR__
 #define KERNEL_FLOAT_FP8_AVAILABLE (0)
-#endif
-#endif
+#endif // __CUDACC_VER_MAJOR__
+#endif // KERNEL_FLOAT_FP8_AVAILABLE
 
 #define KERNEL_FLOAT_ASSERT(expr) \
     do { \

@@ -49,4 +49,14 @@
 #define KERNEL_FLOAT_CONCAT(A, B) KERNEL_FLOAT_CONCAT_IMPL(A, B)
 #define KERNEL_FLOAT_CALL(F, ...) F(__VA_ARGS__)
 
+// TODO: check if this approach is supported across all compilers
+#if defined(__has_builtin) && __has_builtin(__builtin_assume_aligned) && 0
+#define KERNEL_FLOAT_ASSUME_ALIGNED(TYPE, PTR, ALIGNMENT) \
+    static_cast<TYPE*>(__builtin_assume_aligned(static_cast<TYPE*>(PTR), (ALIGNMENT)))
+#else
+#define KERNEL_FLOAT_ASSUME_ALIGNED(TYPE, PTR, ALIGNMENT) (PTR)
+#endif
+
+#define KERNEL_FLOAT_MAX_ALIGNMENT (32)
+
 #endif //KERNEL_FLOAT_MACROS_H
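
The new KERNEL_FLOAT_ASSUME_ALIGNED macro wraps the GCC/Clang builtin __builtin_assume_aligned; note that the trailing && 0 in the #if keeps the builtin path disabled, so until the TODO is resolved the macro expands to a plain (PTR). As a rough illustration of what the enabled path would provide, here is a small host-side C++ sketch using the builtin directly; the function and variable names are made up for the example, and the alignment of 32 matches KERNEL_FLOAT_MAX_ALIGNMENT.

// Illustrative only: __builtin_assume_aligned returns a copy of the pointer that the
// compiler may treat as 32-byte aligned, which can enable wider vectorized loads.
float sum8(const float* p) {
    const float* q = static_cast<const float*>(__builtin_assume_aligned(p, 32));
    float sum = 0.0f;
    for (int i = 0; i < 8; ++i) {
        sum += q[i];  // undefined behavior if p is not actually 32-byte aligned
    }
    return sum;
}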
