Add tiling object

stijnh · stijnh · commit 755a00eef4df · 2023-10-22T17:04:10.000+02:00
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -1 +1,2 @@
 add_subdirectory(vector_add)
+add_subdirectory(vector_add_tiling)
diff --git a/examples/vector_add/main.cu b/examples/vector_add/main.cu
@@ -17,7 +17,7 @@ __global__ void my_kernel(int length, const khalf<N>* input, double constant, kf
     int i = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (i * N < length) {
-        output[i] = kf::cast<float>((input[i] * input[i]) * constant);
+        kf::cast_to(output[i]) = (input[i] * input[i]) * constant;
     }
 }
 
diff --git a/examples/vector_add_tiling/CMakeLists.txt b/examples/vector_add_tiling/CMakeLists.txt
@@ -0,0 +1,12 @@
+cmake_minimum_required(VERSION 3.17)
+
+# Build script for the tiling example (main.cu in this directory).
+set (PROJECT_NAME kernel_float_vecadd_tiling)
+project(${PROJECT_NAME} LANGUAGES CXX CUDA)
+set (CMAKE_CXX_STANDARD 17)
+
+add_executable(${PROJECT_NAME} "${PROJECT_SOURCE_DIR}/main.cu")
+target_link_libraries(${PROJECT_NAME} kernel_float)
+set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")
+
+# FindCUDA (find_package(CUDA)) is deprecated; FindCUDAToolkit is its
+# replacement and is available from CMake 3.17, the minimum required above.
+find_package(CUDAToolkit REQUIRED)
+target_include_directories(${PROJECT_NAME} PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
diff --git a/examples/vector_add_tiling/main.cu b/examples/vector_add_tiling/main.cu
@@ -0,0 +1,97 @@
+#include <cstdlib>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <vector>
+
+#include "kernel_float.h"
+#include "kernel_float/tiling.h"
+using namespace kernel_float::prelude;
+
+// Converts a failing CUDA runtime status into a std::runtime_error whose
+// message is "CUDA error: " followed by cudaGetErrorString(code).
+// Does nothing when `code` is cudaSuccess.
+void cuda_check(cudaError_t code) {
+    if (code == cudaSuccess) {
+        return;
+    }
+
+    std::string message = "CUDA error: ";
+    message += cudaGetErrorString(code);
+    throw std::runtime_error(message);
+}
+
+// Loads elements of `input`, squares them, multiplies by `constant`, and
+// stores the result to `output`. Which elements a thread handles comes from a
+// kf::tiling object configured with tile_factor<N>, block_size<B> and a
+// block_cyclic<2> distribution, offset by blockIdx.x * tile_size(0).
+//
+// NOTE(review): `length` is never read in this kernel, so no bounds check
+// against the array size is visible here. Confirm that callers size their
+// buffers to cover every tile, or that the mask should be combined with a
+// `points < length` test.
+template<int N, int B>
+__global__ void my_kernel(
+    int length,
+    kf::aligned_ptr<const __half> input,
+    double constant,
+    kf::aligned_ptr<float> output) {
+    using tiling_type = kf::tiling<
+        kf::tile_factor<N>,
+        kf::block_size<B>,
+        kf::distributions<kf::dist::block_cyclic<2>>>;
+    auto layout = tiling_type();
+
+    // First index of the tile owned by this thread block.
+    auto block_offset = int(blockIdx.x * layout.tile_size(0));
+    auto indices = block_offset + layout.local_points(0);
+    auto active = layout.local_mask();
+
+    auto values = kf::load(input.get(), indices, active);
+    auto scaled = (values * values) * constant;
+    kf::store(scaled, output.get(), indices, active);
+}
+
+// Runs my_kernel<items_per_thread, block_size> on `n` generated inputs and
+// verifies the device output against a host-side reference.
+//
+// Throws std::runtime_error when a CUDA call fails (via cuda_check) or when
+// an output element differs from the reference. Returns immediately when
+// n <= 0.
+template<int items_per_thread, int block_size = 256>
+void run_kernel(int n) {
+    if (n <= 0) {
+        return;  // nothing to do; a zero-sized grid is not a valid launch
+    }
+
+    double constant = 1.0;
+    std::vector<half> input(n);
+    // Both host output buffers are indexed directly below, so they must be
+    // sized up front.
+    std::vector<float> output_expected(n);
+    std::vector<float> output_result(n);
+
+    // Generate input data
+    for (int i = 0; i < n; i++) {
+        input[i] = half(i);
+
+        // Reference mirrors the kernel: (input * input) * constant, starting
+        // from the half-rounded input value.
+        // NOTE(review): assumes the kernel squares in half precision and then
+        // scales in double before converting to float -- confirm against
+        // kernel_float's promotion rules.
+        float value = __half2float(input[i]);
+        float squared = __half2float(__float2half(value * value));
+        output_expected[i] = float(double(squared) * constant);
+    }
+
+    // Launch configuration (needed before allocation, see below)
+    int items_per_block = block_size * items_per_thread;
+    int grid_size = (n + items_per_block - 1) / items_per_block;
+
+    // my_kernel never consults its `length` argument, so the last block may
+    // touch indices in [n, grid_size * items_per_block). Size the device
+    // buffers to the full grid extent so those accesses stay in bounds.
+    size_t padded_n = size_t(grid_size) * size_t(items_per_block);
+
+    // Allocate device memory
+    __half* input_dev;
+    float* output_dev;
+    cuda_check(cudaMalloc(&input_dev, sizeof(__half) * padded_n));
+    cuda_check(cudaMalloc(&output_dev, sizeof(float) * padded_n));
+
+    // Copy device memory
+    cuda_check(cudaMemcpy(input_dev, input.data(), sizeof(half) * n, cudaMemcpyDefault));
+
+    // Launch kernel!
+    my_kernel<items_per_thread, block_size><<<grid_size, block_size>>>(
+        n,
+        kf::aligned_ptr(input_dev),
+        constant,
+        kf::aligned_ptr(output_dev));
+    // A launch does not return a status; fetch launch errors explicitly.
+    cuda_check(cudaGetLastError());
+
+    // Copy results back (destination first: host buffer <- device buffer)
+    cuda_check(cudaMemcpy(output_result.data(), output_dev, sizeof(float) * n, cudaMemcpyDefault));
+
+    // Release device memory before verifying, so a mismatch below does not
+    // leak the buffers when it throws.
+    cuda_check(cudaFree(input_dev));
+    cuda_check(cudaFree(output_dev));
+
+    // Check results
+    for (int i = 0; i < n; i++) {
+        float result = output_result[i];
+        float answer = output_expected[i];
+
+        if (result != answer) {
+            std::stringstream msg;
+            msg << "error: index " << i << " is incorrect: " << result << " != " << answer;
+            throw std::runtime_error(msg.str());
+        }
+    }
+}
+
+// Entry point: selects device 0 and runs the example for several
+// items-per-thread settings. Returns EXIT_SUCCESS when every run verifies,
+// EXIT_FAILURE (with the error text on stderr) when a CUDA call or a result
+// check fails.
+int main() {
+    try {
+        int n = 84000;  // divisible by 1, 2, 3, 4, 5, 6, 7, 8
+        cuda_check(cudaSetDevice(0));
+
+        run_kernel<1>(n);
+        run_kernel<2>(n);
+        run_kernel<3>(n);
+        run_kernel<4>(n);
+        run_kernel<8>(n);
+    } catch (const std::exception& err) {
+        // Without this handler the exception would escape main and the
+        // process would end through std::terminate.
+        std::cerr << err.what() << "\n";
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "result correct\n";
+    return EXIT_SUCCESS;
+}
diff --git a/include/kernel_float/tiling.h b/include/kernel_float/tiling.h
diff --git a/tests/tiling.cu b/tests/tiling.cu

Original file line number	Diff line number	Diff line change
`@@ -1 +1,2 @@`
`1`	`1`	`add_subdirectory(vector_add)`
	`2`	`+add_subdirectory(vector_add_tiling)`
Original file line number	Diff line number	Diff line change
`@@ -17,7 +17,7 @@ __global__ void my_kernel(int length, const khalf<N>* input, double constant, kf`
`17`	`17`	`int i = blockIdx.x * blockDim.x + threadIdx.x;`
`18`	`18`
`19`	`19`	`if (i * N < length) {`
`20`		`- output[i] = kf::cast<float>((input[i] * input[i]) * constant);`
	`20`	`+ kf::cast_to(output[i]) = (input[i] * input[i]) * constant;`
`21`	`21`	`}`
`22`	`22`	`}`
`23`	`23`