|
| 1 | +#include <iostream> |
| 2 | +#include <sstream> |
| 3 | +#include <stdexcept> |
| 4 | +#include <vector> |
| 5 | + |
| 6 | +#include "kernel_float.h" |
| 7 | +#include "kernel_float/tiling.h" |
| 8 | +using namespace kernel_float::prelude; |
| 9 | + |
| 10 | +void cuda_check(cudaError_t code) { |
| 11 | + if (code != cudaSuccess) { |
| 12 | + throw std::runtime_error(std::string("CUDA error: ") + cudaGetErrorString(code)); |
| 13 | + } |
| 14 | +} |
| 15 | + |
| 16 | +template<int N, int B> |
| 17 | +__global__ void my_kernel( |
| 18 | + int length, |
| 19 | + kf::aligned_ptr<const __half> input, |
| 20 | + double constant, |
| 21 | + kf::aligned_ptr<float> output) { |
| 22 | + auto tiling = kf::tiling< |
| 23 | + kf::tile_factor<N>, |
| 24 | + kf::block_size<B>, |
| 25 | + kf::distributions<kf::dist::block_cyclic<2>>>(); |
| 26 | + |
| 27 | + auto points = int(blockIdx.x * tiling.tile_size(0)) + tiling.local_points(0); |
| 28 | + auto mask = tiling.local_mask(); |
| 29 | + |
| 30 | + auto a = kf::load(input.get(), points, mask); |
| 31 | + auto b = (a * a) * constant; |
| 32 | + kf::store(b, output.get(), points, mask); |
| 33 | +} |
| 34 | + |
| 35 | +template<int items_per_thread, int block_size = 256> |
| 36 | +void run_kernel(int n) { |
| 37 | + double constant = 1.0; |
| 38 | + std::vector<half> input(n); |
| 39 | + std::vector<float> output_expected; |
| 40 | + std::vector<float> output_result; |
| 41 | + |
| 42 | + // Generate input data |
| 43 | + for (int i = 0; i < n; i++) { |
| 44 | + input[i] = half(i); |
| 45 | + output_expected[i] = float(i + constant); |
| 46 | + } |
| 47 | + |
| 48 | + // Allocate device memory |
| 49 | + __half* input_dev; |
| 50 | + float* output_dev; |
| 51 | + cuda_check(cudaMalloc(&input_dev, sizeof(__half) * n)); |
| 52 | + cuda_check(cudaMalloc(&output_dev, sizeof(float) * n)); |
| 53 | + |
| 54 | + // Copy device memory |
| 55 | + cuda_check(cudaMemcpy(input_dev, input.data(), sizeof(half) * n, cudaMemcpyDefault)); |
| 56 | + |
| 57 | + // Launch kernel! |
| 58 | + int items_per_block = block_size * items_per_thread; |
| 59 | + int grid_size = (n + items_per_block - 1) / items_per_block; |
| 60 | + my_kernel<items_per_thread, block_size><<<grid_size, block_size>>>( |
| 61 | + n, |
| 62 | + kf::aligned_ptr(input_dev), |
| 63 | + constant, |
| 64 | + kf::aligned_ptr(output_dev)); |
| 65 | + |
| 66 | + // Copy results back |
| 67 | + cuda_check(cudaMemcpy(output_dev, output_result.data(), sizeof(float) * n, cudaMemcpyDefault)); |
| 68 | + |
| 69 | + // Check results |
| 70 | + for (int i = 0; i < n; i++) { |
| 71 | + float result = output_result[i]; |
| 72 | + float answer = output_expected[i]; |
| 73 | + |
| 74 | + if (result != answer) { |
| 75 | + std::stringstream msg; |
| 76 | + msg << "error: index " << i << " is incorrect: " << result << " != " << answer; |
| 77 | + throw std::runtime_error(msg.str()); |
| 78 | + } |
| 79 | + } |
| 80 | + |
| 81 | + cuda_check(cudaFree(input_dev)); |
| 82 | + cuda_check(cudaFree(output_dev)); |
| 83 | +} |
| 84 | + |
| 85 | +int main() { |
| 86 | + int n = 84000; // divisible by 1, 2, 3, 4, 5, 6, 7, 8 |
| 87 | + cuda_check(cudaSetDevice(0)); |
| 88 | + |
| 89 | + run_kernel<1>(n); |
| 90 | + run_kernel<2>(n); |
| 91 | + run_kernel<3>(n); |
| 92 | + run_kernel<4>(n); |
| 93 | + run_kernel<8>(n); |
| 94 | + |
| 95 | + std::cout << "result correct\n"; |
| 96 | + return EXIT_SUCCESS; |
| 97 | +} |
0 commit comments