Skip to content

Commit 755a00e

Browse files
committed
Add tiling object
1 parent f40ff58 commit 755a00e

File tree

6 files changed

+719
-1
lines changed

6 files changed

+719
-1
lines changed

examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
add_subdirectory(vector_add)
2+
add_subdirectory(vector_add_tiling)

examples/vector_add/main.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ __global__ void my_kernel(int length, const khalf<N>* input, double constant, kf
1717
int i = blockIdx.x * blockDim.x + threadIdx.x;
1818

1919
if (i * N < length) {
20-
output[i] = kf::cast<float>((input[i] * input[i]) * constant);
20+
kf::cast_to(output[i]) = (input[i] * input[i]) * constant;
2121
}
2222
}
2323

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
cmake_minimum_required(VERSION 3.17)
2+
3+
set (PROJECT_NAME kernel_float_vecadd_tiling)
4+
project(${PROJECT_NAME} LANGUAGES CXX CUDA)
5+
set (CMAKE_CXX_STANDARD 17)
6+
7+
add_executable(${PROJECT_NAME} "${PROJECT_SOURCE_DIR}/main.cu")
8+
target_link_libraries(${PROJECT_NAME} kernel_float)
9+
set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")
10+
11+
find_package(CUDA REQUIRED)
12+
target_include_directories(${PROJECT_NAME} PRIVATE ${CUDA_TOOLKIT_INCLUDE})

examples/vector_add_tiling/main.cu

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
#include <iostream>
2+
#include <sstream>
3+
#include <stdexcept>
4+
#include <vector>
5+
6+
#include "kernel_float.h"
7+
#include "kernel_float/tiling.h"
8+
using namespace kernel_float::prelude;
9+
10+
void cuda_check(cudaError_t code) {
11+
if (code != cudaSuccess) {
12+
throw std::runtime_error(std::string("CUDA error: ") + cudaGetErrorString(code));
13+
}
14+
}
15+
16+
template<int N, int B>
17+
__global__ void my_kernel(
18+
int length,
19+
kf::aligned_ptr<const __half> input,
20+
double constant,
21+
kf::aligned_ptr<float> output) {
22+
auto tiling = kf::tiling<
23+
kf::tile_factor<N>,
24+
kf::block_size<B>,
25+
kf::distributions<kf::dist::block_cyclic<2>>>();
26+
27+
auto points = int(blockIdx.x * tiling.tile_size(0)) + tiling.local_points(0);
28+
auto mask = tiling.local_mask();
29+
30+
auto a = kf::load(input.get(), points, mask);
31+
auto b = (a * a) * constant;
32+
kf::store(b, output.get(), points, mask);
33+
}
34+
35+
template<int items_per_thread, int block_size = 256>
36+
void run_kernel(int n) {
37+
double constant = 1.0;
38+
std::vector<half> input(n);
39+
std::vector<float> output_expected;
40+
std::vector<float> output_result;
41+
42+
// Generate input data
43+
for (int i = 0; i < n; i++) {
44+
input[i] = half(i);
45+
output_expected[i] = float(i + constant);
46+
}
47+
48+
// Allocate device memory
49+
__half* input_dev;
50+
float* output_dev;
51+
cuda_check(cudaMalloc(&input_dev, sizeof(__half) * n));
52+
cuda_check(cudaMalloc(&output_dev, sizeof(float) * n));
53+
54+
// Copy device memory
55+
cuda_check(cudaMemcpy(input_dev, input.data(), sizeof(half) * n, cudaMemcpyDefault));
56+
57+
// Launch kernel!
58+
int items_per_block = block_size * items_per_thread;
59+
int grid_size = (n + items_per_block - 1) / items_per_block;
60+
my_kernel<items_per_thread, block_size><<<grid_size, block_size>>>(
61+
n,
62+
kf::aligned_ptr(input_dev),
63+
constant,
64+
kf::aligned_ptr(output_dev));
65+
66+
// Copy results back
67+
cuda_check(cudaMemcpy(output_dev, output_result.data(), sizeof(float) * n, cudaMemcpyDefault));
68+
69+
// Check results
70+
for (int i = 0; i < n; i++) {
71+
float result = output_result[i];
72+
float answer = output_expected[i];
73+
74+
if (result != answer) {
75+
std::stringstream msg;
76+
msg << "error: index " << i << " is incorrect: " << result << " != " << answer;
77+
throw std::runtime_error(msg.str());
78+
}
79+
}
80+
81+
cuda_check(cudaFree(input_dev));
82+
cuda_check(cudaFree(output_dev));
83+
}
84+
85+
int main() {
86+
int n = 84000; // divisible by 1, 2, 3, 4, 5, 6, 7, 8
87+
cuda_check(cudaSetDevice(0));
88+
89+
run_kernel<1>(n);
90+
run_kernel<2>(n);
91+
run_kernel<3>(n);
92+
run_kernel<4>(n);
93+
run_kernel<8>(n);
94+
95+
std::cout << "result correct\n";
96+
return EXIT_SUCCESS;
97+
}

0 commit comments

Comments
 (0)