# Kernel Launcher

_Kernel Launcher_ is a C++ library that makes it easy to dynamically compile _CUDA_ kernels at run time (using [NVRTC](https://docs.nvidia.com/cuda/nvrtc/index.html)) and call them in a type-safe way using C++ magic.
Additionally, _Kernel Launcher_ can export kernel specifications so that they can be tuned by [Kernel Tuner](https://github.com/benvanwerkhoven/kernel_tuner), and import the tuning results, known as _wisdom_ files.
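For example, a program typically points _Kernel Launcher_ at the directory holding these wisdom files before any kernels are defined. The sketch below only illustrates the idea; the `set_global_wisdom_directory` helper and the `wisdom/` directory name are assumptions, so consult the documentation for the exact API.

```cpp
#include "kernel_launcher.h"

int main() {
    namespace kl = kernel_launcher;

    // Assumption: tuning results exported by Kernel Tuner live in "wisdom/".
    // Kernels defined afterwards select the best known configuration for the
    // current GPU from these files.
    kl::set_global_wisdom_directory("wisdom/");

    // ... build and launch kernels as shown in the example below ...
    return 0;
}
```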

## Installation

The recommended way to install _Kernel Launcher_ is through CMake; see the [installation guide](https://kerneltuner.github.io/kernel_launcher/install.html).

## Example

See the documentation for [examples](https://kerneltuner.github.io/kernel_launcher/example.html) or check out the [examples](https://github.com/KernelTuner/kernel_launcher/tree/master/examples) directory.

```cpp
#include "kernel_launcher.h"

int main() {
    // Namespace alias.
    namespace kl = kernel_launcher;

    // Create a kernel builder.
    kl::KernelBuilder builder("vector_add", "vector_add_kernel.cu");

    // Define the variables that can be tuned for this kernel.
    kl::ParamExpr threads_per_block = builder.tune("block_size", {32, 64, 128, 256, 512, 1024});
    kl::ParamExpr elements_per_thread = builder.tune("elements_per_thread", {1, 2, 4, 8});

    // Set kernel properties such as block size, grid divisor, template arguments, etc.
    builder
        .block_size(threads_per_block)
        .grid_divisors(threads_per_block * elements_per_thread)
        .template_args(kl::type_of<float>())
        .define("ELEMENTS_PER_THREAD", elements_per_thread);

    // Define the kernel.
    kl::WisdomKernel vector_add_kernel("vector_add", builder);

    // Initialize CUDA memory. This is outside the scope of Kernel Launcher.
    unsigned int n = 1000000;
    float *dev_A, *dev_B, *dev_C;
    /* cudaMalloc, cudaMemcpy, ... */

    // Launch the kernel! Note that the kernel is compiled on the first call.
    // The grid size and block size do not need to be specified; they are
    // derived from the kernel specifications and the problem size.
    unsigned int problem_size = n;
    vector_add_kernel(problem_size)(n, dev_C, dev_A, dev_B);
}
```
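
The host code above compiles the kernel from `vector_add_kernel.cu`, which is not shown in this README. The sketch below is a hypothetical version of that file, written to match the builder configuration above: the `ELEMENTS_PER_THREAD` macro comes from `builder.define(...)`, the type parameter `T` from `builder.template_args(...)`, and each block processes `block_size * elements_per_thread` elements, matching the `grid_divisors(...)` expression.

```cpp
// Hypothetical contents of vector_add_kernel.cu (not taken from the repository).
// ELEMENTS_PER_THREAD is defined at compile time by builder.define(...), and the
// template parameter T is provided through builder.template_args(...).
template <typename T>
__global__ void vector_add(unsigned int n, T* C, T* A, T* B) {
    for (unsigned int k = 0; k < ELEMENTS_PER_THREAD; k++) {
        // Each block covers blockDim.x * ELEMENTS_PER_THREAD consecutive elements.
        unsigned int i = (blockIdx.x * ELEMENTS_PER_THREAD + k) * blockDim.x + threadIdx.x;

        if (i < n) {
            C[i] = A[i] + B[i];
        }
    }
}
```

Because the block size and `ELEMENTS_PER_THREAD` are injected at compile time from the selected configuration, the kernel source itself contains no hard-coded tuning choices.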

## License

Licensed under Apache 2.0. See [LICENSE](https://github.com/KernelTuner/kernel_launcher/blob/master/LICENSE).


## Related Work