forked from NVIDIA/CUDALibrarySamples
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnvrtc_fft_thread.cu
More file actions
162 lines (136 loc) · 5.82 KB
/
nvrtc_fft_thread.cu
File metadata and controls
162 lines (136 loc) · 5.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#include <vector>
#include <iostream>
#include <string>
#include <algorithm>
#include <memory>
#include <nvrtc.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include "common.hpp"
#include "common_nvrtc.hpp"
const char* thread_fft_kernel = R"kernel(
#include <cufftdx.hpp>
using namespace cufftdx;
// FFT
using size_desc = Size<FFT_SIZE>;
using dir_desc = Direction<fft_direction::inverse>;
using type_desc = Type<fft_type::c2c>;
using FFT = decltype(size_desc() + dir_desc() + type_desc() + Thread() + Precision<double>());
extern "C" __global__ void thread_fft_kernel(typename FFT::value_type *data)
{
// Local array for thread
typename FFT::value_type thread_data[FFT::storage_size];
// Load data from global memory to registers.
// thread_data should have all input data in order.
unsigned int index = threadIdx.x * FFT::elements_per_thread;
for (size_t i = 0; i < FFT::elements_per_thread; i++) {
thread_data[i] = data[index + i];
}
// Execute FFT
FFT().execute(thread_data);
// Save results
for (size_t i = 0; i < FFT::elements_per_thread; i++) {
data[index + i] = thread_data[i];
}
}
)kernel";
int main(int, char**) {
// Note that FFT description is only defined in kernel
static constexpr unsigned int fft_size = 16;
using value_type = double2; // or cuDoubleComplex, or cuda::std::complex<double>
// Add FFT_SIZE definition to opts
std::string fft_size_definition = "-DFFT_SIZE=" + std::to_string(fft_size);
// Get current device
int current_device;
CUDA_CHECK_AND_EXIT(cudaGetDevice(¤t_device));
// Get architecture of current device
std::string gpu_architecture_option = example::nvrtc::get_device_architecture_option(current_device);
// Create a program
nvrtcProgram program;
NVRTC_SAFE_CALL(nvrtcCreateProgram(&program, // program
thread_fft_kernel, // buffer
"thread_fft_kernel.cu", // name
0, // numHeaders
NULL, // headers
NULL)); // includeNames
// Prepare compilation options
std::vector<const char*> opts = {
"--std=c++17",
"--device-as-default-execution-space",
"--include-path=" CUDA_INCLUDE_DIR // Add path to CUDA include directory
};
// Parse cuFFTDx include dirs
std::vector<std::string> cufftdx_include_dirs = example::nvrtc::get_cufftdx_include_dirs();
// Add cuFFTDx include dirs to opts
for (auto& d : cufftdx_include_dirs) {
opts.push_back(d.c_str());
}
// Add FFT_SIZE definition
opts.push_back(fft_size_definition.c_str());
// Add gpu-architecture flag
opts.push_back(gpu_architecture_option.c_str());
nvrtcResult compileResult = nvrtcCompileProgram(program, // program
static_cast<int>(opts.size()), // numOptions
opts.data()); // options
// Obtain compilation log from the program
if (compileResult != NVRTC_SUCCESS) {
example::nvrtc::print_program_log(program);
std::exit(1);
}
// Obtain PTX from the program.
size_t ptx_size;
NVRTC_SAFE_CALL(nvrtcGetPTXSize(program, &ptx_size));
auto ptx = std::make_unique<char[]>(ptx_size);
NVRTC_SAFE_CALL(nvrtcGetPTX(program, ptx.get()));
// Destroy the program.
NVRTC_SAFE_CALL(nvrtcDestroyProgram(&program));
// Load the generated PTX and get a handle to the thread_fft_kernel
CUcontext context;
CUmodule module;
CUfunction kernel;
CUDA_CHECK_AND_EXIT(cudaFree(0)); // Initialize CUDA context
CU_CHECK_AND_EXIT(cuCtxGetCurrent(&context)); // Get current context
CU_CHECK_AND_EXIT(cuModuleLoadDataEx(&module, ptx.get(), 0, 0, 0));
CU_CHECK_AND_EXIT(cuModuleGetFunction(&kernel, module, "thread_fft_kernel"));
// Generate input for execution
std::vector<value_type> host_input(fft_size);
float i = 0.0f;
for (auto& v : host_input) {
v.x = i++;
v.y = 0;
}
size_t fft_buffer_size = fft_size * sizeof(value_type);
void* device_values;
CUDA_CHECK_AND_EXIT(cudaMalloc(&device_values, fft_buffer_size));
CUDA_CHECK_AND_EXIT(cudaMemcpy(device_values, host_input.data(), fft_buffer_size, cudaMemcpyHostToDevice));
// Execute thread_fft_kernel
void* args[] = {&device_values};
CU_CHECK_AND_EXIT(cuLaunchKernel(kernel,
1, // number of blocks
1,
1,
1, // number of threads
1,
1,
0, // no shared memory
NULL, // NULL stream
args,
0));
CU_CHECK_AND_EXIT(cuCtxSynchronize());
// Retrieve and print output.
std::vector<value_type> host_output(fft_size);
CUDA_CHECK_AND_EXIT(cudaMemcpy(host_output.data(), device_values, fft_buffer_size, cudaMemcpyDeviceToHost));
for (size_t i = 0; i < fft_size; ++i) {
std::cout << i << ": (" << host_output[i].x << ", " << host_output[i].y << ")" << std::endl;
}
// Release resources.
CUDA_CHECK_AND_EXIT(cudaFree(device_values));
CU_CHECK_AND_EXIT(cuModuleUnload(module));
double expected_value = (fft_size * (fft_size + 1)) / 2;
if ((host_output[0].x - expected_value) > 0.01) {
std::cout << "Failed" << std::endl;
return 1;
}
std::cout << "Success" << std::endl;
return 0;
}