-
Notifications
You must be signed in to change notification settings - Fork 112
Expand file tree
/
Copy pathkernel_fusion.cu
More file actions
45 lines (35 loc) · 1.34 KB
/
kernel_fusion.cu
File metadata and controls
45 lines (35 loc) · 1.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#include <cstdio>

#include "matx.h"
/**
 * Kernel-fusion demo: evaluates B * cos(C) / D two ways so the two approaches
 * can be compared under an NVTX-aware profiler.
 *
 *   1. "Unfused Kernels"  - three separate `.run(exec)` calls, each writing the
 *                           intermediate value back into `result`.
 *   2. "Fused Operation"  - one `.run(exec)` call on the whole expression,
 *                           writing into `A`.
 *
 * Both variants are executed once as a warm-up and then repeated inside named
 * NVTX ranges. Command-line arguments are ignored.
 *
 * Returns 0 on success, 1 if a stream synchronization reports a CUDA error.
 */
int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
{
  MATX_ENTER_HANDLER();

  cudaStream_t stream = 0;
  matx::cudaExecutor exec{stream};

  // Waits for everything queued on `stream` and reports any CUDA error that
  // surfaced. Errors from earlier launches are only visible at a synchronizing
  // call, so ignoring this return value would hide failed work.
  const auto sync_and_check = [&stream](const char *stage) -> bool {
    const cudaError_t status = cudaStreamSynchronize(stream);
    if (status != cudaSuccess) {
      std::fprintf(stderr, "CUDA error during %s: %s\n", stage,
                   cudaGetErrorString(status));
      return false;
    }
    return true;
  };

  // // manually set to log all NVTX levels
  // MATX_NVTX_SET_LOG_LEVEL( matx::matx_nvxtLogLevels::MATX_NVTX_LOG_ALL );

  matx::index_t size_x = 128;
  matx::index_t size_y = 256;

  // Number of timed repetitions of each variant.
  constexpr int num_iterations = 10;

  // NOTE(review): B, C and D are read below but never written in this file, so
  // the computed values are presumably meaningless - only the launch pattern
  // matters for this demo. Confirm before relying on the contents of A/result.
  auto A = matx::make_tensor<float>({size_x, size_y});
  auto B = matx::make_tensor<float>({size_x, size_y});
  auto C = matx::make_tensor<float>({size_x, size_y});
  auto D = matx::make_tensor<float>({size_x, size_y});
  auto result = matx::make_tensor<float>({size_x, size_y});

  // run once to warm-up, so one-time setup costs stay out of the NVTX ranges
  (result = cos(C)).run(exec);
  (result = result / D).run(exec);
  (result = result * B).run(exec);
  (A = B * cos(C)/D).run(exec);

  // Make sure the warm-up work has finished (and succeeded) before measuring.
  if (!sync_and_check("warm-up")) {
    return 1;
  }

  for (int i = 0; i < num_iterations; i++) {
    // first individual, independent kernels
    [[maybe_unused]] int unfused_range = MATX_NVTX_START_RANGE("Unfused Kernels");
    (result = cos(C)).run(exec);
    (result = result / D).run(exec);
    (result = result * B).run(exec);
    MATX_NVTX_END_RANGE(unfused_range);

    // now, as a fused operation
    [[maybe_unused]] int fused_range = MATX_NVTX_START_RANGE("Fused Operation");
    (A = B * cos(C)/D).run(exec);
    MATX_NVTX_END_RANGE(fused_range);
  }

  // Drain the stream before leaving: this guarantees the measured work has
  // completed and turns any failure in it into a non-zero exit code instead of
  // a silent success.
  if (!sync_and_check("benchmark loop")) {
    return 1;
  }

  MATX_EXIT_HANDLER();
}