Skip to content

Commit b327896

Browse files
committed
added gpu-side timings
1 parent 2826b25 commit b327896

File tree

10 files changed

+68
-13
lines changed

10 files changed

+68
-13
lines changed

include/shady/runtime.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,11 @@ typedef struct Module_ Module;
3030

3131
Program* new_program_from_module(Runtime*, const CompilerConfig*, Module*);
3232

33-
Command* launch_kernel(Program*, Device*, const char* entry_point, int dimx, int dimy, int dimz, int args_count, void** args);
33+
typedef struct {
34+
uint64_t* profiled_gpu_time;
35+
} ExtraKernelOptions;
36+
37+
Command* launch_kernel(Program*, Device*, const char* entry_point, int dimx, int dimy, int dimz, int args_count, void** args, ExtraKernelOptions*);
3438
bool wait_completion(Command*);
3539

3640
Buffer* allocate_buffer_device(Device*, size_t);

samples/aobench/ao_main.c

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -132,15 +132,19 @@ void render_device(Args* args, TEXEL_T *img, int w, int h, int nsubsamples, Stri
132132
Program* program = new_program_from_module(runtime, &args->compiler_config, m);
133133

134134
// run it twice to compile everything and benefit from caches
135-
wait_completion(launch_kernel(program, device, "aobench_kernel", WIDTH / BLOCK_SIZE, HEIGHT / BLOCK_SIZE, 1, 1, (void*[]) { &buf_addr }));
135+
wait_completion(launch_kernel(program, device, "aobench_kernel", WIDTH / BLOCK_SIZE, HEIGHT / BLOCK_SIZE, 1, 1, (void*[]) { &buf_addr }, NULL));
136136
struct timespec ts;
137137
timespec_get(&ts, TIME_UTC);
138138
uint64_t tsn = timespec_to_nano(ts);
139-
wait_completion(launch_kernel(program, device, "aobench_kernel", WIDTH / BLOCK_SIZE, HEIGHT / BLOCK_SIZE, 1, 1, (void*[]) { &buf_addr }));
139+
uint64_t profiled_gpu_time = 0;
140+
ExtraKernelOptions extra_kernel_options = {
141+
.profiled_gpu_time = &profiled_gpu_time
142+
};
143+
wait_completion(launch_kernel(program, device, "aobench_kernel", WIDTH / BLOCK_SIZE, HEIGHT / BLOCK_SIZE, 1, 1, (void*[]) { &buf_addr }, &extra_kernel_options));
140144
struct timespec tp;
141145
timespec_get(&tp, TIME_UTC);
142146
uint64_t tpn = timespec_to_nano(tp);
143-
info_print("device rendering took %d us\n", (tpn - tsn) / 1000);
147+
info_print("device rendering took %dus (gpu time: %dus)\n", (tpn - tsn) / 1000, profiled_gpu_time / 1000);
144148

145149
debug_print("data %d\n", (int) img[0]);
146150

samples/checkerboard/checkerboard.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ int main(int argc, char **argv)
7373
error("Failed to load checkerboard module");
7474
Program* program = new_program_from_module(runtime, &compiler_config, m);
7575

76-
wait_completion(launch_kernel(program, device, "checkerboard", 16, 16, 1, 1, (void*[]) { &buf_addr }));
76+
wait_completion(launch_kernel(program, device, "checkerboard", 16, 16, 1, 1, (void*[]) { &buf_addr }, NULL));
7777

7878
copy_from_buffer(buf, 0, img, buf_size);
7979
info_print("data %d\n", (int) img[0]);

src/runtime/cuda/cuda_runtime.c

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,16 @@ static void cuda_device_cleanup(CudaDevice* device) {
2424

2525
bool cuda_command_wait(CudaCommand* command) {
2626
CHECK_CUDA(cuCtxSynchronize(), return false);
27+
if (command->profiled_gpu_time) {
28+
cudaEventSynchronize(command->stop);
29+
float ms;
30+
cudaEventElapsedTime(&ms, command->start, command->stop);
31+
*command->profiled_gpu_time = (uint64_t) ((double) ms * 1000000);
32+
}
2733
return true;
2834
}
2935

30-
CudaCommand* shd_cuda_launch_kernel(CudaDevice* device, Program* p, String entry_point, int dimx, int dimy, int dimz, int args_count, void** args) {
36+
CudaCommand* shd_cuda_launch_kernel(CudaDevice* device, Program* p, String entry_point, int dimx, int dimy, int dimz, int args_count, void** args, ExtraKernelOptions* options) {
3137
CudaKernel* kernel = shd_cuda_get_specialized_program(device, p, entry_point);
3238

3339
CudaCommand* cmd = calloc(sizeof(CudaCommand), 1);
@@ -36,11 +42,20 @@ CudaCommand* shd_cuda_launch_kernel(CudaDevice* device, Program* p, String entry
3642
.wait_for_completion = (bool(*)(Command*)) cuda_command_wait
3743
}
3844
};
45+
46+
if (options && options->profiled_gpu_time) {
47+
cmd->profiled_gpu_time = options->profiled_gpu_time;
48+
cudaEventCreate(&cmd->start);
49+
cudaEventCreate(&cmd->stop);
50+
cudaEventRecord(cmd->start, 0);
51+
}
52+
3953
ArenaConfig final_config = get_arena_config(get_module_arena(kernel->final_module));
4054
unsigned int gx = final_config.specializations.workgroup_size[0];
4155
unsigned int gy = final_config.specializations.workgroup_size[1];
4256
unsigned int gz = final_config.specializations.workgroup_size[2];
4357
CHECK_CUDA(cuLaunchKernel(kernel->entry_point_function, dimx, dimy, dimz, gx, gy, gz, 0, 0, args, NULL), return NULL);
58+
cudaEventRecord(cmd->stop, 0);
4459
return cmd;
4560
}
4661

@@ -63,7 +78,7 @@ static CudaDevice* create_cuda_device(CudaBackend* b, int ordinal) {
6378
.allocate_buffer = (Buffer* (*)(Device*, size_t)) shd_cuda_allocate_buffer,
6479
.can_import_host_memory = (bool (*)(Device*)) shd_cuda_can_import_host_memory,
6580
.import_host_memory_as_buffer = (Buffer* (*)(Device*, void*, size_t)) shd_cuda_import_host_memory,
66-
.launch_kernel = (Command*(*)(Device*, Program*, String, int, int, int, int, void**)) shd_cuda_launch_kernel,
81+
.launch_kernel = (Command*(*)(Device*, Program*, String, int, int, int, int, void**, ExtraKernelOptions*)) shd_cuda_launch_kernel,
6782
},
6883
.handle = handle,
6984
.specialized_programs = new_dict(SpecProgramKey, CudaKernel*, (HashFn) hash_spec_program_key, (CmpFn) cmp_spec_program_keys),

src/runtime/cuda/cuda_runtime_private.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include "../runtime_private.h"
55

66
#include <cuda.h>
7+
#include <cuda_runtime.h>
78
#include <nvrtc.h>
89

910
#define CHECK_NVRTC(x, failure_handler) { nvrtcResult the_result_ = x; if (the_result_ != NVRTC_SUCCESS) { const char* msg = nvrtcGetErrorString(the_result_); error_print(#x " failed (%s)\n", msg); failure_handler; } }
@@ -40,6 +41,9 @@ typedef struct {
4041

4142
typedef struct {
4243
Command base;
44+
45+
uint64_t* profiled_gpu_time;
46+
cudaEvent_t start, stop;
4347
} CudaCommand;
4448

4549
typedef struct {

src/runtime/runtime.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,8 @@ Device* get_an_device(Runtime* r) {
7474

7575
const char* get_device_name(Device* d) { return d->get_name(d); }
7676

77-
Command* launch_kernel(Program* p, Device* d, const char* entry_point, int dimx, int dimy, int dimz, int args_count, void** args) {
78-
return d->launch_kernel(d, p, entry_point, dimx, dimy, dimz, args_count, args);
77+
Command* launch_kernel(Program* p, Device* d, const char* entry_point, int dimx, int dimy, int dimz, int args_count, void** args, ExtraKernelOptions* extra_options) {
78+
return d->launch_kernel(d, p, entry_point, dimx, dimy, dimz, args_count, args, extra_options);
7979
}
8080

8181
bool wait_completion(Command* cmd) { return cmd->wait_for_completion(cmd); }

src/runtime/runtime_private.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ struct Device_ {
3030
void (*cleanup)(Device*);
3131
String (*get_name)(Device*);
3232

33-
Command* (*launch_kernel)(Device*, Program*, const char* entry_point, int dimx, int dimy, int dimz, int args_count, void** args);
33+
Command* (*launch_kernel)(Device*, Program*, const char* entry_point, int dimx, int dimy, int dimz, int args_count, void** args, ExtraKernelOptions*);
3434
Buffer* (*allocate_buffer)(Device*, size_t bytes);
3535
Buffer* (*import_host_memory_as_buffer)(Device*, void* base, size_t bytes);
3636
bool (*can_import_host_memory)(Device*);

src/runtime/runtime_test.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ int main(int argc, char* argv[]) {
7070

7171
int32_t a0 = 42;
7272
uint64_t a1 = get_buffer_device_pointer(buffer);
73-
wait_completion(launch_kernel(program, device, args.driver_config.config.specialization.entry_point ? args.driver_config.config.specialization.entry_point : "my_kernel", 1, 1, 1, 2, (void*[]) { &a0, &a1 }));
73+
wait_completion(launch_kernel(program, device, args.driver_config.config.specialization.entry_point ? args.driver_config.config.specialization.entry_point : "my_kernel", 1, 1, 1, 2, (void*[]) { &a0, &a1 }, NULL));
7474

7575
destroy_buffer(buffer);
7676

src/runtime/vulkan/vk_runtime_dispatch.c

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ static Command make_command_base() {
5757
};
5858
}
5959

60-
VkrCommand* vkr_launch_kernel(VkrDevice* device, Program* program, String entry_point, int dimx, int dimy, int dimz, int args_count, void** args) {
60+
VkrCommand* vkr_launch_kernel(VkrDevice* device, Program* program, String entry_point, int dimx, int dimy, int dimz, int args_count, void** args, ExtraKernelOptions* options) {
6161
assert(program && device);
6262

6363
VkrSpecProgram* prog = get_specialized_program(program, entry_point, device);
@@ -82,8 +82,26 @@ VkrCommand* vkr_launch_kernel(VkrDevice* device, Program* program, String entry_
8282

8383
vkCmdBindPipeline(cmd->cmd_buf, VK_PIPELINE_BIND_POINT_COMPUTE, prog->pipeline);
8484
bind_program_resources(cmd, prog);
85+
86+
if (options && options->profiled_gpu_time) {
87+
VkQueryPoolCreateInfo qpci = {
88+
.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
89+
.pNext = NULL,
90+
.queryType = VK_QUERY_TYPE_TIMESTAMP,
91+
.queryCount = 2,
92+
};
93+
CHECK_VK(vkCreateQueryPool(device->device, &qpci, NULL, &cmd->query_pool), {});
94+
cmd->profiled_gpu_time = options->profiled_gpu_time;
95+
vkCmdResetQueryPool(cmd->cmd_buf, cmd->query_pool, 0, 1);
96+
vkCmdWriteTimestamp(cmd->cmd_buf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, cmd->query_pool, 0);
97+
}
98+
8599
vkCmdDispatch(cmd->cmd_buf, dimx, dimy, dimz);
86100

101+
if (options && options->profiled_gpu_time) {
102+
vkCmdWriteTimestamp(cmd->cmd_buf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, cmd->query_pool, 1);
103+
}
104+
87105
if (!vkr_submit_command(cmd))
88106
goto err_post_commands_create;
89107

@@ -153,13 +171,20 @@ bool vkr_submit_command(VkrCommand* cmd) {
153171
bool vkr_wait_completion(VkrCommand* cmd) {
154172
assert(cmd->submitted && "Command must be submitted before they can be waited on");
155173
CHECK_VK(vkWaitForFences(cmd->device->device, 1, (VkFence[]) { cmd->done_fence }, true, UINT32_MAX), return false);
174+
if (cmd->profiled_gpu_time) {
175+
uint64_t ts[2];
176+
CHECK_VK(vkGetQueryPoolResults(cmd->device->device, cmd->query_pool, 0, 2, sizeof(uint64_t) * 2, ts, sizeof(uint64_t), VK_QUERY_RESULT_64_BIT), {});
177+
*cmd->profiled_gpu_time = ts[1] - ts[0];
178+
}
156179
vkr_destroy_command(cmd);
157180
return true;
158181
}
159182

160183
void vkr_destroy_command(VkrCommand* cmd) {
161184
if (cmd->submitted)
162185
vkDestroyFence(cmd->device->device, cmd->done_fence, NULL);
186+
if (cmd->query_pool)
187+
vkDestroyQueryPool(cmd->device->device, cmd->query_pool, NULL);
163188
vkFreeCommandBuffers(cmd->device->device, cmd->device->cmd_pool, 1, &cmd->cmd_buf);
164189
free(cmd);
165190
}

src/runtime/vulkan/vk_runtime_private.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,14 +181,17 @@ struct VkrCommand_ {
181181
VkCommandBuffer cmd_buf;
182182
VkFence done_fence;
183183
bool submitted;
184+
185+
uint64_t* profiled_gpu_time;
186+
VkQueryPool query_pool;
184187
};
185188

186189
VkrCommand* vkr_begin_command(VkrDevice* device);
187190
bool vkr_submit_command(VkrCommand* commands);
188191
void vkr_destroy_command(VkrCommand* commands);
189192
bool vkr_wait_completion(VkrCommand* cmd);
190193

191-
VkrCommand* vkr_launch_kernel(VkrDevice* device, Program* program, String entry_point, int dimx, int dimy, int dimz, int args_count, void** args);
194+
VkrCommand* vkr_launch_kernel(VkrDevice* device, Program* program, String entry_point, int dimx, int dimy, int dimz, int args_count, void** args, ExtraKernelOptions*);
192195

193196
typedef struct ProgramResourceInfo_ ProgramResourceInfo;
194197
struct ProgramResourceInfo_ {

0 commit comments

Comments
 (0)