Skip to content

Commit d9c8bb9

Browse files
committed
opencl: more profiling timing
1 parent f4c3dd5 commit d9c8bb9

File tree

1 file changed

+42
-12
lines changed

1 file changed

+42
-12
lines changed

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 42 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -297,8 +297,17 @@ static int ggml_backend_opencl_n_devices = 0;
297297
struct ProfilingInfo {
298298
std::string op_name;
299299
std::string kernel_name;
300-
// Kernel execution time in nanoseconds.
301-
cl_ulong duration_ns;
300+
// For the times below, see spec for clGetEventProfilingInfo
301+
// The time kernel spent in cmd queue - SUBMIT - QUEUED
302+
cl_ulong cmd_queued_duration_ns;
303+
// The time kernel spent for submission - START - SUBMIT
304+
cl_ulong cmd_submit_duration_ns;
305+
// Kernel execution time in nanoseconds - END - START
306+
cl_ulong cmd_duration_ns;
307+
// The time for the kernel to complete - COMPLETE - END
308+
cl_ulong cmd_complete_duration_ns;
309+
// Total time to finish the kernel - COMPELTE - QUEUED
310+
cl_ulong cmd_total_duration_ns;
302311
// Global and local work sizes.
303312
size_t global_size[3];
304313
size_t local_size[3];
@@ -904,11 +913,16 @@ static void ggml_cl2_free(void) {
904913
}
905914

906915
float total_kernel_time = 0;
907-
fprintf(fperf, "op name, kernel name, duration (ms), global size, local size, output size\n");
916+
fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
908917
for (const ProfilingInfo & info : g_profiling_info) {
909-
total_kernel_time += info.duration_ns/1.e6f;
910-
fprintf(fperf, "%s,%s,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
911-
info.op_name.c_str(), info.kernel_name.c_str(), info.duration_ns/1.e6f,
918+
total_kernel_time += info.cmd_duration_ns/1.e6f;
919+
fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
920+
info.op_name.c_str(), info.kernel_name.c_str(),
921+
info.cmd_queued_duration_ns/1.e6f,
922+
info.cmd_submit_duration_ns/1.e6f,
923+
info.cmd_duration_ns/1.e6f,
924+
info.cmd_complete_duration_ns/1.e6f,
925+
info.cmd_total_duration_ns/1.e6f,
912926
info.global_size[0], info.global_size[1], info.global_size[2],
913927
info.local_size[0], info.local_size[2], info.local_size[2],
914928
info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
@@ -2062,23 +2076,39 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
20622076
// Profiling utility
20632077
//------------------------------------------------------------------------------
20642078
#ifdef GGML_OPENCL_PROFILING
2065-
void populateProfilingInfo(
2079+
static void populateProfilingInfo(
20662080
ProfilingInfo& info, cl_event evt, cl_kernel kernel,
20672081
size_t global_size[3], size_t local_size[3],
20682082
const ggml_tensor * tensor) {
2069-
cl_ulong start;
2070-
cl_ulong end;
2083+
cl_ulong cmd_queued;
2084+
cl_ulong cmd_submit;
2085+
cl_ulong cmd_start;
2086+
cl_ulong cmd_end;
2087+
cl_ulong cmd_complete;
2088+
20712089
CL_CHECK(clWaitForEvents(1, &evt));
20722090
CL_CHECK(clGetEventProfilingInfo(
2073-
evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL));
2091+
evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
2092+
CL_CHECK(clGetEventProfilingInfo(
2093+
evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
2094+
CL_CHECK(clGetEventProfilingInfo(
2095+
evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
20742096
CL_CHECK(clGetEventProfilingInfo(
2075-
evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL));
2097+
evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
2098+
CL_CHECK(clGetEventProfilingInfo(
2099+
evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
2100+
CL_CHECK(clReleaseEvent(evt));
20762101

20772102
char kernel_name[512];
20782103
CL_CHECK(clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME,
20792104
sizeof(kernel_name), kernel_name, NULL));
20802105

2081-
info.duration_ns = end - start;
2106+
info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
2107+
info.cmd_submit_duration_ns = cmd_start - cmd_submit;
2108+
info.cmd_duration_ns = cmd_end - cmd_start;
2109+
info.cmd_complete_duration_ns = cmd_complete - cmd_end;
2110+
info.cmd_total_duration_ns = cmd_complete - cmd_queued;
2111+
20822112
info.op_name = tensor->name;
20832113
info.kernel_name = kernel_name;
20842114
info.local_size[0] = local_size[0];

0 commit comments

Comments
 (0)