Skip to content

Commit a4c65eb

Browse files
committed
opencl: reduce profiling overhead
* Populate profiling timing info at the end rather than after each kernel run
1 parent cac4560 commit a4c65eb

File tree

1 file changed

+48
-36
lines changed

1 file changed

+48
-36
lines changed

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 48 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -297,6 +297,10 @@ static int ggml_backend_opencl_n_devices = 0;
297297
struct ProfilingInfo {
298298
std::string op_name;
299299
std::string kernel_name;
300+
301+
cl_kernel kernel;
302+
cl_event evt;
303+
300304
cl_ulong cmd_queued;
301305
cl_ulong cmd_submit;
302306
cl_ulong cmd_start;
@@ -918,6 +922,45 @@ static void ggml_cl2_free(void) {
918922
return;
919923
}
920924

925+
// Populate profiling info
926+
for (ProfilingInfo & info : g_profiling_info) {
927+
cl_ulong cmd_queued;
928+
cl_ulong cmd_submit;
929+
cl_ulong cmd_start;
930+
cl_ulong cmd_end;
931+
cl_ulong cmd_complete;
932+
933+
CL_CHECK(clWaitForEvents(1, &info.evt));
934+
CL_CHECK(clGetEventProfilingInfo(
935+
info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
936+
CL_CHECK(clGetEventProfilingInfo(
937+
info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
938+
CL_CHECK(clGetEventProfilingInfo(
939+
info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
940+
CL_CHECK(clGetEventProfilingInfo(
941+
info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
942+
CL_CHECK(clGetEventProfilingInfo(
943+
info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
944+
CL_CHECK(clReleaseEvent(info.evt));
945+
946+
char kernel_name[512];
947+
CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
948+
sizeof(kernel_name), kernel_name, NULL));
949+
info.kernel_name = kernel_name;
950+
951+
info.cmd_queued = cmd_queued;
952+
info.cmd_submit = cmd_submit;
953+
info.cmd_start = cmd_start;
954+
info.cmd_end = cmd_end;
955+
956+
info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
957+
info.cmd_submit_duration_ns = cmd_start - cmd_submit;
958+
info.cmd_duration_ns = cmd_end - cmd_start;
959+
info.cmd_complete_duration_ns = cmd_complete - cmd_end;
960+
info.cmd_total_duration_ns = cmd_complete - cmd_queued;
961+
}
962+
963+
// Dump a csv
921964
float total_kernel_time = 0;
922965
fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
923966
for (const ProfilingInfo & info : g_profiling_info) {
@@ -937,6 +980,7 @@ static void ggml_cl2_free(void) {
937980

938981
GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
939982

983+
// Dump a simple chrome trace
940984
FILE* ftrace = fopen("cl_trace.json", "w");
941985
if (!ftrace) {
942986
GGML_LOG_ERROR("Failed to open cl_trace.json\n");
@@ -2107,42 +2151,10 @@ static void populateProfilingInfo(
21072151
ProfilingInfo& info, cl_event evt, cl_kernel kernel,
21082152
size_t global_size[3], size_t local_size[3],
21092153
const ggml_tensor * tensor) {
2110-
cl_ulong cmd_queued;
2111-
cl_ulong cmd_submit;
2112-
cl_ulong cmd_start;
2113-
cl_ulong cmd_end;
2114-
cl_ulong cmd_complete;
2115-
2116-
CL_CHECK(clWaitForEvents(1, &evt));
2117-
CL_CHECK(clGetEventProfilingInfo(
2118-
evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
2119-
CL_CHECK(clGetEventProfilingInfo(
2120-
evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
2121-
CL_CHECK(clGetEventProfilingInfo(
2122-
evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
2123-
CL_CHECK(clGetEventProfilingInfo(
2124-
evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
2125-
CL_CHECK(clGetEventProfilingInfo(
2126-
evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
2127-
CL_CHECK(clReleaseEvent(evt));
2128-
2129-
char kernel_name[512];
2130-
CL_CHECK(clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME,
2131-
sizeof(kernel_name), kernel_name, NULL));
2132-
2133-
info.cmd_queued = cmd_queued;
2134-
info.cmd_submit = cmd_submit;
2135-
info.cmd_start = cmd_start;
2136-
info.cmd_end = cmd_end;
2137-
2138-
info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
2139-
info.cmd_submit_duration_ns = cmd_start - cmd_submit;
2140-
info.cmd_duration_ns = cmd_end - cmd_start;
2141-
info.cmd_complete_duration_ns = cmd_complete - cmd_end;
2142-
info.cmd_total_duration_ns = cmd_complete - cmd_queued;
2143-
2144-
info.op_name = tensor->name;
2145-
info.kernel_name = kernel_name;
2154+
info.op_name = tensor->name;
2155+
info.kernel = kernel;
2156+
info.evt = evt;
2157+
21462158
info.local_size[0] = local_size[0];
21472159
info.local_size[1] = local_size[1];
21482160
info.local_size[2] = local_size[2];

0 commit comments

Comments
 (0)