@@ -297,6 +297,10 @@ static int ggml_backend_opencl_n_devices = 0;
297297struct ProfilingInfo {
298298 std::string op_name;
299299 std::string kernel_name;
300+
301+ cl_kernel kernel;
302+ cl_event evt;
303+
300304 cl_ulong cmd_queued;
301305 cl_ulong cmd_submit;
302306 cl_ulong cmd_start;
@@ -918,6 +922,45 @@ static void ggml_cl2_free(void) {
918922 return ;
919923 }
920924
925+ // Populate profiling info
926+ for (ProfilingInfo & info : g_profiling_info) {
927+ cl_ulong cmd_queued;
928+ cl_ulong cmd_submit;
929+ cl_ulong cmd_start;
930+ cl_ulong cmd_end;
931+ cl_ulong cmd_complete;
932+
933+ CL_CHECK (clWaitForEvents (1 , &info.evt ));
934+ CL_CHECK (clGetEventProfilingInfo (
935+ info.evt , CL_PROFILING_COMMAND_QUEUED, sizeof (cl_ulong), &cmd_queued, NULL ));
936+ CL_CHECK (clGetEventProfilingInfo (
937+ info.evt , CL_PROFILING_COMMAND_SUBMIT, sizeof (cl_ulong), &cmd_submit, NULL ));
938+ CL_CHECK (clGetEventProfilingInfo (
939+ info.evt , CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &cmd_start, NULL ));
940+ CL_CHECK (clGetEventProfilingInfo (
941+ info.evt , CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &cmd_end, NULL ));
942+ CL_CHECK (clGetEventProfilingInfo (
943+ info.evt , CL_PROFILING_COMMAND_COMPLETE, sizeof (cl_ulong), &cmd_complete, NULL ));
944+ CL_CHECK (clReleaseEvent (info.evt ));
945+
946+ char kernel_name[512 ];
947+ CL_CHECK (clGetKernelInfo (info.kernel , CL_KERNEL_FUNCTION_NAME,
948+ sizeof (kernel_name), kernel_name, NULL ));
949+ info.kernel_name = kernel_name;
950+
951+ info.cmd_queued = cmd_queued;
952+ info.cmd_submit = cmd_submit;
953+ info.cmd_start = cmd_start;
954+ info.cmd_end = cmd_end;
955+
956+ info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
957+ info.cmd_submit_duration_ns = cmd_start - cmd_submit;
958+ info.cmd_duration_ns = cmd_end - cmd_start;
959+ info.cmd_complete_duration_ns = cmd_complete - cmd_end;
960+ info.cmd_total_duration_ns = cmd_complete - cmd_queued;
961+ }
962+
963+ // Dump a csv
921964 float total_kernel_time = 0 ;
922965 fprintf (fperf, " op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n " );
923966 for (const ProfilingInfo & info : g_profiling_info) {
@@ -937,6 +980,7 @@ static void ggml_cl2_free(void) {
937980
938981 GGML_LOG_INFO (" ggml_opencl: total kernel time: %f\n " , total_kernel_time);
939982
983+ // Dump a simple chrome trace
940984 FILE* ftrace = fopen (" cl_trace.json" , " w" );
941985 if (!ftrace) {
942986 GGML_LOG_ERROR (" Failed to open cl_trace.json\n " );
@@ -2107,42 +2151,10 @@ static void populateProfilingInfo(
21072151 ProfilingInfo& info, cl_event evt, cl_kernel kernel,
21082152 size_t global_size[3 ], size_t local_size[3 ],
21092153 const ggml_tensor * tensor) {
2110- cl_ulong cmd_queued;
2111- cl_ulong cmd_submit;
2112- cl_ulong cmd_start;
2113- cl_ulong cmd_end;
2114- cl_ulong cmd_complete;
2115-
2116- CL_CHECK (clWaitForEvents (1 , &evt));
2117- CL_CHECK (clGetEventProfilingInfo (
2118- evt, CL_PROFILING_COMMAND_QUEUED, sizeof (cl_ulong), &cmd_queued, NULL ));
2119- CL_CHECK (clGetEventProfilingInfo (
2120- evt, CL_PROFILING_COMMAND_SUBMIT, sizeof (cl_ulong), &cmd_submit, NULL ));
2121- CL_CHECK (clGetEventProfilingInfo (
2122- evt, CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &cmd_start, NULL ));
2123- CL_CHECK (clGetEventProfilingInfo (
2124- evt, CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &cmd_end, NULL ));
2125- CL_CHECK (clGetEventProfilingInfo (
2126- evt, CL_PROFILING_COMMAND_COMPLETE, sizeof (cl_ulong), &cmd_complete, NULL ));
2127- CL_CHECK (clReleaseEvent (evt));
2128-
2129- char kernel_name[512 ];
2130- CL_CHECK (clGetKernelInfo (kernel, CL_KERNEL_FUNCTION_NAME,
2131- sizeof (kernel_name), kernel_name, NULL ));
2132-
2133- info.cmd_queued = cmd_queued;
2134- info.cmd_submit = cmd_submit;
2135- info.cmd_start = cmd_start;
2136- info.cmd_end = cmd_end;
2137-
2138- info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
2139- info.cmd_submit_duration_ns = cmd_start - cmd_submit;
2140- info.cmd_duration_ns = cmd_end - cmd_start;
2141- info.cmd_complete_duration_ns = cmd_complete - cmd_end;
2142- info.cmd_total_duration_ns = cmd_complete - cmd_queued;
2143-
2144- info.op_name = tensor->name ;
2145- info.kernel_name = kernel_name;
2154+ info.op_name = tensor->name ;
2155+ info.kernel = kernel;
2156+ info.evt = evt;
2157+
21462158 info.local_size [0 ] = local_size[0 ];
21472159 info.local_size [1 ] = local_size[1 ];
21482160 info.local_size [2 ] = local_size[2 ];
0 commit comments