@@ -297,8 +297,17 @@ static int ggml_backend_opencl_n_devices = 0;
297297struct ProfilingInfo {
298298 std::string op_name;
299299 std::string kernel_name;
300- // Kernel execution time in nanoseconds.
301- cl_ulong duration_ns;
300+ // For the times below, see spec for clGetEventProfilingInfo
301+ // The time kernel spent in cmd queue - SUBMIT - QUEUED
302+ cl_ulong cmd_queued_duration_ns;
303+ // The time kernel spent for submission - START - SUBMIT
304+ cl_ulong cmd_submit_duration_ns;
305+ // Kernel execution time in nanoseconds - END - START
306+ cl_ulong cmd_duration_ns;
307+ // The time for the kernel to complete - COMPLETE - END
308+ cl_ulong cmd_complete_duration_ns;
309+ // Total time to finish the kernel - COMPELTE - QUEUED
310+ cl_ulong cmd_total_duration_ns;
302311 // Global and local work sizes.
303312 size_t global_size[3 ];
304313 size_t local_size[3 ];
@@ -904,11 +913,16 @@ static void ggml_cl2_free(void) {
904913 }
905914
906915 float total_kernel_time = 0 ;
907- fprintf (fperf, " op name, kernel name, duration (ms), global size, local size, output size\n " );
916+ fprintf (fperf, " op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n " );
908917 for (const ProfilingInfo & info : g_profiling_info) {
909- total_kernel_time += info.duration_ns /1 .e6f ;
910- fprintf (fperf, " %s,%s,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n " ,
911- info.op_name .c_str (), info.kernel_name .c_str (), info.duration_ns /1 .e6f ,
918+ total_kernel_time += info.cmd_duration_ns /1 .e6f ;
919+ fprintf (fperf, " %s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n " ,
920+ info.op_name .c_str (), info.kernel_name .c_str (),
921+ info.cmd_queued_duration_ns /1 .e6f ,
922+ info.cmd_submit_duration_ns /1 .e6f ,
923+ info.cmd_duration_ns /1 .e6f ,
924+ info.cmd_complete_duration_ns /1 .e6f ,
925+ info.cmd_total_duration_ns /1 .e6f ,
912926 info.global_size [0 ], info.global_size [1 ], info.global_size [2 ],
913927 info.local_size [0 ], info.local_size [2 ], info.local_size [2 ],
914928 info.output_size [0 ], info.output_size [1 ], info.output_size [2 ], info.output_size [3 ]);
@@ -2062,23 +2076,39 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
20622076// Profiling utility
20632077// ------------------------------------------------------------------------------
20642078#ifdef GGML_OPENCL_PROFILING
2065- void populateProfilingInfo (
2079+ static void populateProfilingInfo (
20662080 ProfilingInfo& info, cl_event evt, cl_kernel kernel,
20672081 size_t global_size[3 ], size_t local_size[3 ],
20682082 const ggml_tensor * tensor) {
2069- cl_ulong start;
2070- cl_ulong end;
2083+ cl_ulong cmd_queued;
2084+ cl_ulong cmd_submit;
2085+ cl_ulong cmd_start;
2086+ cl_ulong cmd_end;
2087+ cl_ulong cmd_complete;
2088+
20712089 CL_CHECK (clWaitForEvents (1 , &evt));
20722090 CL_CHECK (clGetEventProfilingInfo (
2073- evt, CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &start, NULL ));
2091+ evt, CL_PROFILING_COMMAND_QUEUED, sizeof (cl_ulong), &cmd_queued, NULL ));
2092+ CL_CHECK (clGetEventProfilingInfo (
2093+ evt, CL_PROFILING_COMMAND_SUBMIT, sizeof (cl_ulong), &cmd_submit, NULL ));
2094+ CL_CHECK (clGetEventProfilingInfo (
2095+ evt, CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &cmd_start, NULL ));
20742096 CL_CHECK (clGetEventProfilingInfo (
2075- evt, CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &end, NULL ));
2097+ evt, CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &cmd_end, NULL ));
2098+ CL_CHECK (clGetEventProfilingInfo (
2099+ evt, CL_PROFILING_COMMAND_COMPLETE, sizeof (cl_ulong), &cmd_complete, NULL ));
2100+ CL_CHECK (clReleaseEvent (evt));
20762101
20772102 char kernel_name[512 ];
20782103 CL_CHECK (clGetKernelInfo (kernel, CL_KERNEL_FUNCTION_NAME,
20792104 sizeof (kernel_name), kernel_name, NULL ));
20802105
2081- info.duration_ns = end - start;
2106+ info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
2107+ info.cmd_submit_duration_ns = cmd_start - cmd_submit;
2108+ info.cmd_duration_ns = cmd_end - cmd_start;
2109+ info.cmd_complete_duration_ns = cmd_complete - cmd_end;
2110+ info.cmd_total_duration_ns = cmd_complete - cmd_queued;
2111+
20822112 info.op_name = tensor->name ;
20832113 info.kernel_name = kernel_name;
20842114 info.local_size [0 ] = local_size[0 ];
0 commit comments