From d9c8bb98401b676baea1b0eb149d5a06851be98c Mon Sep 17 00:00:00 2001 From: Li He Date: Thu, 6 Mar 2025 22:44:34 -0800 Subject: [PATCH 1/4] opencl: more profiling timing --- ggml/src/ggml-opencl/ggml-opencl.cpp | 54 +++++++++++++++++++++------- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 14d9934fb1b..3d27453d115 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -297,8 +297,17 @@ static int ggml_backend_opencl_n_devices = 0; struct ProfilingInfo { std::string op_name; std::string kernel_name; - // Kernel execution time in nanoseconds. - cl_ulong duration_ns; + // For the times below, see spec for clGetEventProfilingInfo + // The time kernel spent in cmd queue - SUBMIT - QUEUED + cl_ulong cmd_queued_duration_ns; + // The time kernel spent for submission - START - SUBMIT + cl_ulong cmd_submit_duration_ns; + // Kernel execution time in nanoseconds - END - START + cl_ulong cmd_duration_ns; + // The time for the kernel to complete - COMPLETE - END + cl_ulong cmd_complete_duration_ns; + // Total time to finish the kernel - COMPELTE - QUEUED + cl_ulong cmd_total_duration_ns; // Global and local work sizes. size_t global_size[3]; size_t local_size[3]; @@ -904,11 +913,16 @@ static void ggml_cl2_free(void) { } float total_kernel_time = 0; - fprintf(fperf, "op name, kernel name, duration (ms), global size, local size, output size\n"); + fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n"); for (const ProfilingInfo & info : g_profiling_info) { - total_kernel_time += info.duration_ns/1.e6f; - fprintf(fperf, "%s,%s,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n", - info.op_name.c_str(), info.kernel_name.c_str(), info.duration_ns/1.e6f, + total_kernel_time += info.cmd_duration_ns/1.e6f; + fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n", + info.op_name.c_str(), info.kernel_name.c_str(), + info.cmd_queued_duration_ns/1.e6f, + info.cmd_submit_duration_ns/1.e6f, + info.cmd_duration_ns/1.e6f, + info.cmd_complete_duration_ns/1.e6f, + info.cmd_total_duration_ns/1.e6f, info.global_size[0], info.global_size[1], info.global_size[2], info.local_size[0], info.local_size[2], info.local_size[2], info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]); @@ -2062,23 +2076,39 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso // Profiling utility //------------------------------------------------------------------------------ #ifdef GGML_OPENCL_PROFILING -void populateProfilingInfo( +static void populateProfilingInfo( ProfilingInfo& info, cl_event evt, cl_kernel kernel, size_t global_size[3], size_t local_size[3], const ggml_tensor * tensor) { - cl_ulong start; - cl_ulong end; + cl_ulong cmd_queued; + cl_ulong cmd_submit; + cl_ulong cmd_start; + cl_ulong cmd_end; + cl_ulong cmd_complete; + CL_CHECK(clWaitForEvents(1, &evt)); CL_CHECK(clGetEventProfilingInfo( - evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL)); + evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL)); + CL_CHECK(clGetEventProfilingInfo( + evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL)); + CL_CHECK(clGetEventProfilingInfo( + evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL)); CL_CHECK(clGetEventProfilingInfo( - evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); + evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL)); + CL_CHECK(clGetEventProfilingInfo( + evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL)); + CL_CHECK(clReleaseEvent(evt)); char kernel_name[512]; CL_CHECK(clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, sizeof(kernel_name), kernel_name, NULL)); - info.duration_ns = end - start; + info.cmd_queued_duration_ns = cmd_submit - cmd_queued; + info.cmd_submit_duration_ns = cmd_start - cmd_submit; + info.cmd_duration_ns = cmd_end - cmd_start; + info.cmd_complete_duration_ns = cmd_complete - cmd_end; + info.cmd_total_duration_ns = cmd_complete - cmd_queued; + info.op_name = tensor->name; info.kernel_name = kernel_name; info.local_size[0] = local_size[0]; From cac4560cf42189af390619a2a77c37a92923be9e Mon Sep 17 00:00:00 2001 From: Li He Date: Mon, 10 Mar 2025 22:45:38 -0700 Subject: [PATCH 2/4] opencl: generate trace for profiling --- ggml/src/ggml-opencl/ggml-opencl.cpp | 32 ++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 3d27453d115..535a4e62d56 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -297,6 +297,12 @@ static int ggml_backend_opencl_n_devices = 0; struct ProfilingInfo { std::string op_name; std::string kernel_name; + cl_ulong cmd_queued; + cl_ulong cmd_submit; + cl_ulong cmd_start; + cl_ulong cmd_end; + cl_ulong overhead_start; + cl_ulong overhead_end; // For the times below, see spec for clGetEventProfilingInfo // The time kernel spent in cmd queue - SUBMIT - QUEUED cl_ulong cmd_queued_duration_ns; @@ -930,6 +936,27 @@ static void ggml_cl2_free(void) { fclose(fperf); GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time); + + FILE* ftrace = fopen("cl_trace.json", "w"); + if (!ftrace) { + GGML_LOG_ERROR("Failed to open cl_trace.json\n"); + return; + } + + fprintf(ftrace, "[\n"); + for (const ProfilingInfo & info : g_profiling_info) { + fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n", + info.kernel_name.c_str(), info.cmd_queued/1000); + fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n", + info.kernel_name.c_str(), info.cmd_submit/1000); + + fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n", + info.kernel_name.c_str(), info.cmd_start/1000); + fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n", + info.kernel_name.c_str(), info.cmd_end/1000); + } + fprintf(ftrace, "]\n"); + fclose(ftrace); #endif } @@ -2103,6 +2130,11 @@ static void populateProfilingInfo( CL_CHECK(clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, sizeof(kernel_name), kernel_name, NULL)); + info.cmd_queued = cmd_queued; + info.cmd_submit = cmd_submit; + info.cmd_start = cmd_start; + info.cmd_end = cmd_end; + info.cmd_queued_duration_ns = cmd_submit - cmd_queued; info.cmd_submit_duration_ns = cmd_start - cmd_submit; info.cmd_duration_ns = cmd_end - cmd_start; From a4c65ebb2bc7db60df73ab23a01ebc50f3917c86 Mon Sep 17 00:00:00 2001 From: Li He Date: Tue, 11 Mar 2025 15:38:09 -0700 Subject: [PATCH 3/4] opencl: reduce profiling overhead * Populate profiling timing info at the end rather than after each kernel run --- ggml/src/ggml-opencl/ggml-opencl.cpp | 84 ++++++++++++++++------------ 1 file changed, 48 insertions(+), 36 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 535a4e62d56..71256f01db1 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -297,6 +297,10 @@ static int ggml_backend_opencl_n_devices = 0; struct ProfilingInfo { std::string op_name; std::string kernel_name; + + cl_kernel kernel; + cl_event evt; + cl_ulong cmd_queued; cl_ulong cmd_submit; cl_ulong cmd_start; @@ -918,6 +922,45 @@ static void ggml_cl2_free(void) { return; } + // Populate profiling info + for (ProfilingInfo & info : g_profiling_info) { + cl_ulong cmd_queued; + cl_ulong cmd_submit; + cl_ulong cmd_start; + cl_ulong cmd_end; + cl_ulong cmd_complete; + + CL_CHECK(clWaitForEvents(1, &info.evt)); + CL_CHECK(clGetEventProfilingInfo( + info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL)); + CL_CHECK(clGetEventProfilingInfo( + info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL)); + CL_CHECK(clGetEventProfilingInfo( + info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL)); + CL_CHECK(clGetEventProfilingInfo( + info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL)); + CL_CHECK(clGetEventProfilingInfo( + info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL)); + CL_CHECK(clReleaseEvent(info.evt)); + + char kernel_name[512]; + CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME, + sizeof(kernel_name), kernel_name, NULL)); + info.kernel_name = kernel_name; + + info.cmd_queued = cmd_queued; + info.cmd_submit = cmd_submit; + info.cmd_start = cmd_start; + info.cmd_end = cmd_end; + + info.cmd_queued_duration_ns = cmd_submit - cmd_queued; + info.cmd_submit_duration_ns = cmd_start - cmd_submit; + info.cmd_duration_ns = cmd_end - cmd_start; + info.cmd_complete_duration_ns = cmd_complete - cmd_end; + info.cmd_total_duration_ns = cmd_complete - cmd_queued; + } + + // Dump a csv float total_kernel_time = 0; fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n"); for (const ProfilingInfo & info : g_profiling_info) { @@ -937,6 +980,7 @@ static void ggml_cl2_free(void) { GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time); + // Dump a simple chrome trace FILE* ftrace = fopen("cl_trace.json", "w"); if (!ftrace) { GGML_LOG_ERROR("Failed to open cl_trace.json\n"); @@ -2107,42 +2151,10 @@ static void populateProfilingInfo( ProfilingInfo& info, cl_event evt, cl_kernel kernel, size_t global_size[3], size_t local_size[3], const ggml_tensor * tensor) { - cl_ulong cmd_queued; - cl_ulong cmd_submit; - cl_ulong cmd_start; - cl_ulong cmd_end; - cl_ulong cmd_complete; - - CL_CHECK(clWaitForEvents(1, &evt)); - CL_CHECK(clGetEventProfilingInfo( - evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL)); - CL_CHECK(clGetEventProfilingInfo( - evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL)); - CL_CHECK(clGetEventProfilingInfo( - evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL)); - CL_CHECK(clGetEventProfilingInfo( - evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL)); - CL_CHECK(clGetEventProfilingInfo( - evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL)); - CL_CHECK(clReleaseEvent(evt)); - - char kernel_name[512]; - CL_CHECK(clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, - sizeof(kernel_name), kernel_name, NULL)); - - info.cmd_queued = cmd_queued; - info.cmd_submit = cmd_submit; - info.cmd_start = cmd_start; - info.cmd_end = cmd_end; - - info.cmd_queued_duration_ns = cmd_submit - cmd_queued; - info.cmd_submit_duration_ns = cmd_start - cmd_submit; - info.cmd_duration_ns = cmd_end - cmd_start; - info.cmd_complete_duration_ns = cmd_complete - cmd_end; - info.cmd_total_duration_ns = cmd_complete - cmd_queued; - - info.op_name = tensor->name; - info.kernel_name = kernel_name; + info.op_name = tensor->name; + info.kernel = kernel; + info.evt = evt; + info.local_size[0] = local_size[0]; info.local_size[1] = local_size[1]; info.local_size[2] = local_size[2]; From b4fce83f51275dc52ab9fd3894562e097eebbec8 Mon Sep 17 00:00:00 2001 From: Li He Date: Fri, 14 Mar 2025 14:04:23 -0700 Subject: [PATCH 4/4] opencl: fix for chrome tracing --- ggml/src/ggml-opencl/ggml-opencl.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 71256f01db1..efaf7f47907 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -999,7 +999,6 @@ static void ggml_cl2_free(void) { fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n", info.kernel_name.c_str(), info.cmd_end/1000); } - fprintf(ftrace, "]\n"); fclose(ftrace); #endif }