
Commit 3f52740

lhez authored and Minh141120 committed
opencl: ref count ggml_backend_opencl_context and refactor profiling (ggml-org#14254)
* Move profiling info into `ggml_backend_opencl_context`
* Add `enqueue_ndrange_kernel` to launch kernel
1 parent f3bca29 commit 3f52740
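
For readers skimming the diff: call sites that used to open-code the profiled launch under `#ifdef GGML_OPENCL_PROFILING` now delegate to the backend context, where the profiling state lives. Condensed from the hunks below, the per-call-site pattern changes roughly like this:

// Before (repeated at each call site, as removed below):
#ifdef GGML_OPENCL_PROFILING
    cl_event evt;
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));

    g_profiling_info.emplace_back();
    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
#else
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
#endif

// After (profiling bookkeeping handled by the backend context):
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);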

File tree

1 file changed: +104 -104 lines changed

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 104 additions & 104 deletions
@@ -439,6 +439,108 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_timestep_embedding;
     cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
 
+    std::vector<ProfilingInfo> profiling_info;
+
+    void write_profiling_info() {
+        FILE * fperf = fopen("cl_profiling.csv", "w");
+        if (!fperf) {
+            GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
+            return;
+        }
+
+        // Populate profiling info
+        for (ProfilingInfo & info : profiling_info) {
+            cl_ulong cmd_queued;
+            cl_ulong cmd_submit;
+            cl_ulong cmd_start;
+            cl_ulong cmd_end;
+            cl_ulong cmd_complete;
+
+            CL_CHECK(clWaitForEvents(1, &info.evt));
+            CL_CHECK(clGetEventProfilingInfo(
+                info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
+            CL_CHECK(clGetEventProfilingInfo(
+                info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
+            CL_CHECK(clGetEventProfilingInfo(
+                info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
+            CL_CHECK(clGetEventProfilingInfo(
+                info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
+            CL_CHECK(clGetEventProfilingInfo(
+                info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
+            CL_CHECK(clReleaseEvent(info.evt));
+
+            char kernel_name[512];
+            CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
+                sizeof(kernel_name), kernel_name, NULL));
+            info.kernel_name = kernel_name;
+
+            info.cmd_queued = cmd_queued;
+            info.cmd_submit = cmd_submit;
+            info.cmd_start = cmd_start;
+            info.cmd_end = cmd_end;
+
+            info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
+            info.cmd_submit_duration_ns = cmd_start - cmd_submit;
+            info.cmd_duration_ns = cmd_end - cmd_start;
+            info.cmd_complete_duration_ns = cmd_complete - cmd_end;
+            info.cmd_total_duration_ns = cmd_complete - cmd_queued;
+        }
+
+        // Dump a csv
+        float total_kernel_time = 0;
+        fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
+        for (const ProfilingInfo & info : profiling_info) {
+            total_kernel_time += info.cmd_duration_ns/1.e6f;
+            fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
+                info.op_name.c_str(), info.kernel_name.c_str(),
+                info.cmd_queued_duration_ns/1.e6f,
+                info.cmd_submit_duration_ns/1.e6f,
+                info.cmd_duration_ns/1.e6f,
+                info.cmd_complete_duration_ns/1.e6f,
+                info.cmd_total_duration_ns/1.e6f,
+                info.global_size[0], info.global_size[1], info.global_size[2],
+                info.local_size[0], info.local_size[1], info.local_size[2],
+                info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
+        }
+        fclose(fperf);
+
+        GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
+
+        // Dump a simple chrome trace
+        FILE* ftrace = fopen("cl_trace.json", "w");
+        if (!ftrace) {
+            GGML_LOG_ERROR("Failed to open cl_trace.json\n");
+            return;
+        }
+
+        fprintf(ftrace, "[\n");
+        for (const ProfilingInfo & info : profiling_info) {
+            fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
+                info.kernel_name.c_str(), info.cmd_queued/1000);
+            fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
+                info.kernel_name.c_str(), info.cmd_submit/1000);
+
+            fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
+                info.kernel_name.c_str(), info.cmd_start/1000);
+            fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
+                info.kernel_name.c_str(), info.cmd_end/1000);
+        }
+        fclose(ftrace);
+    }
+
+    void enqueue_ndrange_kernel(cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) {
+#ifdef GGML_OPENCL_PROFILING
+        cl_event evt;
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+        profiling_info.emplace_back();
+        populateProfilingInfo(profiling_info.back(), evt, kernel, work_dim, global_work_size, local_work_size, tensor);
+#else
+        GGML_UNUSED(tensor);
+        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+    }
+
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     // Transpose kernels
     cl_program program_transpose;
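
Note: `ProfilingInfo` itself is defined elsewhere in ggml-opencl.cpp and is not shown in this diff. As a rough orientation only, here is a hypothetical sketch of the fields it must carry, reconstructed purely from how `write_profiling_info()` reads and writes them; the real definition may differ.

#include <string>
#include <CL/cl.h>

// Hypothetical reconstruction from usage in write_profiling_info(); not the
// committed definition.
struct ProfilingInfo {
    std::string op_name;               // label of the ggml op that launched the kernel
    std::string kernel_name;           // filled from CL_KERNEL_FUNCTION_NAME
    cl_kernel   kernel;                // kernel handle, queried for its name
    cl_event    evt;                   // event released after profiling data is read

    cl_ulong cmd_queued;               // CL_PROFILING_COMMAND_QUEUED timestamp
    cl_ulong cmd_submit;               // CL_PROFILING_COMMAND_SUBMIT timestamp
    cl_ulong cmd_start;                // CL_PROFILING_COMMAND_START timestamp
    cl_ulong cmd_end;                  // CL_PROFILING_COMMAND_END timestamp

    cl_ulong cmd_queued_duration_ns;   // submit - queued
    cl_ulong cmd_submit_duration_ns;   // start - submit
    cl_ulong cmd_duration_ns;          // end - start (kernel execution time)
    cl_ulong cmd_complete_duration_ns; // complete - end
    cl_ulong cmd_total_duration_ns;    // complete - queued

    size_t global_size[3];             // NDRange global work size
    size_t local_size[3];              // NDRange local work size
    size_t output_size[4];             // dst->ne[0..3]
};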
@@ -5282,15 +5384,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
         size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13};
         size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-        cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-        g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
     }
 }
 
@@ -5307,7 +5401,6 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
     GGML_ASSERT(src2->extra);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
     ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
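
The per-call-site `queue` local is no longer needed because the launch, and any profiling, now goes through the context's own queue inside `enqueue_ndrange_kernel`. Keep in mind that `clGetEventProfilingInfo` only returns valid timestamps when that queue was created with profiling enabled. A minimal, illustrative sketch of such a setup (not part of this diff; `context` and `device` stand in for the backend's real handles):

// Illustrative sketch only: create a command queue that supports profiling so
// clGetEventProfilingInfo() can report QUEUED/SUBMIT/START/END timestamps.
// `context` and `device` are placeholders for the backend's actual handles.
cl_int err = CL_SUCCESS;
cl_command_queue_properties props = 0;
#ifdef GGML_OPENCL_PROFILING
props |= CL_QUEUE_PROFILING_ENABLE;
#endif
cl_command_queue queue = clCreateCommandQueue(context, device, props, &err);
CL_CHECK(err);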
@@ -5413,15 +5506,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
     size_t global_work_size[] = {(size_t)(ne01+ndst*nsg-1)/(ndst*nsg)*sgs, (size_t)(_ne1+nrows-1)/nrows*nsg, (size_t)ne123};
     size_t local_work_size[] = {(size_t)sgs, (size_t)nsg, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 }
 
 static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
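
The `work_dim` argument that `enqueue_ndrange_kernel` now passes to `populateProfilingInfo` lets the helper record only the dimensions that were actually launched. The helper itself is outside this diff; the following is a hypothetical sketch inferred from its call sites, not the committed implementation.

// Hypothetical sketch, inferred from the call in enqueue_ndrange_kernel();
// field handling and the op label are assumptions.
static void populateProfilingInfo(
        ProfilingInfo & info, cl_event evt, cl_kernel kernel, cl_uint work_dim,
        size_t * global_size, size_t * local_size, const ggml_tensor * tensor) {
    info.op_name = tensor->name;   // assumption: label taken from the destination tensor
    info.kernel  = kernel;
    info.evt     = evt;

    // Copy only as many dimensions as were launched; leave the rest at 1.
    info.global_size[0] = info.global_size[1] = info.global_size[2] = 1;
    info.local_size[0]  = info.local_size[1]  = info.local_size[2]  = 1;
    for (cl_uint i = 0; i < work_dim; ++i) {
        info.global_size[i] = global_size[i];
        info.local_size[i]  = local_size[i];
    }

    // Output shape of the op, used in the CSV report.
    info.output_size[0] = tensor->ne[0];
    info.output_size[1] = tensor->ne[1];
    info.output_size[2] = tensor->ne[2];
    info.output_size[3] = tensor->ne[3];
}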
@@ -6091,91 +6176,6 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
     backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 }
 
-static void ggml_cl_glu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src0);
-    GGML_ASSERT(src0->extra);
-    GGML_ASSERT(dst);
-    GGML_ASSERT(dst->extra);
-
-    GGML_ASSERT(ggml_is_contiguous_1(src0));
-
-    if (src1) {
-        GGML_ASSERT(src1);
-        GGML_ASSERT(src1->extra);
-        GGML_ASSERT(ggml_are_same_shape(src0, src1));
-    }
-
-    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-
-    cl_kernel kernel;
-    switch (ggml_get_glu_op(dst)) {
-        case GGML_GLU_OP_GEGLU:
-            if (dst->type == GGML_TYPE_F32) {
-                kernel = backend_ctx->kernel_geglu;
-            } else {
-                kernel = backend_ctx->kernel_geglu_f16;
-            }
-            break;
-        case GGML_GLU_OP_REGLU:
-            if (dst->type == GGML_TYPE_F32) {
-                kernel = backend_ctx->kernel_reglu;
-            } else {
-                kernel = backend_ctx->kernel_reglu_f16;
-            }
-            break;
-        case GGML_GLU_OP_SWIGLU:
-            if (dst->type == GGML_TYPE_F32) {
-                kernel = backend_ctx->kernel_swiglu;
-            } else {
-                kernel = backend_ctx->kernel_swiglu_f16;
-            }
-            break;
-        default:
-            GGML_ABORT("Unsupported glu op");
-    }
-
-    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
-    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
-
-    ggml_tensor_extra_cl * extra1 = src1 ? (ggml_tensor_extra_cl *)src1->extra : nullptr;
-
-    cl_ulong offset0 = extra0->offset + src0->view_offs;
-    cl_ulong offsetd = extrad->offset + dst->view_offs;
-
-    cl_ulong offset1 = extra1 ? extra1->offset + src1->view_offs : offset0;
-
-    const int ne0 = dst->ne[0];
-
-    const cl_ulong nb01 = src0->nb[1];
-    const cl_ulong nb11 = src1 ? src1->nb[1] : nb01;
-
-    const cl_ulong nb1 = dst->nb[1];
-
-    const int swp = ((const int32_t *) dst->op_params)[1];
-    const int ne00_off = src1 ? 0 : (swp ? ne0 : 0);
-    const int ne10_off = src1 ? 0 : (swp ? 0 : ne0);
-
-    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
-    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
-    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), src1 ? &extra1->data_device : &extra0->data_device));
-    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
-    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
-    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
-    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb01));
-    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb11));
-    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne0));
-    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb1));
-    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne00_off));
-    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10_off));
-
-    const size_t nrows = ggml_nrows(src0);
-    size_t nth = 512;
-    size_t global_work_size[] = {nrows*nth, 1, 1};
-    size_t local_work_size[] = {nth, 1, 1};
-
-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
-}
-
 //------------------------------------------------------------------------------
 // Op offloading
 //------------------------------------------------------------------------------
