@@ -439,6 +439,108 @@ struct ggml_backend_opencl_context {
439439 cl_kernel kernel_timestep_embedding;
440440 cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
441441
442+ std::vector<ProfilingInfo> profiling_info;
443+
444+ void write_profiling_info () {
445+ FILE * fperf = fopen (" cl_profiling.csv" , " w" );
446+ if (!fperf) {
447+ GGML_LOG_ERROR (" Failed to open cl_profiling.csv\n " );
448+ return ;
449+ }
450+
451+ // Populate profiling info
452+ for (ProfilingInfo & info : profiling_info) {
453+ cl_ulong cmd_queued;
454+ cl_ulong cmd_submit;
455+ cl_ulong cmd_start;
456+ cl_ulong cmd_end;
457+ cl_ulong cmd_complete;
458+
459+ CL_CHECK (clWaitForEvents (1 , &info.evt ));
460+ CL_CHECK (clGetEventProfilingInfo (
461+ info.evt , CL_PROFILING_COMMAND_QUEUED, sizeof (cl_ulong), &cmd_queued, NULL ));
462+ CL_CHECK (clGetEventProfilingInfo (
463+ info.evt , CL_PROFILING_COMMAND_SUBMIT, sizeof (cl_ulong), &cmd_submit, NULL ));
464+ CL_CHECK (clGetEventProfilingInfo (
465+ info.evt , CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &cmd_start, NULL ));
466+ CL_CHECK (clGetEventProfilingInfo (
467+ info.evt , CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &cmd_end, NULL ));
468+ CL_CHECK (clGetEventProfilingInfo (
469+ info.evt , CL_PROFILING_COMMAND_COMPLETE, sizeof (cl_ulong), &cmd_complete, NULL ));
470+ CL_CHECK (clReleaseEvent (info.evt ));
471+
472+ char kernel_name[512 ];
473+ CL_CHECK (clGetKernelInfo (info.kernel , CL_KERNEL_FUNCTION_NAME,
474+ sizeof (kernel_name), kernel_name, NULL ));
475+ info.kernel_name = kernel_name;
476+
477+ info.cmd_queued = cmd_queued;
478+ info.cmd_submit = cmd_submit;
479+ info.cmd_start = cmd_start;
480+ info.cmd_end = cmd_end;
481+
482+ info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
483+ info.cmd_submit_duration_ns = cmd_start - cmd_submit;
484+ info.cmd_duration_ns = cmd_end - cmd_start;
485+ info.cmd_complete_duration_ns = cmd_complete - cmd_end;
486+ info.cmd_total_duration_ns = cmd_complete - cmd_queued;
487+ }
488+
489+ // Dump a csv
490+ float total_kernel_time = 0 ;
491+ fprintf (fperf, " op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n " );
492+ for (const ProfilingInfo & info : profiling_info) {
493+ total_kernel_time += info.cmd_duration_ns /1 .e6f ;
494+ fprintf (fperf, " %s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n " ,
495+ info.op_name .c_str (), info.kernel_name .c_str (),
496+ info.cmd_queued_duration_ns /1 .e6f ,
497+ info.cmd_submit_duration_ns /1 .e6f ,
498+ info.cmd_duration_ns /1 .e6f ,
499+ info.cmd_complete_duration_ns /1 .e6f ,
500+ info.cmd_total_duration_ns /1 .e6f ,
501+ info.global_size [0 ], info.global_size [1 ], info.global_size [2 ],
502+ info.local_size [0 ], info.local_size [1 ], info.local_size [2 ],
503+ info.output_size [0 ], info.output_size [1 ], info.output_size [2 ], info.output_size [3 ]);
504+ }
505+ fclose (fperf);
506+
507+ GGML_LOG_INFO (" ggml_opencl: total kernel time: %f\n " , total_kernel_time);
508+
509+ // Dump a simple chrome trace
510+ FILE* ftrace = fopen (" cl_trace.json" , " w" );
511+ if (!ftrace) {
512+ GGML_LOG_ERROR (" Failed to open cl_trace.json\n " );
513+ return ;
514+ }
515+
516+ fprintf (ftrace, " [\n " );
517+ for (const ProfilingInfo & info : profiling_info) {
518+ fprintf (ftrace, " {\" name\" : \" %s\" , \" cat\" : \" OpenCL\" , \" ph\" : \" B\" , \" ts\" : %lu, \" pid\" : \"\" , \" tid\" : \" Host\" },\n " ,
519+ info.kernel_name .c_str (), info.cmd_queued /1000 );
520+ fprintf (ftrace, " {\" name\" : \" %s\" , \" cat\" : \" OpenCL\" , \" ph\" : \" E\" , \" ts\" : %lu, \" pid\" : \"\" , \" tid\" : \" Host\" },\n " ,
521+ info.kernel_name .c_str (), info.cmd_submit /1000 );
522+
523+ fprintf (ftrace, " {\" name\" : \" %s\" , \" cat\" : \" OpenCL\" , \" ph\" : \" B\" , \" ts\" : %lu, \" pid\" : \"\" , \" tid\" : \" Device\" },\n " ,
524+ info.kernel_name .c_str (), info.cmd_start /1000 );
525+ fprintf (ftrace, " {\" name\" : \" %s\" , \" cat\" : \" OpenCL\" , \" ph\" : \" E\" , \" ts\" : %lu, \" pid\" : \"\" , \" tid\" : \" Device\" },\n " ,
526+ info.kernel_name .c_str (), info.cmd_end /1000 );
527+ }
528+ fclose (ftrace);
529+ }
530+
531+ void enqueue_ndrange_kernel (cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) {
532+ #ifdef GGML_OPENCL_PROFILING
533+ cl_event evt;
534+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, work_dim, NULL , global_work_size, local_work_size, 0 , NULL , &evt));
535+
536+ profiling_info.emplace_back ();
537+ populateProfilingInfo (profiling_info.back (), evt, kernel, work_dim, global_work_size, local_work_size, tensor);
538+ #else
539+ GGML_UNUSED (tensor);
540+ CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, work_dim, NULL , global_work_size, local_work_size, 0 , NULL , NULL ));
541+ #endif
542+ }
543+
442544#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
443545 // Transpose kernels
444546 cl_program program_transpose;
@@ -5282,15 +5384,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
52825384 size_t global_work_size[] = {(size_t )ne01*nth0, (size_t )ny*nth1, (size_t )ne12*ne13};
52835385 size_t local_work_size[] = {(size_t )nth0, (size_t )nth1, 1 };
52845386
5285- #ifdef GGML_OPENCL_PROFILING
5286- cl_event evt;
5287- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size, 0 , NULL , &evt));
5288-
5289- g_profiling_info.emplace_back ();
5290- populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size, dst);
5291- #else
5292- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size, 0 , NULL , NULL ));
5293- #endif
5387+ backend_ctx->enqueue_ndrange_kernel (kernel, 3 , global_work_size, local_work_size, dst);
52945388 }
52955389}
52965390
@@ -5307,7 +5401,6 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
53075401 GGML_ASSERT (src2->extra );
53085402
53095403 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context ;
5310- cl_command_queue queue = backend_ctx->queue ;
53115404
53125405 ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra ;
53135406 ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra ;
@@ -5413,15 +5506,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
54135506 size_t global_work_size[] = {(size_t )(ne01+ndst*nsg-1 )/(ndst*nsg)*sgs, (size_t )(_ne1+nrows-1 )/nrows*nsg, (size_t )ne123};
54145507 size_t local_work_size[] = {(size_t )sgs, (size_t )nsg, 1 };
54155508
5416- #ifdef GGML_OPENCL_PROFILING
5417- cl_event evt;
5418- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size, 0 , NULL , &evt));
5419-
5420- g_profiling_info.emplace_back ();
5421- populateProfilingInfo (g_profiling_info.back (), evt, kernel, global_work_size, local_work_size, dst);
5422- #else
5423- CL_CHECK (clEnqueueNDRangeKernel (queue, kernel, 3 , NULL , global_work_size, local_work_size, 0 , NULL , NULL ));
5424- #endif
5509+ backend_ctx->enqueue_ndrange_kernel (kernel, 3 , global_work_size, local_work_size, dst);
54255510}
54265511
54275512static void ggml_cl_scale (ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -6091,91 +6176,6 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
60916176 backend_ctx->enqueue_ndrange_kernel (kernel, 3 , global_work_size, local_work_size, dst);
60926177}
60936178
6094- static void ggml_cl_glu (ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6095- GGML_ASSERT (src0);
6096- GGML_ASSERT (src0->extra );
6097- GGML_ASSERT (dst);
6098- GGML_ASSERT (dst->extra );
6099-
6100- GGML_ASSERT (ggml_is_contiguous_1 (src0));
6101-
6102- if (src1) {
6103- GGML_ASSERT (src1);
6104- GGML_ASSERT (src1->extra );
6105- GGML_ASSERT (ggml_are_same_shape (src0, src1));
6106- }
6107-
6108- ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context ;
6109-
6110- cl_kernel kernel;
6111- switch (ggml_get_glu_op (dst)) {
6112- case GGML_GLU_OP_GEGLU:
6113- if (dst->type == GGML_TYPE_F32) {
6114- kernel = backend_ctx->kernel_geglu ;
6115- } else {
6116- kernel = backend_ctx->kernel_geglu_f16 ;
6117- }
6118- break ;
6119- case GGML_GLU_OP_REGLU:
6120- if (dst->type == GGML_TYPE_F32) {
6121- kernel = backend_ctx->kernel_reglu ;
6122- } else {
6123- kernel = backend_ctx->kernel_reglu_f16 ;
6124- }
6125- break ;
6126- case GGML_GLU_OP_SWIGLU:
6127- if (dst->type == GGML_TYPE_F32) {
6128- kernel = backend_ctx->kernel_swiglu ;
6129- } else {
6130- kernel = backend_ctx->kernel_swiglu_f16 ;
6131- }
6132- break ;
6133- default :
6134- GGML_ABORT (" Unsupported glu op" );
6135- }
6136-
6137- ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra ;
6138- ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra ;
6139-
6140- ggml_tensor_extra_cl * extra1 = src1 ? (ggml_tensor_extra_cl *)src1->extra : nullptr ;
6141-
6142- cl_ulong offset0 = extra0->offset + src0->view_offs ;
6143- cl_ulong offsetd = extrad->offset + dst->view_offs ;
6144-
6145- cl_ulong offset1 = extra1 ? extra1->offset + src1->view_offs : offset0;
6146-
6147- const int ne0 = dst->ne [0 ];
6148-
6149- const cl_ulong nb01 = src0->nb [1 ];
6150- const cl_ulong nb11 = src1 ? src1->nb [1 ] : nb01;
6151-
6152- const cl_ulong nb1 = dst->nb [1 ];
6153-
6154- const int swp = ((const int32_t *) dst->op_params )[1 ];
6155- const int ne00_off = src1 ? 0 : (swp ? ne0 : 0 );
6156- const int ne10_off = src1 ? 0 : (swp ? 0 : ne0);
6157-
6158- CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &extra0->data_device ));
6159- CL_CHECK (clSetKernelArg (kernel, 1 , sizeof (cl_ulong), &offset0));
6160- CL_CHECK (clSetKernelArg (kernel, 2 , sizeof (cl_mem), src1 ? &extra1->data_device : &extra0->data_device ));
6161- CL_CHECK (clSetKernelArg (kernel, 3 , sizeof (cl_ulong), &offset1));
6162- CL_CHECK (clSetKernelArg (kernel, 4 , sizeof (cl_mem), &extrad->data_device ));
6163- CL_CHECK (clSetKernelArg (kernel, 5 , sizeof (cl_ulong), &offsetd));
6164- CL_CHECK (clSetKernelArg (kernel, 6 , sizeof (cl_ulong), &nb01));
6165- CL_CHECK (clSetKernelArg (kernel, 7 , sizeof (cl_ulong), &nb11));
6166- CL_CHECK (clSetKernelArg (kernel, 8 , sizeof (int ), &ne0));
6167- CL_CHECK (clSetKernelArg (kernel, 9 , sizeof (cl_ulong), &nb1));
6168- CL_CHECK (clSetKernelArg (kernel, 10 , sizeof (int ), &ne00_off));
6169- CL_CHECK (clSetKernelArg (kernel, 11 , sizeof (int ), &ne10_off));
6170-
6171- const size_t nrows = ggml_nrows (src0);
6172- size_t nth = 512 ;
6173- size_t global_work_size[] = {nrows*nth, 1 , 1 };
6174- size_t local_work_size[] = {nth, 1 , 1 };
6175-
6176- backend_ctx->enqueue_ndrange_kernel (kernel, 3 , global_work_size, local_work_size, dst);
6177- }
6178-
61796179// ------------------------------------------------------------------------------
61806180// Op offloading
61816181// ------------------------------------------------------------------------------
0 commit comments