@@ -304,6 +304,7 @@ struct ggml_backend_opencl_context {
     cl_program program_norm;
     cl_program program_relu;
     cl_program program_rms_norm;
+    cl_program program_group_norm;
     cl_program program_rope;
     cl_program program_scale;
     cl_program program_silu;
@@ -328,6 +329,7 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_clamp;
     cl_kernel kernel_norm;
     cl_kernel kernel_rms_norm;
+    cl_kernel kernel_group_norm;
     cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8;
     cl_kernel kernel_soft_max, kernel_soft_max_4;
     cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
@@ -1079,6 +1081,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }

+    // group_norm
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "group_norm.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("group_norm.cl");
+#endif
+        backend_ctx->program_group_norm =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_group_norm = clCreateKernel(backend_ctx->program_group_norm, "kernel_group_norm", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
     // Adreno kernels
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     // transpose
@@ -1970,6 +1988,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
         case GGML_OP_NORM:
         case GGML_OP_RMS_NORM:
             return true;
+        case GGML_OP_GROUP_NORM:
+            return ggml_is_contiguous(op->src[0]);
         case GGML_OP_MUL_MAT:
             if (op->src[0]->type == GGML_TYPE_F16) {
                 return true;
@@ -4029,6 +4049,65 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
 #endif
 }

+static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    UNUSED(src1);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
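+    // n_groups comes from op_params[0] and eps from op_params[1]; each group
+    // covers whole ne0*ne1 planes of src0, so group_size is
+    // ne0*ne1*ceil(ne2/n_groups) contiguous elements.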
+    int32_t n_groups = ((const int32_t *) dst->op_params)[0];
+    int32_t group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + n_groups - 1) / n_groups);
+    float eps = ((const float *) dst->op_params)[1];
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne = ne00*ne01*ne02;
+
+    cl_kernel kernel = backend_ctx->kernel_group_norm;
+
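+    // sgs is the work-group size, presumably matched to the device's
+    // subgroup/wave width: 64 on Adreno, 32 on Intel. One work-group
+    // handles one group.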
+    size_t sgs = 64;
+    if (backend_ctx->gpu_family == ADRENO) {
+        sgs = 64;
+    } else if (backend_ctx->gpu_family == INTEL) {
+        sgs = 32;
+    } else {
+        GGML_ASSERT(false && "Unsupported GPU");
+    }
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),      &ne));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int),      &group_size));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float),    &eps));
+
+    size_t global_work_size[] = {(size_t)n_groups*sgs, 1, 1};
+    size_t local_work_size[] = {(size_t)sgs, 1, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
 static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
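Editorial note, not part of the diff: the device-side kernel lives in group_norm.cl, which these hunks do not show. Purely as a sketch of what the host code above expects (one work-group of sgs work-items per group, arguments in the order set via clSetKernelArg, byte offsets applied on the device), a simple work-group-reduction version could look like the following. The local-memory reduction and the fixed tmp size are assumptions; the actual kernel in this PR may use subgroup reductions instead.

// Sketch only: one work-group normalizes one group of `group_size` elements.
// Assumes a power-of-two local size of at most 64 (matches sgs above).
kernel void kernel_group_norm(
        global const float * src0,
        ulong                offset0,
        global       float * dst,
        ulong                offsetd,
        int                  ne,
        int                  group_size,
        float                eps
) {
    src0 = (global const float *)((global const char *)src0 + offset0);
    dst  = (global       float *)((global       char *)dst  + offsetd);

    const int start = get_group_id(0) * group_size;
    const int end   = min(start + group_size, ne);
    const int lid   = get_local_id(0);
    const int lsize = get_local_size(0);

    // The whole work-group exits together, so the barriers below stay uniform.
    if (start >= end) {
        return;
    }

    local float tmp[64];

    // Pass 1: group mean via local-memory tree reduction.
    float sum = 0.0f;
    for (int i = start + lid; i < end; i += lsize) {
        sum += src0[i];
    }
    tmp[lid] = sum;
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int s = lsize / 2; s > 0; s >>= 1) {
        if (lid < s) {
            tmp[lid] += tmp[lid + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    const float mean = tmp[0] / (end - start);
    barrier(CLK_LOCAL_MEM_FENCE);

    // Pass 2: group variance, writing centered values to dst.
    sum = 0.0f;
    for (int i = start + lid; i < end; i += lsize) {
        const float v = src0[i] - mean;
        dst[i] = v;
        sum += v * v;
    }
    tmp[lid] = sum;
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int s = lsize / 2; s > 0; s >>= 1) {
        if (lid < s) {
            tmp[lid] += tmp[lid + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    const float variance = tmp[0] / (end - start);
    const float scale    = rsqrt(variance + eps);

    // Pass 3: scale the centered values in place.
    for (int i = start + lid; i < end; i += lsize) {
        dst[i] *= scale;
    }
}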
@@ -5609,6 +5688,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
             }
             func = ggml_cl_rms_norm;
             break;
+        case GGML_OP_GROUP_NORM:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_group_norm;
+            break;
         case GGML_OP_MUL_MAT:
             if (!any_on_device && !ggml_cl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
                 return false;
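For reference (editorial addition, not in the diff): on the graph side, a GROUP_NORM node stores n_groups in op_params[0] and eps in op_params[1], which is exactly what ggml_cl_group_norm reads. A minimal sketch of creating such a node, assuming the ggml_group_norm(ctx, a, n_groups, eps) signature from ggml.h; the parameter values are placeholders:

#include "ggml.h"

// Hedged sketch: builds a GROUP_NORM node that the OpenCL backend can now execute.
static struct ggml_tensor * build_group_norm(struct ggml_context * ctx, struct ggml_tensor * inp) {
    const int   n_groups = 32;     // stored in op_params[0]
    const float eps      = 1e-6f;  // stored in op_params[1]
    return ggml_group_norm(ctx, inp, n_groups, eps);
}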