@@ -3358,8 +3358,8 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
33583358 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context ;
33593359 cl_command_queue queue = backend_ctx->queue ;
33603360
3361- ggml_backend_opencl_device_context * dev_ctx =
3362- (ggml_backend_opencl_device_context *)backend->device ->context ;
3361+ // ggml_backend_opencl_device_context * dev_ctx =
3362+ // (ggml_backend_opencl_device_context *)backend->device->context;
33633363
33643364 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra ;
33653365 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra ;
@@ -3390,13 +3390,20 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
33903390
33913391 // Note, this kernel declares local memory in kernel args and the size
33923392 // depends on subgroup size.
3393- // Retrieve subgroup size.
33943393 // Note, querying the subgroup size at runtime (clGetKernelSubGroupInfo) requires OpenCL 2.1 and above
3394+ // For now we use fixed subgroup size to simplify support for OpenCL 2.0.
33953395 size_t sgs;
3396- CL_CHECK (clGetKernelSubGroupInfo (kernel, dev_ctx->device ,
3397- CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
3398- sizeof (local_work_size), local_work_size,
3399- sizeof (size_t ), &sgs, NULL ));
3396+ // CL_CHECK(clGetKernelSubGroupInfo(kernel, dev_ctx->device,
3397+ // CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
3398+ // sizeof(local_work_size), local_work_size,
3399+ // sizeof(size_t), &sgs, NULL));
3400+ if (backend_ctx->gpu_family == ADRENO) {
3401+ sgs = 64 ;
3402+ } else if (backend_ctx->gpu_family == INTEL) {
3403+ sgs = 32 ;
3404+ } else {
3405+ GGML_ASSERT (false && " Unsupported GPU" );
3406+ }
34003407
34013408 CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &extra0->data_device ));
34023409 CL_CHECK (clSetKernelArg (kernel, 1 , sizeof (cl_ulong), &offset0));
0 commit comments