@@ -3358,8 +3358,8 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
     cl_command_queue queue = backend_ctx->queue;

-    ggml_backend_opencl_device_context * dev_ctx =
-        (ggml_backend_opencl_device_context *)backend->device->context;
+    // ggml_backend_opencl_device_context * dev_ctx =
+    //     (ggml_backend_opencl_device_context *)backend->device->context;

     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3390,13 +3390,20 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c

     // Note, this kernel declares local memory in kernel args and the size
     // depends on subgroup size.
-    // Retrieve subgroup size.
     // Note, this requires OpenCL 2.1 and above
+    // For now we use fixed subgroup size to simplify support for OpenCL 2.0.
     size_t sgs;
-    CL_CHECK(clGetKernelSubGroupInfo(kernel, dev_ctx->device,
-        CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
-        sizeof(local_work_size), local_work_size,
-        sizeof(size_t), &sgs, NULL));
+    // CL_CHECK(clGetKernelSubGroupInfo(kernel, dev_ctx->device,
+    //     CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
+    //     sizeof(local_work_size), local_work_size,
+    //     sizeof(size_t), &sgs, NULL));
+    if (backend_ctx->gpu_family == ADRENO) {
+        sgs = 64;
+    } else if (backend_ctx->gpu_family == INTEL) {
+        sgs = 32;
+    } else {
+        GGML_ASSERT(false && "Unsupported GPU");
+    }

     CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
     CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
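For context, a minimal sketch (not part of the commit) of how the two paths could be combined: query the driver via clGetKernelSubGroupInfo when OpenCL 2.1 is available, and otherwise fall back to the fixed per-vendor values the diff hard-codes. The enum and the helper name pick_subgroup_size are hypothetical stand-ins for the backend's own gpu_family handling; the OpenCL calls themselves are standard API.

    #include <CL/cl.h>
    #include <assert.h>
    #include <stddef.h>

    enum gpu_family_t { ADRENO, INTEL, UNKNOWN };

    static size_t pick_subgroup_size(cl_kernel kernel, cl_device_id device,
                                     enum gpu_family_t family,
                                     const size_t * local_work_size) {
    #ifdef CL_VERSION_2_1
        // OpenCL 2.1+: ask the driver for the maximum subgroup size this
        // kernel will use for the given (3-dimensional) local work size.
        size_t sgs = 0;
        cl_int err = clGetKernelSubGroupInfo(kernel, device,
            CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
            3 * sizeof(size_t), local_work_size,
            sizeof(size_t), &sgs, NULL);
        if (err == CL_SUCCESS && sgs > 0) {
            return sgs;
        }
    #else
        (void) kernel; (void) device; (void) local_work_size;
    #endif
        // OpenCL 2.0 fallback: fixed, vendor-specific subgroup sizes,
        // matching the values chosen in the diff above.
        switch (family) {
            case ADRENO: return 64;
            case INTEL:  return 32;
            default:     assert(0 && "Unsupported GPU"); return 0;
        }
    }

The value matters because, as the comment at the top of the hunk notes, the kernel declares its local memory in kernel args: the backend then sizes that allocation from sgs by calling clSetKernelArg with the computed byte size and a NULL pointer, so an sgs that disagrees with the device's actual subgroup size would over- or under-allocate local memory.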