@@ -4563,13 +4563,6 @@ static void ggml_opencl_op_rms_norm_fused(ggml_backend_t backend, ggml_tensor *
4563
4563
4564
4564
GGML_ASSERT (ne00 % 4 == 0 );
4565
4565
4566
- const int nth = MIN (64 , ne00);
4567
-
4568
- size_t global_work_size[] = {(size_t )ne01*nth, (size_t )ne02, (size_t )ne03};
4569
- size_t local_work_size[] = {(size_t )nth, 1 , 1 };
4570
-
4571
- cl_kernel kernel = backend_ctx->kernel_rms_norm_mul ;
4572
-
4573
4566
size_t sgs;
4574
4567
if (backend_ctx->gpu_family == ADRENO) {
4575
4568
sgs = 64 ;
@@ -4579,6 +4572,19 @@ static void ggml_opencl_op_rms_norm_fused(ggml_backend_t backend, ggml_tensor *
4579
4572
GGML_ASSERT (false && " Unsupported GPU" );
4580
4573
}
4581
4574
4575
+ cl_kernel kernel = backend_ctx->kernel_rms_norm_mul ;
4576
+
4577
+ int nth = sgs;
4578
+ int max_workgroup_size = backend_ctx->get_kernel_workgroup_size (kernel);
4579
+ while (nth < ne00 && nth < max_workgroup_size) {
4580
+ nth *= 2 ;
4581
+ }
4582
+ nth = MIN (nth, max_workgroup_size);
4583
+ nth = MIN (nth, ne00);
4584
+
4585
+ size_t global_work_size[] = {(size_t )ne01*nth, (size_t )ne02, (size_t )ne03};
4586
+ size_t local_work_size[] = {(size_t )nth, 1 , 1 };
4587
+
4582
4588
CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &extra0->data_device ));
4583
4589
CL_CHECK (clSetKernelArg (kernel, 1 , sizeof (cl_ulong), &offset0));
4584
4590
CL_CHECK (clSetKernelArg (kernel, 2 , sizeof (cl_mem), &extra1->data_device ));
0 commit comments