Skip to content

Commit a06517e

Browse files
committed
opencl: specify subgroup size instead of querying it
1 parent a423f7c commit a06517e

File tree

2 files changed

+35
-7
lines changed

2 files changed

+35
-7
lines changed

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3358,8 +3358,8 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
33583358
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
33593359
cl_command_queue queue = backend_ctx->queue;
33603360

3361-
ggml_backend_opencl_device_context * dev_ctx =
3362-
(ggml_backend_opencl_device_context *)backend->device->context;
3361+
//ggml_backend_opencl_device_context * dev_ctx =
3362+
// (ggml_backend_opencl_device_context *)backend->device->context;
33633363

33643364
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
33653365
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3390,13 +3390,20 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
33903390

33913391
// Note, this kernel declares local memory in kernel args and the size
33923392
// depends on subgroup size.
3393-
// Retrieve subgroup size.
33943393
// Note, querying the subgroup size at runtime requires OpenCL 2.1 and above.
3394+
// For now we use a fixed subgroup size to simplify support for OpenCL 2.0.
33953395
size_t sgs;
3396-
CL_CHECK(clGetKernelSubGroupInfo(kernel, dev_ctx->device,
3397-
CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
3398-
sizeof(local_work_size), local_work_size,
3399-
sizeof(size_t), &sgs, NULL));
3396+
//CL_CHECK(clGetKernelSubGroupInfo(kernel, dev_ctx->device,
3397+
// CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
3398+
// sizeof(local_work_size), local_work_size,
3399+
// sizeof(size_t), &sgs, NULL));
3400+
if (backend_ctx->gpu_family == ADRENO) {
3401+
sgs = 64;
3402+
} else if (backend_ctx->gpu_family == INTEL) {
3403+
sgs = 32;
3404+
} else {
3405+
GGML_ASSERT(false && "Unsupported GPU");
3406+
}
34003407

34013408
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
34023409
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));

ggml/src/ggml-opencl/kernels/rms_norm.cl

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,31 @@
1515
#error "Selecting subgroup size is not supported on your device."
1616
#endif
1717

18+
#ifdef cl_intel_required_subgroup_size
19+
// Always use subgroup size of 32 on Intel.
20+
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
21+
#define INTEL_GPU 1
22+
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
23+
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
24+
#elif defined(cl_qcom_reqd_sub_group_size)
25+
// Always use subgroup size of 64 on Adreno.
26+
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
27+
#define ADRENO_GPU 1
28+
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
29+
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
30+
#else
31+
#error "Selecting subgroup size is not supported on your device."
32+
#endif
33+
1834
//------------------------------------------------------------------------------
1935
// rms_norm
2036
//------------------------------------------------------------------------------
2137
// This kernel depends on subgroup size.
38+
#ifdef INTEL_GPU
39+
REQD_SUBGROUP_SIZE_32
40+
#elif defined (ADRENO_GPU)
41+
REQD_SUBGROUP_SIZE_64
42+
#endif
2243
kernel void kernel_rms_norm(
2344
global void * src0,
2445
ulong offset0,

0 commit comments

Comments
 (0)