Skip to content

Commit d55ea5e

Browse files
quic-sszotlhez
authored andcommitted
opencl: use wave size of 64 for all Adreno GPUs
1 parent 97151f4 commit d55ea5e

File tree

3 files changed

+20
-15
lines changed

3 files changed

+20
-15
lines changed

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -443,19 +443,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
443443
backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
444444
backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name);
445445

446-
// Default wave size is 128, A8x uses 64.
447-
if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A7X ||
448-
backend_ctx->adreno_gen == ADRENO_GPU_GEN::A8X) {
449-
backend_ctx->adreno_wave_size = 64;
450-
} else if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::X1E) {
451-
backend_ctx->adreno_wave_size = 128;
452-
} else {
453-
backend_ctx->adreno_wave_size = 128;
454-
GGML_LOG_WARN("ggml_opencl: Unsupported Adreno GPU: %s, "
455-
"using wave size %d, "
456-
"may not work as expected\n",
457-
backend_ctx->device_name.c_str(), backend_ctx->adreno_wave_size);
458-
}
446+
// Use wave size of 64 for all Adreno GPUs.
447+
backend_ctx->adreno_wave_size = 64;
459448
} else if (strstr(default_device->name, "Intel")) {
460449
backend_ctx->gpu_family = GPU_FAMILY::INTEL;
461450
} else {

ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle.cl

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
11
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
22
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
33

4+
#ifdef cl_qcom_reqd_sub_group_size
5+
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
6+
#define ADRENO_GPU 1
7+
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
8+
#endif
9+
410
// assume
511
#define QK4_0 32
612
#define N_SIMDGROUP 4
@@ -182,7 +188,9 @@
182188
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
183189
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
184190

185-
191+
#ifdef ADRENO_GPU
192+
REQD_SUBGROUP_SIZE_64
193+
#endif
186194
__kernel void kernel_gemv_noshuffle(
187195
__read_only image1d_buffer_t src0_q, // quantized A
188196
global half2 * src0_d, // A scales

ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle_general.cl

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
11
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
22
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
33

4+
#ifdef cl_qcom_reqd_sub_group_size
5+
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
6+
#define ADRENO_GPU 1
7+
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
8+
#endif
9+
410
// assume
511
#define QK4_0 32
612
#define N_SIMDGROUP 4
@@ -182,7 +188,9 @@
182188
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
183189
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
184190

185-
191+
#ifdef ADRENO_GPU
192+
REQD_SUBGROUP_SIZE_64
193+
#endif
186194
__kernel void kernel_gemv_noshuffle(
187195
__read_only image1d_buffer_t src0_q, // quantized A
188196
global half2 * src0_d, // A scales

0 commit comments

Comments
 (0)