Skip to content

Commit b0a765c

Browse files
shawngu-quiclhez
authored andcommitted
opencl: fix small shape gemv, remove unused extensions
1 parent 04045bb commit b0a765c

File tree

3 files changed

+6
-15
lines changed

3 files changed

+6
-15
lines changed

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -444,10 +444,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
444444
backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name);
445445

446446
// Default wave size is 128, A8x uses 64.
447-
if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A8X) {
447+
if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A7X ||
448+
backend_ctx->adreno_gen == ADRENO_GPU_GEN::A8X) {
448449
backend_ctx->adreno_wave_size = 64;
449-
} else if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A7X ||
450-
backend_ctx->adreno_gen == ADRENO_GPU_GEN::X1E) {
450+
} else if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::X1E) {
451451
backend_ctx->adreno_wave_size = 128;
452452
} else {
453453
backend_ctx->adreno_wave_size = 128;
@@ -3002,11 +3002,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
30023002
}
30033003

30043004
if (N == 1) {
3005-
local_work_size[0] = backend_ctx->adreno_wave_size; // localsize
3005+
size_t wavesize = backend_ctx->adreno_wave_size;
3006+
local_work_size[0] = wavesize; // localsize
30063007
local_work_size[1] = 4; // reduce factor
30073008
local_work_size[2] = 1;
30083009

3009-
global_work_size[0] = M / 2;
3010+
global_work_size[0] = (((M / 2) + wavesize - 1) / wavesize) * wavesize;
30103011
global_work_size[1] = 4; // reduce factor
30113012
global_work_size[2] = 1;
30123013
}

ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle.cl

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,5 @@
11
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
22
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
3-
#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
4-
#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
5-
#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
6-
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
73

84
// assume
95
#define QK4_0 32
@@ -187,7 +183,6 @@
187183
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
188184

189185

190-
__attribute__((qcom_reqd_sub_group_size("full")))
191186
__kernel void kernel_gemv_noshuffle(
192187
__read_only image1d_buffer_t src0_q, // quantized A
193188
global half2 * src0_d, // A scales

ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle_general.cl

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,5 @@
11
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
22
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
3-
#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
4-
#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
5-
#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
6-
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
73

84
// assume
95
#define QK4_0 32
@@ -187,7 +183,6 @@
187183
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
188184

189185

190-
__attribute__((qcom_reqd_sub_group_size("full")))
191186
__kernel void kernel_gemv_noshuffle(
192187
__read_only image1d_buffer_t src0_q, // quantized A
193188
global half2 * src0_d, // A scales

0 commit comments

Comments
 (0)