opencl: fix small shape gemv, remove unused extensions

shawngu-quic · lhez · commit b0a765c086c1 · 2025-02-13T22:16:56.000-08:00
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -444,10 +444,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
         backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name);
 
         // Default wave size is 128, A8x uses 64.
-        if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A8X) {
+        if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A7X ||
+            backend_ctx->adreno_gen == ADRENO_GPU_GEN::A8X) {
             backend_ctx->adreno_wave_size = 64;
-        } else if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A7X ||
-                   backend_ctx->adreno_gen == ADRENO_GPU_GEN::X1E) {
+        } else if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::X1E) {
             backend_ctx->adreno_wave_size = 128;
         } else {
             backend_ctx->adreno_wave_size = 128;
@@ -3002,11 +3002,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
         }
 
         if (N == 1) {
-            local_work_size[0] = backend_ctx->adreno_wave_size; // localsize
+            size_t wavesize = backend_ctx->adreno_wave_size;
+            local_work_size[0] = wavesize; // localsize
             local_work_size[1] = 4; // reduce factor
             local_work_size[2] = 1;
 
-            global_work_size[0] = M / 2;
+            global_work_size[0] = (((M / 2) + wavesize - 1) / wavesize) * wavesize;
             global_work_size[1] = 4; // reduce factor
             global_work_size[2] = 1;
         }
diff --git a/ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle.cl b/ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle.cl
@@ -1,9 +1,5 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #pragma OPENCL EXTENSION cl_khr_subgroups : enable
-#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
-#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
-#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
-#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
 
 // assume
 #define QK4_0 32
@@ -187,7 +183,6 @@
     total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
 
 
-__attribute__((qcom_reqd_sub_group_size("full")))
 __kernel void kernel_gemv_noshuffle(
         __read_only  image1d_buffer_t src0_q,  // quantized A
         global half2  * src0_d,  // A scales
diff --git a/ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle_general.cl b/ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle_general.cl
@@ -1,9 +1,5 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #pragma OPENCL EXTENSION cl_khr_subgroups : enable
-#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
-#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
-#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
-#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
 
 // assume
 #define QK4_0 32
@@ -187,7 +183,6 @@
     total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
 
 
-__attribute__((qcom_reqd_sub_group_size("full")))
 __kernel void kernel_gemv_noshuffle(
         __read_only  image1d_buffer_t src0_q,  // quantized A
         global half2  * src0_d,  // A scales