File tree Expand file tree Collapse file tree 3 files changed +20
-15
lines changed Expand file tree Collapse file tree 3 files changed +20
-15
lines changed Original file line number Diff line number Diff line change @@ -443,19 +443,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
443443 backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
444444 backend_ctx->adreno_gen = get_adreno_gpu_gen (default_device->name );
445445
446- // Default wave size is 128, A8x uses 64.
447- if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A7X ||
448- backend_ctx->adreno_gen == ADRENO_GPU_GEN::A8X) {
449- backend_ctx->adreno_wave_size = 64 ;
450- } else if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::X1E) {
451- backend_ctx->adreno_wave_size = 128 ;
452- } else {
453- backend_ctx->adreno_wave_size = 128 ;
454- GGML_LOG_WARN (" ggml_opencl: Unsupported Adreno GPU: %s, "
455- " using wave size %d, "
456- " may not work as expected\n " ,
457- backend_ctx->device_name .c_str (), backend_ctx->adreno_wave_size );
458- }
446+ // Use wave size of 64 for all Adreno GPUs.
447+ backend_ctx->adreno_wave_size = 64 ;
459448 } else if (strstr (default_device->name , " Intel" )) {
460449 backend_ctx->gpu_family = GPU_FAMILY::INTEL;
461450 } else {
Original file line number Diff line number Diff line change 11#pragma OPENCL EXTENSION cl_khr_fp16 : enable
22#pragma OPENCL EXTENSION cl_khr_subgroups : enable
33
4+ #ifdef cl_qcom_reqd_sub_group_size
5+ #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
6+ #define ADRENO_GPU 1
7+ #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
8+ #endif
9+
410// assume
511#define QK4_0 32
612#define N_SIMDGROUP 4
182188 total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
183189 total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
184190
185-
191+ #ifdef ADRENO_GPU
192+ REQD_SUBGROUP_SIZE_64
193+ #endif
186194__kernel void kernel_gemv_noshuffle (
187195 __read_only image1d_buffer_t src0_q , // quantized A
188196 global half2 * src0_d , // A scales
Original file line number Diff line number Diff line change 11#pragma OPENCL EXTENSION cl_khr_fp16 : enable
22#pragma OPENCL EXTENSION cl_khr_subgroups : enable
33
4+ #ifdef cl_qcom_reqd_sub_group_size
5+ #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
6+ #define ADRENO_GPU 1
7+ #define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
8+ #endif
9+
410// assume
511#define QK4_0 32
612#define N_SIMDGROUP 4
182188 total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
183189 total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
184190
185-
191+ #ifdef ADRENO_GPU
192+ REQD_SUBGROUP_SIZE_64
193+ #endif
186194__kernel void kernel_gemv_noshuffle (
187195 __read_only image1d_buffer_t src0_q , // quantized A
188196 global half2 * src0_d , // A scales
You can’t perform that action at this time.
0 commit comments