Skip to content

Commit 74be5d2

Browse files
committed
workgroup tuning for Hamoa
1 parent 81542bb commit 74be5d2

File tree

2 files changed

+9
-7
lines changed

2 files changed

+9
-7
lines changed

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -7754,8 +7754,8 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
77547754
if (true) { // condition todo
77557755
cl_int status;
77567756

7757-
size_t local_size[3] = {64, 4, 1};
7758-
size_t global_size[3] = {64, 4, 1};
7757+
size_t local_size[3] = {64, 2, 1};
7758+
size_t global_size[3] = {64, 2, 1};
77597759

77607760
cl_mem src1_sub_buffer, buf_src1_image, buf_src2;
77617761

@@ -7771,7 +7771,9 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
77717771

77727772
// set thread grid
77737773
global_size[0] = static_cast<size_t>(ne01);
7774+
global_size[1] = 4;
77747775
global_size[2] = static_cast<size_t>(ne20);
7776+
local_size[1] = 4;
77757777
} else { // for gemm
77767778
kernel = backend_ctx->kernel_gemm_moe_mxfp4_f32;
77777779

ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
44

55
#define QK_MXFP4 32
6-
#define N_SIMDGROUP 4
6+
#define N_SIMDGROUP 2
77
#define SIMDGROUP_WIDTH 64
88
#define TILE_SIZE 320
99

@@ -142,12 +142,12 @@ __kernel void kernel_gemm_moe_mxfp4_f32(
142142
// reduction in local memory, assumes #subgroups=2 (must match N_SIMDGROUP)
143143
__local float reduceLM[SIMDGROUP_WIDTH * (N_SIMDGROUP - 1)];
144144
if (sgid == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = sum;
145-
if (sgid == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = sum;
146-
if (sgid == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = sum;
145+
// if (sgid == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = sum;
146+
// if (sgid == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = sum;
147147
barrier(CLK_LOCAL_MEM_FENCE);
148148
if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
149-
if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
150-
if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
149+
// if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
150+
// if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
151151

152152
// 1 outputs per thread in subgroup 0
153153
if (sgid == 0) {

0 commit comments

Comments
 (0)