Commit 0c7f7f3

Review: use warp_size, fix should_use_mmf condition
1 parent 99acee9

File tree: 3 files changed, +13 -8 lines

ggml/src/ggml-cuda/ggml-cuda.cu
Lines changed: 3 additions & 3 deletions

@@ -2030,15 +2030,15 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
             const int cc = ggml_cuda_info().devices[id].cc;
             const int warp_size = ggml_cuda_info().devices[id].warp_size;
             use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
-            use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne);
+            use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1]);
             use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]);
             any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
         }
     } else {
         const int cc = ggml_cuda_info().devices[ctx.device].cc;
         const int warp_size = ggml_cuda_info().devices[ctx.device].warp_size;
         use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
-        use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne);
+        use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1]);
         use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]);
         any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
     }

@@ -2110,7 +2110,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
         return;
     }

-    if (!ggml_is_quantized(src0->type) && ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src1->ne, ids)) {
+    if (ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src1->ne[2], ids)) {
         ggml_cuda_mul_mat_f(ctx, src0, src1, ids, dst);
         return;
     }
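An aside on the index choice above (this note and the helper below are illustrative, not part of the commit): ggml stores per-dimension extents in ne[], and the plain mul_mat path keeps src1's column count in ne[1], while mul_mat_id batches tokens per expert, so its token count sits in src1->ne[2]. A minimal sketch of the selection, under that assumed layout:

// Hypothetical helper for illustration only -- the commit inlines the
// index at each call site rather than adding a function like this.
static int64_t mmf_src1_ncols(const ggml_tensor * src1, bool has_ids) {
    // mul_mat:    src1 = [k, n_cols, ...]                  -> columns in ne[1]
    // mul_mat_id: src1 = [k, n_expert_used, n_tokens, ...] -> tokens in ne[2]
    return has_ids ? src1->ne[2] : src1->ne[1];
}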

ggml/src/ggml-cuda/mmf.cu
Lines changed: 9 additions & 4 deletions

@@ -32,7 +32,7 @@ static __global__ void mul_mat_f(

    if (ids) {
        int match = 0;
-       for(int j0 = 0; j0 < cols_per_block; j0 += warpSize) {
+       for(int j0 = 0; j0 < cols_per_block; j0 += warp_size) {
            const int j = j0 + threadIdx.x;
            if(j < cols_per_block) {
                match = ids[j*stride_row_id + channel_dst*stride_col_id] == expert_idx;

@@ -451,18 +451,23 @@ void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * sr
    }
}

-bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * src0_ne, const int64_t * src1_ne, const ggml_tensor * ids) {
+bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * src0_ne, const int src1_ncols, const ggml_tensor * ids) {
+
+    if (ggml_is_quantized(type)) {
+        return false;
+    }
+
     if (src0_ne[0] % (warp_size * (4/ggml_type_size(type))) != 0) {
         return false;
     }
     if (src0_ne[1] % MMF_ROWS_PER_BLOCK != 0) {
         return false;
     }
-    if (!ids && src1_ne[1] > 16) {
+    if (!ids && src1_ncols > 16) {
         return false;
     }

-    if (ids && src1_ne[2] > 16) {
+    if (ids && src1_ncols > 16) {
         return false;
     }
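Pieced together from the hunk above, the updated gate reads as follows (reassembled for readability; the comments are added here, not present in the source, and the tail of the function lies outside the hunk):

bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * src0_ne, const int src1_ncols, const ggml_tensor * ids) {

    // New early-out: MMF handles only non-quantized types, so the call site
    // in ggml_cuda_mul_mat_id no longer needs its own ggml_is_quantized() check.
    if (ggml_is_quantized(type)) {
        return false;
    }

    // The row length must be a multiple of the per-warp 4-byte load width.
    if (src0_ne[0] % (warp_size * (4/ggml_type_size(type))) != 0) {
        return false;
    }
    // The row count must tile evenly into MMF row blocks.
    if (src0_ne[1] % MMF_ROWS_PER_BLOCK != 0) {
        return false;
    }
    // With a scalar column count, the 16-column cap is uniform; callers pick
    // ne[1] (mul_mat) or ne[2] (mul_mat_id) before calling.
    if (!ids && src1_ncols > 16) {
        return false;
    }

    if (ids && src1_ncols > 16) {
        return false;
    }
    // ... (remainder unchanged, outside this hunk)
}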
ggml/src/ggml-cuda/mmf.cuh
Lines changed: 1 addition & 1 deletion

@@ -2,4 +2,4 @@

 void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);

-bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * scr0_ne, const int64_t * src1_ne, const ggml_tensor * ids = nullptr);
+bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * scr0_ne, const int src1_ncols, const ggml_tensor * ids = nullptr);
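A usage sketch (hypothetical calls mirroring the sites patched above, assuming src0, src1, ids, cc, and warp_size are in scope): the defaulted ids parameter lets the plain mul_mat path omit the expert-id tensor entirely.

// Plain mul_mat: ids defaults to nullptr, columns come from src1->ne[1].
const bool use_mmf    = ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1]);
// mul_mat_id: pass the ids tensor and the token count from src1->ne[2].
const bool use_mmf_id = ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src1->ne[2], ids);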
