Try to fix some fattn (flash attention) kernel-selection inconsistencies

LostRuins · LostRuins · commit d7c2f27749d3 · 2025-11-27T01:55:26.000+08:00
diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
@@ -306,7 +306,10 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
 
         //kcpp: use wmma to fix cu11 incoherence
         if (ggml_cuda_should_use_wmma_fattn(cc) && (ggml_cuda_highest_compiled_arch(cc) <= GGML_CUDA_CC_TURING || cc == GGML_CUDA_CC_TURING)) {
-            return BEST_FATTN_KERNEL_WMMA_F16;
+            if(Q->ne[0] != 40 && Q->ne[0] != 72 && Q->ne[0] != 576) //kcpp: these sizes not supported in wmma
+            {
+                return BEST_FATTN_KERNEL_WMMA_F16;
+            }
         }
 
         return BEST_FATTN_KERNEL_MMA_F16;
@@ -330,7 +333,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
         }
     }
     //kcpp: patch from previous version for my sanity. it worked before, idk it should work now.
-    if (Q->ne[1] <= 8 || Q->ne[0] == 256) {
+    if ((Q->ne[1] <= 8 || Q->ne[0] == 256) && can_use_vector_kernel) {
         return BEST_FATTN_KERNEL_VEC;
     }
 

Original file line number	Diff line number	Diff line change
`@@ -306,7 +306,10 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const`
`306`	`306`
`307`	`307`	`//kcpp: use wmma to fix cu11 incoherence`
`308`	`308`	`if (ggml_cuda_should_use_wmma_fattn(cc) && (ggml_cuda_highest_compiled_arch(cc) <= GGML_CUDA_CC_TURING \|\| cc == GGML_CUDA_CC_TURING)) {`
`309`		`- return BEST_FATTN_KERNEL_WMMA_F16;`
	`309`	`+ if(Q->ne[0] != 40 && Q->ne[0] != 72 && Q->ne[0] != 576) //kcpp: these sizes not supported in wmma`
	`310`	`+ {`
	`311`	`+ return BEST_FATTN_KERNEL_WMMA_F16;`
	`312`	`+ }`
`310`	`313`	`}`
`311`	`314`
`312`	`315`	`return BEST_FATTN_KERNEL_MMA_F16;`
`@@ -330,7 +333,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const`
`330`	`333`	`}`
`331`	`334`	`}`
`332`	`335`	`//kcpp: patch from previous version for my sanity. it worked before, idk it should work now.`
`333`		`- if (Q->ne[1] <= 8 \|\| Q->ne[0] == 256) {`
	`336`	`+ if ((Q->ne[1] <= 8 \|\| Q->ne[0] == 256) && can_use_vector_kernel) {`
`334`	`337`	`return BEST_FATTN_KERNEL_VEC;`
`335`	`338`	`}`
`336`	`339`