CUDA: fix Volta FlashAttention logic (ggml-org#11615)

Nexesenex · Nexesenex · commit 2661a0704f78 · 2025-02-04T05:09:15.000+01:00
Author: Johannes Gaessler
diff --git a/ggml/src/ggml-cuda/fattn-wmma-f16.cu b/ggml/src/ggml-cuda/fattn-wmma-f16.cu
@@ -561,7 +561,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_ten
                     ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst);
                     break;
                 // case 256:
-                //     ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst);
+                //     ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, float>(ctx, dst);
                 //     break;
                 default:
                     GGML_ABORT("fatal error");
diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
@@ -334,7 +334,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
         return;
     }
 
-    if (!new_mma_available(cc)) {
+    if (!fp16_mma_available(cc)) {
         if (prec == GGML_PREC_DEFAULT) {
             if (Q->ne[1] <= 8) {
                 ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
@@ -364,6 +364,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
     // The MMA implementation needs Turing or newer, use the old WMMA code for Volta:
     if (cc == GGML_CUDA_CC_VOLTA) {
         ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst);
+        return;
     }
 
     ggml_cuda_flash_attn_ext_mma_f16(ctx, dst);

Original file line number	Diff line number	Diff line change
`@@ -334,7 +334,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst`
`334`	`334`	`return;`
`335`	`335`	`}`
`336`	`336`
`337`		`- if (!new_mma_available(cc)) {`
	`337`	`+ if (!fp16_mma_available(cc)) {`
`338`	`338`	`if (prec == GGML_PREC_DEFAULT) {`
`339`	`339`	`if (Q->ne[1] <= 8) {`
`340`	`340`	`ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);`
`@@ -364,6 +364,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst`
`364`	`364`	`// The MMA implementation needs Turing or newer, use the old WMMA code for Volta:`
`365`	`365`	`if (cc == GGML_CUDA_CC_VOLTA) {`
`366`	`366`	`ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst);`
	`367`	`+ return;`
`367`	`368`	`}`
`368`	`369`
`369`	`370`	`ggml_cuda_flash_attn_ext_mma_f16(ctx, dst);`