ggml-cpu: move the NNPA flash attention disable from common params into ggml-cpu

taronaeo · taronaeo · commit 1edd6ed4dc80 · 2025-09-02T16:22:34.000+08:00
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
diff --git a/common/common.h b/common/common.h
@@ -312,13 +312,7 @@ struct common_params {
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type    attention_type    = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
-    #if defined(GGML_NNPA) || defined(__NNPA__)
-    // disable Flash Attention on NNPA
-    // see: https://github.com/ggml-org/llama.cpp/issues/15721
-    enum llama_flash_attn_type   flash_attn_type   = LLAMA_FLASH_ATTN_TYPE_DISABLED;
-    #else
     enum llama_flash_attn_type   flash_attn_type   = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention
-    #endif
 
     struct common_params_sampling    sampling;
     struct common_params_speculative speculative;
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -441,6 +441,15 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
         case GGML_OP_OUT_PROD:
             return (src0->type == GGML_TYPE_F32 || (ggml_is_quantized(src0->type) && src0->ne[2] == src1->ne[2] && src0->ne[3] == src1->ne[3])) &&
                 src1->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
+        case GGML_OP_FLASH_ATTN_EXT:
+        case GGML_OP_FLASH_ATTN_BACK:
+#if defined(GGML_NNPA) || defined(__NNPA__)
+            // disable Flash Attention on NNPA
+            // see: https://github.com/ggml-org/llama.cpp/issues/15721
+            return false;
+#else
+            return true;
+#endif  // GGML_NNPA || __NNPA__
         default:
             return true;
     }