Commit fde5231

ggml-cpu: undo fattn override for nnpa

Signed-off-by: Aaron Teo <[email protected]>

1 parent a59f362

File tree: 3 files changed, +1 −13 lines

common/common.h

Lines changed: 1 addition & 1 deletion
@@ -312,7 +312,7 @@ struct common_params {
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
-    enum llama_flash_attn_type flash_attn_type = ggml_cpu_support_fattn() ? LLAMA_FLASH_ATTN_TYPE_AUTO : LLAMA_FLASH_ATTN_TYPE_DISABLED; // whether to use Flash Attention
+    enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention

     struct common_params_sampling sampling;
     struct common_params_speculative speculative;

ggml/include/ggml-cpu.h

Lines changed: 0 additions & 2 deletions
@@ -105,8 +105,6 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
     GGML_BACKEND_API int ggml_cpu_has_llamafile (void);

-    GGML_BACKEND_API int ggml_cpu_support_fattn (void); // whether Flash Attention is supported
-
     // Internal types and functions exposed for tests and benchmarks

     typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 0 additions & 10 deletions
@@ -3524,16 +3524,6 @@ int ggml_cpu_has_sme(void) {
 #endif
 }

-int ggml_cpu_support_fattn(void) {
-#if defined(GGML_NNPA) || defined(__NNPA__)
-    // disable Flash Attention when using NNPA
-    // see: https://github.com/ggml-org/llama.cpp/issues/15721
-    return 0;
-#else
-    return 1;
-#endif
-}
-
 void ggml_cpu_init(void) {
     // needed to initialize ggml_time
     {
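
For context, the removed helper followed the same compile-time probe pattern as the surviving ggml_cpu_has_* functions: a preprocessor check on a feature macro collapses the function to a constant return. A generic sketch of that pattern, with a hypothetical feature name:

int ggml_cpu_has_some_feature(void) {
    // Hypothetical probe mirroring ggml_cpu_has_sme() and the removed
    // ggml_cpu_support_fattn(): the answer is fixed when the backend is
    // compiled, not discovered at runtime.
#if defined(__SOME_FEATURE__)
    return 1;
#else
    return 0;
#endif
}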
