Commit fde5231

ggml-cpu: undo fattn override for nnpa

Signed-off-by: Aaron Teo <[email protected]>

1 parent a59f362

File tree: 3 files changed, +1 −13 lines

common/common.h

Lines changed: 1 addition & 1 deletion
@@ -312,7 +312,7 @@ struct common_params {
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
-    enum llama_flash_attn_type flash_attn_type = ggml_cpu_support_fattn() ? LLAMA_FLASH_ATTN_TYPE_AUTO : LLAMA_FLASH_ATTN_TYPE_DISABLED; // whether to use Flash Attention
+    enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention

     struct common_params_sampling sampling;
     struct common_params_speculative speculative;

ggml/include/ggml-cpu.h

Lines changed: 0 additions & 2 deletions
@@ -105,8 +105,6 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
     GGML_BACKEND_API int ggml_cpu_has_llamafile (void);

-    GGML_BACKEND_API int ggml_cpu_support_fattn (void); // whether Flash Attention is supported
-
     // Internal types and functions exposed for tests and benchmarks

     typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 0 additions & 10 deletions
@@ -3524,16 +3524,6 @@ int ggml_cpu_has_sme(void) {
 #endif
 }

-int ggml_cpu_support_fattn(void) {
-#if defined(GGML_NNPA) || defined(__NNPA__)
-    // disable Flash Attention when using NNPA
-    // see: https://github.com/ggml-org/llama.cpp/issues/15721
-    return 0;
-#else
-    return 1;
-#endif
-}
-
 void ggml_cpu_init(void) {
     // needed to initialize ggml_time
     {
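
For context, the removed helper followed the same compile-time probe pattern as the surviving ggml_cpu_has_* functions: a preprocessor check on a feature macro collapses the function to a constant return. A generic sketch of that pattern, with a hypothetical feature name:

int ggml_cpu_has_some_feature(void) {
    // Hypothetical probe mirroring ggml_cpu_has_sme() and the removed
    // ggml_cpu_support_fattn(): the answer is fixed when the backend is
    // compiled, not discovered at runtime.
#if defined(__SOME_FEATURE__)
    return 1;
#else
    return 0;
#endif
}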
