Commit 14c870d

ggml-cpu: stabilise nnpa fp32<->fp16
Signed-off-by: Aaron Teo <[email protected]>
1 parent 5d804a4

2 files changed: +10 −1 lines changed


common/common.h

Lines changed: 6 additions & 0 deletions
@@ -312,7 +312,13 @@ struct common_params {
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
+#if defined(GGML_NNPA) || defined(__NNPA__)
+    // disable Flash Attention on NNPA
+    // see: https://github.com/ggml-org/llama.cpp/issues/15721
+    enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
+#else
     enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention
+#endif
 
     struct common_params_sampling sampling;
     struct common_params_speculative speculative;
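The hunk above flips the compile-time default for flash_attn_type: builds with GGML_NNPA or __NNPA__ defined start with Flash Attention disabled, all other builds keep the AUTO default. Below is a minimal standalone sketch of that pattern in plain C. The enum values, struct name and helper (FLASH_ATTN_DISABLED, common_params_sketch, default_params) are simplified stand-ins for illustration, not the llama.cpp headers.

/*
 * Sketch only: shows how a #if guard selects a different default value
 * for the same field depending on whether the NNPA macros are defined.
 */
#include <stdio.h>

enum flash_attn_type { FLASH_ATTN_DISABLED = 0, FLASH_ATTN_AUTO = 1 };

struct common_params_sketch {
    enum flash_attn_type flash_attn_type;
};

static struct common_params_sketch default_params(void) {
    struct common_params_sketch p;
#if defined(GGML_NNPA) || defined(__NNPA__)
    p.flash_attn_type = FLASH_ATTN_DISABLED;  /* NNPA builds: off by default */
#else
    p.flash_attn_type = FLASH_ATTN_AUTO;      /* everything else: auto       */
#endif
    return p;
}

int main(void) {
    struct common_params_sketch p = default_params();
    printf("flash_attn default: %s\n",
           p.flash_attn_type == FLASH_ATTN_AUTO ? "auto" : "disabled");
    return 0;
}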

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 4 additions & 1 deletion
@@ -3219,7 +3219,10 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
         float32x4_t v_zero = vec_splats(0.0f);
         uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0);
         uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
-        vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
+        y[i + 0] = vec_extract(v_y, 0);
+        y[i + 1] = vec_extract(v_y, 1);
+        y[i + 2] = vec_extract(v_y, 2);
+        y[i + 3] = vec_extract(v_y, 3);
     }
 #endif
     for (; i < n; ++i) {
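The change replaces the full-register vec_xst store with four per-lane vec_extract stores: the loop converts 4 floats per iteration, but the fp16 result register holds 8 lanes, so storing the whole register writes 8 bytes past the converted values. Below is a portable plain-C sketch of that store pattern with no NNPA intrinsics; the helpers fp32_to_fp16_bits and fp32_to_fp16_row are hypothetical, and the fp32->fp16 conversion is a crude truncating stand-in rather than ggml's NNPA rounding.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

typedef uint16_t fp16_bits;

/* crude fp32 -> fp16 bit conversion (round-toward-zero, no denormal/NaN care) */
static fp16_bits fp32_to_fp16_bits(float f) {
    uint32_t u;
    memcpy(&u, &f, sizeof(u));
    uint32_t sign = (u >> 16) & 0x8000u;
    int32_t  exp  = (int32_t)((u >> 23) & 0xffu) - 127 + 15;
    uint32_t mant = (u >> 13) & 0x3ffu;
    if (exp <= 0)  return (fp16_bits)sign;              /* flush to zero */
    if (exp >= 31) return (fp16_bits)(sign | 0x7c00u);  /* clamp to inf  */
    return (fp16_bits)(sign | ((uint32_t)exp << 10) | mant);
}

static void fp32_to_fp16_row(const float *x, fp16_bits *y, int64_t n) {
    int64_t i = 0;
    for (; i + 3 < n; i += 4) {
        fp16_bits lanes[8] = {0};   /* stands in for the 8-lane uint16x8_t v_y */
        for (int k = 0; k < 4; ++k) {
            lanes[k] = fp32_to_fp16_bits(x[i + k]);
        }
        /* store only the 4 valid lanes, mirroring the vec_extract() stores,
         * instead of dumping all 16 bytes of the register into y + i */
        y[i + 0] = lanes[0];
        y[i + 1] = lanes[1];
        y[i + 2] = lanes[2];
        y[i + 3] = lanes[3];
    }
    for (; i < n; ++i) {            /* scalar tail, as in the ggml loop */
        y[i] = fp32_to_fp16_bits(x[i]);
    }
}

int main(void) {
    float x[6] = { 0.0f, 1.0f, -2.0f, 0.5f, 3.25f, -0.25f };
    fp16_bits y[6];
    fp32_to_fp16_row(x, y, 6);
    for (int i = 0; i < 6; ++i) printf("0x%04x ", y[i]);
    printf("\n");
    return 0;
}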
