2 changes: 1 addition & 1 deletion common/common.h
@@ -312,7 +312,7 @@ struct common_params {
enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention
enum llama_flash_attn_type flash_attn_type = ggml_cpu_support_fattn() ? LLAMA_FLASH_ATTN_TYPE_AUTO : LLAMA_FLASH_ATTN_TYPE_DISABLED; // whether to use Flash Attention

struct common_params_sampling sampling;
struct common_params_speculative speculative;
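The net effect of this one-line change: the flash-attention default is no longer unconditionally `AUTO` but is gated on whether the CPU backend can use it. A self-contained sketch of the selection logic, using a stand-in enum and probe rather than the real `llama.h`/`ggml-cpu.h` definitions (values here are placeholders):

```c
#include <stdio.h>

// Stand-in enum; the real llama_flash_attn_type lives in llama.h and its
// values may differ.
enum llama_flash_attn_type {
    LLAMA_FLASH_ATTN_TYPE_DISABLED,
    LLAMA_FLASH_ATTN_TYPE_AUTO,
};

// Stand-in for the probe this PR adds to ggml-cpu.h: 0 when the CPU
// backend was compiled with NNPA (where flash attention currently
// misbehaves), 1 otherwise.
static int ggml_cpu_support_fattn(void) {
#if defined(GGML_NNPA) || defined(__NNPA__)
    return 0;
#else
    return 1;
#endif
}

int main(void) {
    // Mirrors the new initializer in common_params.
    enum llama_flash_attn_type flash_attn_type =
        ggml_cpu_support_fattn() ? LLAMA_FLASH_ATTN_TYPE_AUTO
                                 : LLAMA_FLASH_ATTN_TYPE_DISABLED;
    printf("default flash_attn_type = %d\n", (int) flash_attn_type);
    return 0;
}
```

On a build without NNPA this selects the `AUTO` value; on an NNPA build it selects `DISABLED`, which is exactly the behavior change the header diff encodes.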
12 changes: 4 additions & 8 deletions docs/build-s390x.md
@@ -42,14 +42,14 @@ cmake --build build --config Release -j $(nproc)
cmake --build build --config Release -j $(nproc)
```

- By default, NNPA is disabled by default. To enable it:
- By default, NNPA is enabled when available. To disable it (not recommended):

```bash
cmake -S . -B build \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_BLAS=ON \
-DGGML_BLAS_VENDOR=OpenBLAS \
-DGGML_NNPA=ON
-DGGML_NNPA=OFF

cmake --build build --config Release -j $(nproc)
```
@@ -166,7 +166,7 @@ Only available in IBM z15/LinuxONE 3 or later system with the `-DGGML_VXE=ON` (t

### 2. NNPA Vector Intrinsics Acceleration

Only available in IBM z16/LinuxONE 4 or later system with the `-DGGML_NNPA=ON` (turned off by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs can still run but will use a scalar implementation.
Only available in IBM z16/LinuxONE 4 or later system with the `-DGGML_NNPA=ON` (turned on when available) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs can still run but will use a scalar implementation.
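Since the flag is now on by default, it can help to confirm that the host actually exposes NNPA before relying on it. A minimal, hedged sketch that scans `/proc/cpuinfo`, assuming a recent s390x kernel that lists `nnpa` in its `features` line (adjust if your kernel reports facilities differently):

```c
#include <stdio.h>
#include <string.h>

// Returns 1 if /proc/cpuinfo advertises the NNPA facility, 0 otherwise.
static int host_has_nnpa(void) {
    FILE *f = fopen("/proc/cpuinfo", "r");
    if (!f) {
        return 0;
    }
    char line[1024];
    int found = 0;
    while (fgets(line, sizeof line, f)) {
        // s390x kernels expose a "features" line listing facility names.
        if (strncmp(line, "features", 8) == 0 && strstr(line, "nnpa") != NULL) {
            found = 1;
            break;
        }
    }
    fclose(f);
    return found;
}

int main(void) {
    printf("NNPA facility: %s\n", host_has_nnpa() ? "available" : "not available");
    return 0;
}
```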

### 3. zDNN Accelerator (WIP)

@@ -230,10 +230,6 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
CXXFLAGS="-include cstdint" pip3 install -r requirements.txt
```

5. `-DGGML_NNPA=ON` generates gibberish output

Answer: We are aware of this as detailed in [this issue](https://github.com/ggml-org/llama.cpp/issues/14877). Please either try reducing the number of threads, or disable the compile option using `-DGGML_NNPA=OFF`.

## Getting Help on IBM Z & LinuxONE

1. **Bugs, Feature Requests**
@@ -292,4 +288,4 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
- 🚫 - acceleration unavailable, will still run using scalar implementation
- ❓ - acceleration unknown, please contribute if you can test it yourself

Last Updated by **Aaron Teo ([email protected])** on Aug 22, 2025.
Last Updated by **Aaron Teo ([email protected])** on Sep 2, 2025.
2 changes: 1 addition & 1 deletion ggml/CMakeLists.txt
@@ -132,7 +132,7 @@ option(GGML_RVV "ggml: enable rvv" ON)
option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
option(GGML_VXE "ggml: enable vxe" ON)
option(GGML_NNPA "ggml: enable nnpa" OFF) # temp disabled by default, see: https://github.com/ggml-org/llama.cpp/issues/14877
option(GGML_NNPA "ggml: enable nnpa" ON)

option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
2 changes: 2 additions & 0 deletions ggml/include/ggml-cpu.h
@@ -105,6 +105,8 @@ extern "C" {
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);

GGML_BACKEND_API int ggml_cpu_support_fattn (void); // whether Flash Attention is supported

// Internal types and functions exposed for tests and benchmarks

typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
15 changes: 14 additions & 1 deletion ggml/src/ggml-cpu/ggml-cpu.c
@@ -3219,7 +3219,10 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
float32x4_t v_zero = vec_splats(0.0f);
uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0);
uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
y[i + 0] = vec_extract(v_y, 0);
y[i + 1] = vec_extract(v_y, 1);
y[i + 2] = vec_extract(v_y, 2);
y[i + 3] = vec_extract(v_y, 3);
}
#endif
for (; i < n; ++i) {
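A plausible reading of the storage change above (inferred from the diff, not stated in the PR): `vec_xst` stores the full 16-byte vector, i.e. eight halfwords, while each iteration of this loop produces only four valid fp16 values, so the vector store could write past `y[i + 3]` on the final iteration. Extracting the four lanes individually writes exactly the four intended elements.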
@@ -3521,6 +3524,16 @@ int ggml_cpu_has_sme(void) {
#endif
}

int ggml_cpu_support_fattn(void) {
#if defined(GGML_NNPA) || defined(__NNPA__)
// disable Flash Attention when using NNPA
// see: https://github.com/ggml-org/llama.cpp/issues/15721
return 0;
#else
return 1;
#endif
}
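Note that `ggml_cpu_support_fattn()` as written reports a property of the build, not of the host: any CPU backend compiled with `GGML_NNPA` (or with a compiler that defines `__NNPA__`) returns 0, even if the binary later runs on a machine without NNPA hardware.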
Review comment (Member): This is not going to work. Either the problem with NNPA should be fixed, or NNPA support removed.


void ggml_cpu_init(void) {
// needed to initialize ggml_time
{