2 changes: 1 addition & 1 deletion common/common.h
@@ -312,7 +312,7 @@ struct common_params {
enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention
enum llama_flash_attn_type flash_attn_type = ggml_cpu_support_fattn() ? LLAMA_FLASH_ATTN_TYPE_AUTO : LLAMA_FLASH_ATTN_TYPE_DISABLED; // whether to use Flash Attention

struct common_params_sampling sampling;
struct common_params_speculative speculative;
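The net effect of this one-line change: the flash-attention default is no longer unconditionally `AUTO` but is gated on whether the CPU backend can use it. A self-contained sketch of the selection logic, using a stand-in enum and probe rather than the real `llama.h`/`ggml-cpu.h` definitions (values here are placeholders):

```c
#include <stdio.h>

// Stand-in enum; the real llama_flash_attn_type lives in llama.h and its
// values may differ.
enum llama_flash_attn_type {
    LLAMA_FLASH_ATTN_TYPE_DISABLED,
    LLAMA_FLASH_ATTN_TYPE_AUTO,
};

// Stand-in for the probe this PR adds to ggml-cpu.h: 0 when the CPU
// backend was compiled with NNPA (where flash attention currently
// misbehaves), 1 otherwise.
static int ggml_cpu_support_fattn(void) {
#if defined(GGML_NNPA) || defined(__NNPA__)
    return 0;
#else
    return 1;
#endif
}

int main(void) {
    // Mirrors the new initializer in common_params.
    enum llama_flash_attn_type flash_attn_type =
        ggml_cpu_support_fattn() ? LLAMA_FLASH_ATTN_TYPE_AUTO
                                 : LLAMA_FLASH_ATTN_TYPE_DISABLED;
    printf("default flash_attn_type = %d\n", (int) flash_attn_type);
    return 0;
}
```

On a build without NNPA this selects the `AUTO` value; on an NNPA build it selects `DISABLED`, which is exactly the behavior change the header diff encodes.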
12 changes: 4 additions & 8 deletions docs/build-s390x.md
@@ -42,14 +42,14 @@ cmake --build build --config Release -j $(nproc)
cmake --build build --config Release -j $(nproc)
```

- By default, NNPA is disabled by default. To enable it:
- By default, NNPA is enabled when available. To disable it (not recommended):

```bash
cmake -S . -B build \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_BLAS=ON \
-DGGML_BLAS_VENDOR=OpenBLAS \
-DGGML_NNPA=ON
-DGGML_NNPA=OFF

cmake --build build --config Release -j $(nproc)
```
@@ -166,7 +166,7 @@ Only available in IBM z15/LinuxONE 3 or later system with the `-DGGML_VXE=ON` (t

### 2. NNPA Vector Intrinsics Acceleration

Only available in IBM z16/LinuxONE 4 or later system with the `-DGGML_NNPA=ON` (turned off by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs can still run but will use a scalar implementation.
Only available in IBM z16/LinuxONE 4 or later system with the `-DGGML_NNPA=ON` (turned on when available) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs can still run but will use a scalar implementation.
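Since the flag is now on by default, it can help to confirm that the host actually exposes NNPA before relying on it. A minimal, hedged sketch that scans `/proc/cpuinfo`, assuming a recent s390x kernel that lists `nnpa` in its `features` line (adjust if your kernel reports facilities differently):

```c
#include <stdio.h>
#include <string.h>

// Returns 1 if /proc/cpuinfo advertises the NNPA facility, 0 otherwise.
static int host_has_nnpa(void) {
    FILE *f = fopen("/proc/cpuinfo", "r");
    if (!f) {
        return 0;
    }
    char line[1024];
    int found = 0;
    while (fgets(line, sizeof line, f)) {
        // s390x kernels expose a "features" line listing facility names.
        if (strncmp(line, "features", 8) == 0 && strstr(line, "nnpa") != NULL) {
            found = 1;
            break;
        }
    }
    fclose(f);
    return found;
}

int main(void) {
    printf("NNPA facility: %s\n", host_has_nnpa() ? "available" : "not available");
    return 0;
}
```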

### 3. zDNN Accelerator (WIP)

@@ -230,10 +230,6 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
CXXFLAGS="-include cstdint" pip3 install -r requirements.txt
```

5. `-DGGML_NNPA=ON` generates gibberish output

Answer: We are aware of this as detailed in [this issue](https://github.com/ggml-org/llama.cpp/issues/14877). Please either try reducing the number of threads, or disable the compile option using `-DGGML_NNPA=OFF`.

## Getting Help on IBM Z & LinuxONE

1. **Bugs, Feature Requests**
@@ -292,4 +288,4 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
- 🚫 - acceleration unavailable, will still run using scalar implementation
- ❓ - acceleration unknown, please contribute if you can test it yourself

Last Updated by **Aaron Teo ([email protected])** on Aug 22, 2025.
Last Updated by **Aaron Teo ([email protected])** on Sep 2, 2025.
2 changes: 1 addition & 1 deletion ggml/CMakeLists.txt
@@ -132,7 +132,7 @@ option(GGML_RVV "ggml: enable rvv" ON)
option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
option(GGML_VXE "ggml: enable vxe" ON)
option(GGML_NNPA "ggml: enable nnpa" OFF) # temp disabled by default, see: https://github.com/ggml-org/llama.cpp/issues/14877
option(GGML_NNPA "ggml: enable nnpa" ON)

option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
2 changes: 2 additions & 0 deletions ggml/include/ggml-cpu.h
@@ -105,6 +105,8 @@ extern "C" {
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);

GGML_BACKEND_API int ggml_cpu_support_fattn (void); // whether Flash Attention is supported

// Internal types and functions exposed for tests and benchmarks

typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
15 changes: 14 additions & 1 deletion ggml/src/ggml-cpu/ggml-cpu.c
@@ -3219,7 +3219,10 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
float32x4_t v_zero = vec_splats(0.0f);
uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0);
uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
y[i + 0] = vec_extract(v_y, 0);
y[i + 1] = vec_extract(v_y, 1);
y[i + 2] = vec_extract(v_y, 2);
y[i + 3] = vec_extract(v_y, 3);
}
#endif
for (; i < n; ++i) {
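A plausible reading of the storage change above (inferred from the diff, not stated in the PR): `vec_xst` stores the full 16-byte vector, i.e. eight halfwords, while each iteration of this loop produces only four valid fp16 values, so the vector store could write past `y[i + 3]` on the final iteration. Extracting the four lanes individually writes exactly the four intended elements.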
@@ -3521,6 +3524,16 @@ int ggml_cpu_has_sme(void) {
#endif
}

int ggml_cpu_support_fattn(void) {
#if defined(GGML_NNPA) || defined(__NNPA__)
// disable Flash Attention when using NNPA
// see: https://github.com/ggml-org/llama.cpp/issues/15721
return 0;
#else
return 1;
#endif
}
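Note that `ggml_cpu_support_fattn()` as written reports a property of the build, not of the host: any CPU backend compiled with `GGML_NNPA` (or with a compiler that defines `__NNPA__`) returns 0, even if the binary later runs on a machine without NNPA hardware.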
Review comment (Member): This is not going to work. Either the problem with NNPA should be fixed, or NNPA support removed.


void ggml_cpu_init(void) {
// needed to initialize ggml_time
{