
Commit 0ac20e3

Merge branch 'upstream' into concedo_experimental

# Conflicts:
#   docs/backend/SYCL.md
#   docs/build.md
#   ggml/CMakeLists.txt
#   ggml/src/ggml-cpu/CMakeLists.txt
#   ggml/src/ggml-cpu/amx/mmq.cpp
#   ggml/src/ggml-cpu/ggml-cpu.c
#   ggml/src/ggml-opencl/ggml-opencl.cpp
#   ggml/src/ggml-sycl/common.hpp
#   ggml/src/ggml-sycl/ggml-sycl.cpp
#   ggml/src/ggml-sycl/sycl_hw.cpp
#   ggml/src/ggml-sycl/sycl_hw.hpp
#   ggml/src/ggml-vulkan/CMakeLists.txt
#   tests/test-backend-ops.cpp

2 parents 39b0699 + 8846aac · commit 0ac20e3

Some content is hidden: large commits have part of their diff collapsed by default, so only a subset of the changed files is shown below.

45 files changed, +2177 −898 lines

convert_hf_to_gguf.py

Lines changed: 118 additions & 6 deletions
````diff
@@ -310,6 +310,8 @@ def prepare_tensors(self):
                 gguf.MODEL_TENSOR.POSNET_NORM2,
                 gguf.MODEL_TENSOR.V_ENC_EMBD_POS,
                 gguf.MODEL_TENSOR.A_ENC_EMBD_POS,
+                gguf.MODEL_TENSOR.ALTUP_CORRECT_COEF,
+                gguf.MODEL_TENSOR.ALTUP_PREDICT_COEF,
             )
         )
         or not new_name.endswith(".weight")
````
````diff
@@ -320,7 +322,11 @@ def prepare_tensors(self):
             self.match_model_tensor_name(new_name, key, bid)
             for key in (
                 gguf.MODEL_TENSOR.TOKEN_EMBD,
+                gguf.MODEL_TENSOR.PER_LAYER_TOKEN_EMBD,
                 gguf.MODEL_TENSOR.OUTPUT,
+                gguf.MODEL_TENSOR.ALTUP_ROUTER,
+                gguf.MODEL_TENSOR.LAUREL_L,
+                gguf.MODEL_TENSOR.LAUREL_R,
             )
         ):
             if self.ftype in (
````
````diff
@@ -921,13 +927,16 @@ def _create_vocab_sentencepiece(self):
         tokenizer = SentencePieceProcessor()
         tokenizer.LoadFromFile(str(tokenizer_path))

-        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+        vocab_size = self.find_hparam([
+            "vocab_size_per_layer_input", # gemma3n
+            "vocab_size",
+        ], optional=True) or tokenizer.vocab_size()

         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
         toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size

-        for token_id in range(tokenizer.vocab_size()):
+        for token_id in range(vocab_size):
             piece = tokenizer.IdToPiece(token_id)
             text = piece.encode("utf-8")
             score = tokenizer.GetScore(token_id)
@@ -942,6 +951,10 @@ def _create_vocab_sentencepiece(self):
             elif tokenizer.IsByte(token_id):
                 toktype = SentencePieceTokenTypes.BYTE

+            if token_id >= vocab_size:
+                logger.warning(f'ignore tokens from {token_id}: id is out of range, max={vocab_size - 1}')
+                break
+
             tokens[token_id] = text
             scores[token_id] = score
             toktypes[token_id] = toktype
````
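For readers skimming the diff, here is a small self-contained sketch (not part of the commit) of the fallback order the new `vocab_size` lookup follows; `resolve_vocab_size` and the sample sizes are hypothetical stand-ins for `find_hparam([...], optional=True) or tokenizer.vocab_size()`:

```python
# Hypothetical sketch of the fallback order used above; not code from the commit.
def resolve_vocab_size(hparams: dict, tokenizer_vocab_size: int) -> int:
    # mirrors find_hparam(["vocab_size_per_layer_input", "vocab_size"], optional=True)
    for key in ("vocab_size_per_layer_input", "vocab_size"):
        if key in hparams and hparams[key]:
            return hparams[key]
    return tokenizer_vocab_size  # fall back to the tokenizer's own size

# gemma3n-style config: the declared per-layer-input vocab wins; any ids the
# tokenizer cannot supply stay as the pre-filled "[PAD{i}]" placeholders.
print(resolve_vocab_size({"vocab_size_per_layer_input": 262144}, 262000))  # 262144
# configs without either key keep the old behaviour
print(resolve_vocab_size({}, 32000))  # 32000
```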
````diff
@@ -4217,6 +4230,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 @ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration")
 class Gemma3Model(TextModel):
     model_arch = gguf.MODEL_ARCH.GEMMA3
+    norm_shift = 1.0 # Gemma3RMSNorm adds 1.0 to the norm value

     def set_vocab(self):
         self._set_vocab_sentencepiece()
````
````diff
@@ -4238,9 +4252,8 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_value_length(hparams.get("head_dim", 256))
         self.gguf_writer.add_file_type(self.ftype)
         self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) # for global layers
-        # both attn_logit_softcapping and final_logit_softcapping are removed in Gemma3
+        # attn_logit_softcapping is removed in Gemma3
         assert hparams.get("attn_logit_softcapping") is None
-        assert hparams.get("final_logit_softcapping") is None
         self.gguf_writer.add_sliding_window(hparams["sliding_window"])
         self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4))
         if hparams.get("rope_scaling") is not None:
````
````diff
@@ -4252,7 +4265,7 @@ def set_gguf_parameters(self):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid # unused

-        if name.startswith("language_model."):
+        if "language_model." in name:
             name = name.replace("language_model.", "")

         elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
````
````diff
@@ -4267,8 +4280,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter

         # ref code in Gemma3RMSNorm
         # output = output * (1.0 + self.weight.float())
+        # note: this is not the case on gemma3n
         if name.endswith("norm.weight"):
-            data_torch = data_torch + 1
+            data_torch = data_torch + self.norm_shift

         return [(self.map_tensor_name(name), data_torch)]
````
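A quick illustration of what the new `norm_shift` class attribute changes at conversion time; this is a standalone sketch, not code from the commit:

```python
# Sketch only: Gemma3 stores RMSNorm weights as w but applies (1 + w) at runtime,
# so the converter bakes in +1.0; Gemma3n already stores the final value (shift 0.0).
import torch

def convert_norm_weight(weight: torch.Tensor, norm_shift: float) -> torch.Tensor:
    # mirrors `data_torch = data_torch + self.norm_shift`
    return weight + norm_shift

w = torch.zeros(3)
print(convert_norm_weight(w, 1.0))  # Gemma3Model:  tensor([1., 1., 1.])
print(convert_norm_weight(w, 0.0))  # Gemma3NModel: tensor([0., 0., 0.])
```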

````diff
@@ -4325,6 +4339,104 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [] # skip other tensors


+@ModelBase.register("Gemma3nForConditionalGeneration")
+class Gemma3NModel(Gemma3Model):
+    model_arch = gguf.MODEL_ARCH.GEMMA3N
+    norm_shift = 0.0 # same value with Gemma3p5RMSNorm scale_shift on python code
+
+    _altup_proj: list[Tensor] = []
+    _altup_unembd: list[Tensor] = []
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams["altup_num_inputs"] == 4, "Current conversion only supports 4 altup inputs"
+        self._altup_proj = [
+            torch.Tensor(), # to be replaced
+            torch.Tensor(), # to be replaced
+            torch.Tensor(), # to be replaced
+        ]
+        self._altup_unembd = [
+            torch.Tensor(), # to be replaced
+            torch.Tensor(), # to be replaced
+            torch.Tensor(), # to be replaced
+        ]
+
+    def set_vocab(self):
+        with open(self.dir_model / "chat_template.jinja") as f:
+            # quick hack to make sure chat template is added
+            self.gguf_writer.add_chat_template(f.read())
+        super().set_vocab()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_altup_active_idx(self.hparams["altup_active_idx"])
+        self.gguf_writer.add_altup_num_inputs(self.hparams["altup_num_inputs"])
+        self.gguf_writer.add_embedding_length_per_layer_input(self.hparams["hidden_size_per_layer_input"])
+        self.gguf_writer.add_shared_kv_layers(self.hparams["num_kv_shared_layers"])
+
+        activation_sparsity_scale = []
+        for s in self.hparams["activation_sparsity_pattern"]:
+            normal_dist = torch.distributions.normal.Normal(0, 1)
+            std_multiplier = normal_dist.icdf(torch.tensor(s, dtype=torch.float32))
+            activation_sparsity_scale.append(std_multiplier.item())
+        self.gguf_writer.add_activation_sparsity_scale(activation_sparsity_scale)
+
+        sliding_window_pattern = []
+        for t in self.hparams["layer_types"]:
+            sliding_window_pattern.append(t == "sliding_attention")
+        self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
+
+    def _stack_matrices(self, matrices: list[Tensor]) -> Tensor | None:
+        has_all = all(m.numel() > 0 for m in matrices)
+        if not has_all:
+            return None
+        else:
+            return torch.stack(matrices, dim=0)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.endswith("_scale"):
+            name = name + ".weight"
+
+        # TODO: implement self.prediction_coefs.weight.clamp_(...)
+
+        if "language_model." not in name:
+            return [] # skip non-language model tensors
+
+        if "altup_unembed_projections" in name:
+            data_torch = data_torch.to(device="cpu")
+            if ".0." in name:
+                self._altup_unembd[0] = data_torch
+            elif ".1." in name:
+                self._altup_unembd[1] = data_torch
+            elif ".2." in name:
+                self._altup_unembd[2] = data_torch
+            else:
+                raise ValueError(f"Unknown name: {name}")
+            out = self._stack_matrices(self._altup_unembd)
+            if out is not None:
+                return [(self.map_tensor_name("model.altup_unembed_projections.weight"), out)]
+            else:
+                return []
+
+        if "altup_projections" in name:
+            data_torch = data_torch.to(device="cpu")
+            if ".0." in name:
+                self._altup_proj[0] = data_torch
+            elif ".1." in name:
+                self._altup_proj[1] = data_torch
+            elif ".2." in name:
+                self._altup_proj[2] = data_torch
+            else:
+                raise ValueError(f"Unknown name: {name}")
+            out = self._stack_matrices(self._altup_proj)
+            if out is not None:
+                return [(self.map_tensor_name("model.altup_projections.weight"), out)]
+            else:
+                return []
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("Starcoder2ForCausalLM")
 class StarCoder2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.STARCODER2
````
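The buffering in `modify_tensors` is easier to see in isolation. Below is a simplified sketch of the same stack-when-complete pattern used for `altup_projections.{0,1,2}`; the `offer` helper and the 8x8 matrices are made up for illustration and are not part of the commit:

```python
# Sketch of the stack-when-complete pattern; not code from the commit.
import torch

slots = [torch.Tensor(), torch.Tensor(), torch.Tensor()]  # empty placeholders

def offer(idx: int, t: torch.Tensor) -> torch.Tensor | None:
    """Store one projection matrix; emit the stacked tensor once all have arrived."""
    slots[idx] = t
    if all(s.numel() > 0 for s in slots):  # same check as _stack_matrices
        return torch.stack(slots, dim=0)   # shape (3, *t.shape)
    return None

print(offer(0, torch.randn(8, 8)))          # None: still waiting for two slots
print(offer(2, torch.randn(8, 8)))          # None: still waiting for one slot
print(offer(1, torch.randn(8, 8)).shape)    # torch.Size([3, 8, 8])
```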

docs/build-s390x.md

Lines changed: 102 additions & 13 deletions
````diff
@@ -16,7 +16,7 @@ cd llama.cpp

 ## CPU Build with BLAS

-Building llama.cpp with BLAS support is highly recommended as it has shown to provide performance improvements.
+Building llama.cpp with BLAS support is highly recommended as it has shown to provide performance improvements. Make sure to have OpenBLAS installed in your environment.

 ```bash
 cmake -S . -B build \
````
````diff
@@ -28,8 +28,9 @@ cmake --build build --config Release -j $(nproc)
 ```

 **Notes**:
-- For faster repeated compilation, install [ccache](https://ccache.dev/)
-- By default, VXE/VXE2 is enabled. To disable it (not recommended):
+
+- For faster repeated compilation, install [ccache](https://ccache.dev/)
+- By default, VXE/VXE2 is enabled. To disable it (not recommended):

 ```bash
 cmake -S . -B build \
````
````diff
@@ -41,18 +42,29 @@ cmake --build build --config Release -j $(nproc)
 cmake --build build --config Release -j $(nproc)
 ```

-- For debug builds:
+- By default, NNPA is enabled when available. To disable it (not recommended):
+
+  ```bash
+  cmake -S . -B build \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DGGML_BLAS=ON \
+    -DGGML_BLAS_VENDOR=OpenBLAS \
+    -DGGML_NNPA=OFF
+
+  cmake --build build --config Release -j $(nproc)
+  ```
+
+- For debug builds:

   ```bash
   cmake -S . -B build \
     -DCMAKE_BUILD_TYPE=Debug \
     -DGGML_BLAS=ON \
     -DGGML_BLAS_VENDOR=OpenBLAS
-
   cmake --build build --config Debug -j $(nproc)
   ```

-- For static builds, add `-DBUILD_SHARED_LIBS=OFF`:
+- For static builds, add `-DBUILD_SHARED_LIBS=OFF`:

   ```bash
   cmake -S . -B build \
````
7082

7183
1. **Use pre-converted models verified for use on IBM Z & LinuxONE (easiest)**
7284

73-
You can find popular models pre-converted and verified at [s390x Ready Models](hf.co/collections/taronaeo/s390x-ready-models-672765393af438d0ccb72a08).
85+
![File Type - gguf](https://img.shields.io/badge/File_Type-gguf-fff)
7486

75-
These models and their respective tokenizers are verified to run correctly on IBM Z & LinuxONE.
87+
You can find popular models pre-converted and verified at [s390x Ready Models](https://huggingface.co/collections/taronaeo/s390x-ready-models-672765393af438d0ccb72a08).
88+
89+
These models have already been converted from `safetensors` to `GGUF Big-Endian` and their respective tokenizers verified to run correctly on IBM z15 and later system.
7690

7791
2. **Convert safetensors model to GGUF Big-Endian directly (recommended)**
7892

93+
![File Type - safetensors](https://img.shields.io/badge/File_Type-safetensors-da1e28)
94+
95+
The model you are trying to convert must be in `safetensors` file format (for example [IBM Granite 3.3 2B](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct)). Make sure you have downloaded the model repository for this case.
96+
7997
```bash
8098
python3 convert_hf_to_gguf.py \
8199
--outfile model-name-be.f16.gguf \
````diff
@@ -96,32 +114,42 @@ All models need to be converted to Big-Endian. You can achieve this in three cas

 3. **Convert existing GGUF Little-Endian model to Big-Endian**

+   ![File Type - gguf](https://img.shields.io/badge/File_Type-gguf-fff)
+
+   The model you are trying to convert must be in `gguf` file format (for example [IBM Granite 3.3 2B](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct-GGUF)). Make sure you have downloaded the model file for this case.
+
   ```bash
   python3 gguf-py/gguf/scripts/gguf_convert_endian.py model-name.f16.gguf BIG
   ```

   For example,
+
   ```bash
   python3 gguf-py/gguf/scripts/gguf_convert_endian.py granite-3.3-2b-instruct-le.f16.gguf BIG
   mv granite-3.3-2b-instruct-le.f16.gguf granite-3.3-2b-instruct-be.f16.gguf
   ```

   **Notes:**
+
   - The GGUF endian conversion script may not support all data types at the moment and may fail for some models/quantizations. When that happens, please try manually converting the safetensors model to GGUF Big-Endian via Step 2.

 ## IBM Accelerators

 ### 1. SIMD Acceleration

-Only available in IBM z15 or later system with the `-DGGML_VXE=ON` (turned on by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z14 or EC13. In such systems, the APIs can still run but will use a scalar implementation.
+Only available in IBM z15 or later system with the `-DGGML_VXE=ON` (turned on by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z14/arch12. In such systems, the APIs can still run but will use a scalar implementation.

-### 2. zDNN Accelerator
+### 2. NNPA Vector Intrinsics Acceleration

-*Only available in IBM z16 or later system. No direction at the moment.*
+Only available in IBM z16 or later system with the `-DGGML_NNPA=ON` (turned on when available) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs can still run but will use a scalar implementation.

-### 3. Spyre Accelerator
+### 3. zDNN Accelerator

-*No direction at the moment.*
+_Only available in IBM z16 or later system. No direction at the moment._
+
+### 4. Spyre Accelerator
+
+_No direction at the moment._

 ## Performance Tuning
````

````diff
@@ -145,6 +173,22 @@ It is strongly recommended to disable SMT via the kernel boot parameters as it n

 IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongly recommended to use BLAS.

+## Frequently Asked Questions (FAQ)
+
+1. I'm getting the following error message while trying to load a model: `gguf_init_from_file_impl: failed to load model: this GGUF file version 50331648 is extremely large, is there a mismatch between the host and model endianness?`
+
+   Answer: Please ensure that the model you have downloaded/converted is GGUFv3 Big-Endian. These models are usually denoted with the `-be` suffix, i.e., `granite-3.3-2b-instruct-be.F16.gguf`.
+
+   You may refer to the [Getting GGUF Models](#getting-gguf-models) section to manually convert a `safetensors` model to `GGUF` Big Endian.
+
+2. I'm getting extremely poor performance when running inference on a model
+
+   Answer: Please refer to the [Appendix B: SIMD Support Matrix](#appendix-b-simd-support-matrix) to check if your model quantization is supported by SIMD acceleration.
+
+3. I'm building on IBM z17 and getting the following error messages: `invalid switch -march=z17`
+
+   Answer: Please ensure that your GCC compiler is of minimum GCC 15.1.0 version, and have `binutils` updated to the latest version. If this does not fix the problem, kindly open an issue.
+
 ## Getting Help on IBM Z & LinuxONE

 1. **Bugs, Feature Requests**
````
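The odd number in FAQ 1 is the GGUF version field read with the wrong byte order: version 3 stored as a 32-bit integer in the file's endianness reads back as 0x03000000 = 50331648 on the opposite endianness. A small sketch (not part of the docs change) to check the arithmetic:

```python
# Why an endianness mismatch reports "version 50331648": 3 byte-swapped is 0x03000000.
import struct

version_le = struct.pack("<I", 3)           # b'\x03\x00\x00\x00' as written little-endian
print(struct.unpack(">I", version_le)[0])   # 50331648 == 0x03000000
```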
````diff
@@ -155,3 +199,48 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl

    Please reach out directly to [[email protected]](mailto:[email protected]).

+## Appendix A: Hardware Support Matrix
+
+|         | Support | Minimum Compiler Version |
+| ------- | ------- | ------------------------ |
+| IBM z15 | ✅ |  |
+| IBM z16 | ✅ |  |
+| IBM z17 | ✅ | GCC 15.1.0 |
+
+- ✅ - supported and verified to run as intended
+- 🚫 - unsupported, we are unlikely able to provide support
+
+## Appendix B: SIMD Support Matrix
+
+|            | VX/VXE/VXE2 | NNPA | zDNN | Spyre |
+| ---------- | ----------- | ---- | ---- | ----- |
+| FP32       | ✅ | ✅ | ❓ | ❓ |
+| FP16       | ✅ | ✅ | ❓ | ❓ |
+| BF16       | 🚫 | 🚫 | ❓ | ❓ |
+| Q4_0       | ✅ | ✅ | ❓ | ❓ |
+| Q4_1       | ✅ | ✅ | ❓ | ❓ |
+| Q5_0       | 🚫 | 🚫 | ❓ | ❓ |
+| Q5_1       | 🚫 | 🚫 | ❓ | ❓ |
+| Q8_0       | ✅ | ✅ | ❓ | ❓ |
+| Q2_K       | 🚫 | 🚫 | ❓ | ❓ |
+| Q3_K       | ✅ | ✅ | ❓ | ❓ |
+| Q4_K       | ✅ | ✅ | ❓ | ❓ |
+| Q5_K       | ✅ | ✅ | ❓ | ❓ |
+| Q6_K       | ✅ | ✅ | ❓ | ❓ |
+| TQ1_0      | 🚫 | 🚫 | ❓ | ❓ |
+| TQ2_0      | 🚫 | 🚫 | ❓ | ❓ |
+| IQ2_XXS    | 🚫 | 🚫 | ❓ | ❓ |
+| IQ2_XS     | 🚫 | 🚫 | ❓ | ❓ |
+| IQ2_S      | 🚫 | 🚫 | ❓ | ❓ |
+| IQ3_XXS    | 🚫 | 🚫 | ❓ | ❓ |
+| IQ3_S      | 🚫 | 🚫 | ❓ | ❓ |
+| IQ1_S      | 🚫 | 🚫 | ❓ | ❓ |
+| IQ1_M      | 🚫 | 🚫 | ❓ | ❓ |
+| IQ4_NL     | ✅ | ✅ | ❓ | ❓ |
+| IQ4_XS     | ✅ | ✅ | ❓ | ❓ |
+| FP32->FP16 | 🚫 | ✅ | ❓ | ❓ |
+| FP16->FP32 | 🚫 | ✅ | ❓ | ❓ |
+
+- ✅ - acceleration available
+- 🚫 - acceleration unavailable, will still run using scalar implementation
+- ❓ - acceleration unknown, please contribute if you can test it yourself
````

ggml/include/ggml-cpu.h

Lines changed: 1 addition & 0 deletions
````diff
@@ -101,6 +101,7 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
     GGML_BACKEND_API int ggml_cpu_has_vsx (void);
     GGML_BACKEND_API int ggml_cpu_has_vxe (void);
+    GGML_BACKEND_API int ggml_cpu_has_nnpa (void);
     GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
     GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
````