Commit 9bb0ea2

Merge branch 'nb_5710' into crokeso

2 parents bcea211 + 381174b, commit 9bb0ea2

32 files changed: +3660 −497 lines

common/arg.cpp

Lines changed: 26 additions & 0 deletions

```diff
@@ -3213,6 +3213,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.speculative.model.path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+    add_opt(common_arg(
+        {"-ctkd", "--cache-type-k-draft"}, "TYPE",
+        string_format(
+            "KV cache data type for K for the draft model\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.speculative.cache_type_k)
+        ),
+        [](common_params & params, const std::string & value) {
+            params.speculative.cache_type_k = kv_cache_type_from_str(value);
+        }
+    ).set_env("LLAMA_ARG_CACHE_TYPE_K_DRAFT"));
+    add_opt(common_arg(
+        {"-ctvd", "--cache-type-v-draft"}, "TYPE",
+        string_format(
+            "KV cache data type for V for the draft model\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.speculative.cache_type_v)
+        ),
+        [](common_params & params, const std::string & value) {
+            params.speculative.cache_type_v = kv_cache_type_from_str(value);
+        }
+    ).set_env("LLAMA_ARG_CACHE_TYPE_V_DRAFT"));
 
     add_opt(common_arg(
         {"-mv", "--model-vocoder"}, "FNAME",
```

common/common.cpp

Lines changed: 6 additions & 0 deletions

```diff
@@ -714,11 +714,17 @@ bool fs_validate_filename(const std::string & filename) {
     // disable C++17 deprecation warning for std::codecvt_utf8
 #  pragma clang diagnostic push
 #  pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#elif defined(__GNUC__)
+#  pragma GCC diagnostic push
+#  pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 #endif
+
     std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
 
 #if defined(__clang__)
 #  pragma clang diagnostic pop
+#elif defined(__GNUC__)
+#  pragma GCC diagnostic pop
 #endif
 
     filename_utf32 = converter.from_bytes(filename);
```

common/common.h

Lines changed: 3 additions & 0 deletions

```diff
@@ -195,6 +195,9 @@ struct common_params_speculative {
     float p_split = 0.1f;  // speculative decoding split probability
     float p_min   = 0.75f; // minimum speculative decoding probability (greedy)
 
+    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
+    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
+
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
```

convert_hf_to_gguf.py

File mode changed: 100755 → 100644
Lines changed: 11 additions & 9 deletions

```diff
@@ -6718,8 +6718,8 @@ def parse_args() -> argparse.Namespace:
         help="model is executed on big endian machine",
     )
     parser.add_argument(
-        "model", type=Path,
-        help="directory containing model file",
+        "model", type=str,
+        help="directory containing model file or huggingface repository ID (if --remote)",
         nargs="?",
     )
     parser.add_argument(
@@ -6826,18 +6826,20 @@ def main() -> None:
     else:
         logging.basicConfig(level=logging.INFO)
 
-    dir_model = args.model
-
     if args.remote:
+        hf_repo_id = args.model
         from huggingface_hub import snapshot_download
         local_dir = snapshot_download(
-            repo_id=str(dir_model),
+            repo_id=hf_repo_id,
             allow_patterns=["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"])
         dir_model = Path(local_dir)
         logger.info(f"Downloaded config and tokenizer to {local_dir}")
+    else:
+        hf_repo_id = None
+        dir_model = Path(args.model)
 
     if not dir_model.is_dir():
-        logger.error(f'Error: {args.model} is not a directory')
+        logger.error(f'Error: {dir_model} is not a directory')
         sys.exit(1)
 
     ftype_map: dict[str, gguf.LlamaFileType] = {
@@ -6910,9 +6912,9 @@ def main() -> None:
 
     if args.outfile is not None:
         fname_out = args.outfile
-    elif args.remote:
+    elif hf_repo_id:
         # if remote, use the model ID as the output file name
-        fname_out = Path("./" + str(args.model).replace("/", "-") + "-{ftype}.gguf")
+        fname_out = Path("./" + hf_repo_id.replace("/", "-") + "-{ftype}.gguf")
     else:
         fname_out = dir_model
 
@@ -6942,7 +6944,7 @@ def main() -> None:
         split_max_tensors=args.split_max_tensors,
         split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
         small_first_shard=args.no_tensor_first_split,
-        remote_hf_model_id=str(args.model) if args.remote else None,
+        remote_hf_model_id=hf_repo_id,
         thread_count=args.threads)
 
     if args.vocab_only:
```
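
As a usage sketch of the reworked `--remote` path (the repository ID below is illustrative):

```bash
# With --remote, the positional 'model' argument is interpreted as a
# Hugging Face repository ID instead of a local directory; only config,
# tokenizer, and text files are downloaded (see allow_patterns above).
python3 convert_hf_to_gguf.py --remote ibm-granite/granite-3.3-2b-instruct
# The default output name is derived from the repo ID:
#   ./ibm-granite-granite-3.3-2b-instruct-{ftype}.gguf
```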

docs/build-s390x.md (new file)

Lines changed: 157 additions & 0 deletions

> [!IMPORTANT]
> This build documentation is specific only to IBM Z & LinuxONE mainframes (s390x). You can find the build documentation for other architectures in [build.md](build.md).

# Build llama.cpp locally (for s390x)

The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](../include/llama.h).

The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server.

**To get the code:**

```bash
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
```

## CPU Build with BLAS

Building llama.cpp with BLAS support is highly recommended, as it has been shown to provide performance improvements.

```bash
cmake -S . -B build \
    -DCMAKE_BUILD_TYPE=Release \
    -DGGML_BLAS=ON \
    -DGGML_BLAS_VENDOR=OpenBLAS

cmake --build build --config Release -j $(nproc)
```

**Notes**:

- For faster repeated compilation, install [ccache](https://ccache.dev/)
- By default, VXE/VXE2 is enabled. To disable it (not recommended):

  ```bash
  cmake -S . -B build \
      -DCMAKE_BUILD_TYPE=Release \
      -DGGML_BLAS=ON \
      -DGGML_BLAS_VENDOR=OpenBLAS \
      -DGGML_VXE=OFF

  cmake --build build --config Release -j $(nproc)
  ```

- For debug builds:

  ```bash
  cmake -S . -B build \
      -DCMAKE_BUILD_TYPE=Debug \
      -DGGML_BLAS=ON \
      -DGGML_BLAS_VENDOR=OpenBLAS

  cmake --build build --config Debug -j $(nproc)
  ```

- For static builds, add `-DBUILD_SHARED_LIBS=OFF`:

  ```bash
  cmake -S . -B build \
      -DCMAKE_BUILD_TYPE=Release \
      -DGGML_BLAS=ON \
      -DGGML_BLAS_VENDOR=OpenBLAS \
      -DBUILD_SHARED_LIBS=OFF

  cmake --build build --config Release -j $(nproc)
  ```

## Getting GGUF Models

All models need to be converted to Big-Endian. You can achieve this in one of three ways:

1. **Use pre-converted models verified for use on IBM Z & LinuxONE (easiest)**

   You can find popular models pre-converted and verified at [s390x Ready Models](hf.co/collections/taronaeo/s390x-ready-models-672765393af438d0ccb72a08).

   These models and their respective tokenizers are verified to run correctly on IBM Z & LinuxONE.

2. **Convert a safetensors model to GGUF Big-Endian directly (recommended)**

   ```bash
   python3 convert_hf_to_gguf.py \
       --outfile model-name-be.f16.gguf \
       --outtype f16 \
       --bigendian \
       model-directory/
   ```

   For example:

   ```bash
   python3 convert_hf_to_gguf.py \
       --outfile granite-3.3-2b-instruct-be.f16.gguf \
       --outtype f16 \
       --bigendian \
       granite-3.3-2b-instruct/
   ```

3. **Convert an existing GGUF Little-Endian model to Big-Endian**

   ```bash
   python3 gguf-py/gguf/scripts/gguf_convert_endian.py model-name.f16.gguf BIG
   ```

   For example:

   ```bash
   python3 gguf-py/gguf/scripts/gguf_convert_endian.py granite-3.3-2b-instruct-le.f16.gguf BIG
   mv granite-3.3-2b-instruct-le.f16.gguf granite-3.3-2b-instruct-be.f16.gguf
   ```

**Notes:**

- The GGUF endian-conversion script may not support all data types at the moment and may fail for some models/quantizations. When that happens, please try manually converting the safetensors model to GGUF Big-Endian via Step 2.

## IBM Accelerators

### 1. SIMD Acceleration

Available only on IBM z15 or later systems, with the `-DGGML_VXE=ON` compile flag (turned on by default). No hardware acceleration is possible with llama.cpp on older systems such as IBM z14 or EC13; on those systems the APIs still run, but fall back to a scalar implementation. A quick way to check whether the hardware exposes the vector facilities is sketched below.
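
A rough check, assuming Linux on IBM Z (feature-flag names are as commonly reported by the kernel):

```bash
# The 'features' line of /proc/cpuinfo should list 'vx' (vector facility,
# z13+) and 'vxe' (vector-enhancements facility, z14+) when SIMD is usable.
grep -m1 '^features' /proc/cpuinfo
```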

### 2. zDNN Accelerator

*Only available on IBM z16 or later systems. No direction at the moment.*

### 3. Spyre Accelerator

*No direction at the moment.*

## Performance Tuning

### 1. Virtualization Setup

It is strongly recommended to use only LPAR (Type-1) virtualization to get the most performance.

Note: Type-2 virtualization is not supported at the moment; while you can get it running, performance will not be the best.

### 2. IFL (Core) Count

It is recommended to allocate a minimum of 8 shared IFLs to the LPAR. Increasing the IFL count past 8 shared IFLs improves prompt-processing performance only, not token generation.

Note: IFL count does not equate to vCPU count.

### 3. SMT vs NOSMT (Simultaneous Multithreading)

It is strongly recommended to disable SMT via the kernel boot parameters, as it negatively affects performance. Please refer to your Linux distribution's guide on disabling SMT via kernel boot parameters; a minimal sketch follows.
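
A minimal sketch assuming a system booted via zipl (the usual s390x boot loader); file locations and exact steps vary by distribution, so treat this as illustrative rather than authoritative:

```bash
# 1. Append the generic 'nosmt' kernel parameter, e.g. in /etc/zipl.conf:
#      parameters="... nosmt"
# 2. Rewrite the boot record and reboot:
sudo zipl
sudo reboot

# After rebooting, verify that SMT is off (expect 1 thread per core):
lscpu | grep -i 'thread(s) per core'
```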

### 4. BLAS vs NOBLAS

IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongly recommended to use BLAS.

## Getting Help on IBM Z & LinuxONE

1. **Bugs, Feature Requests**

   Please file an issue in llama.cpp and ensure that the title contains "s390x".

2. **Other Questions**

   Please reach out directly to [[email protected]](mailto:[email protected]).

ggml/src/ggml-backend-reg.cpp

Lines changed: 5 additions & 0 deletions

```diff
@@ -69,6 +69,9 @@
 #if defined(__clang__)
 #  pragma clang diagnostic push
 #  pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#elif defined(__GNUC__)
+#  pragma GCC diagnostic push
+#  pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 #endif
 
 namespace fs = std::filesystem;
@@ -91,6 +94,8 @@ static std::string path_str(const fs::path & path) {
 
 #if defined(__clang__)
 #  pragma clang diagnostic pop
+#elif defined(__GNUC__)
+#  pragma GCC diagnostic pop
 #endif
 
 #ifdef _WIN32
```

ggml/src/ggml-cpu/ggml-cpu-impl.h

Lines changed: 4 additions & 4 deletions

```diff
@@ -371,7 +371,7 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
 #define vec_xor(a, b) ((a) ^ (b)) // Vector XOR
 #endif
 
-typedef signed char char8x16_t __attribute__((vector_size(16)));
+typedef signed char   char8x16_t __attribute__((vector_size(16)));
 typedef unsigned char uchar8x16_t __attribute__((vector_size(16)));
 
 typedef int8_t int8x16_t __attribute__((vector_size(16)));
@@ -382,10 +382,10 @@ typedef uint8_t uint8x16_t __attribute__((vector_size(16)));
 typedef uint16_t uint16x8_t __attribute__((vector_size(16)));
 typedef uint32_t uint32x4_t __attribute__((vector_size(16)));
 
-typedef float float32x4_t __attribute__((vector_size(16)));
-typedef double double64x2_t __attribute((vector_size(16)));
+typedef float  float32x4_t  __attribute__((vector_size(16)));
+typedef double double64x2_t __attribute__((vector_size(16)));
 
-typedef signed long long long64x2_t __attribute((vector_size(16)));
+typedef signed long long   long64x2_t __attribute__((vector_size(16)));
 typedef unsigned long long ulong64x2_t __attribute__((vector_size(16)));
 
 typedef struct ggml_uint8x16x2_t {
```
