
Commit b59b5db

Merge commit '456af35eb70177b8dd5779b6d4c21bb020f9cebd' into concedo_experimental
# Conflicts:
#   ggml/src/ggml-sycl/getrows.cpp
#   src/CMakeLists.txt
#   tools/llama-bench/llama-bench.cpp
2 parents: 0ad95e8 + 456af35

28 files changed: +1404, -497 lines

Makefile

Lines changed: 1 addition & 1 deletion
@@ -675,7 +675,7 @@ embeddings_default.o: otherarch/embeddings_adapter.cpp
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 # idiotic "for easier compilation"
-GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp otherarch/llama_v3.cpp src/llama.cpp src/llama-impl.cpp src/llama-chat.cpp src/llama-mmap.cpp src/llama-context.cpp src/llama-adapter.cpp src/llama-arch.cpp src/llama-batch.cpp src/llama-vocab.cpp src/llama-grammar.cpp src/llama-sampling.cpp src/llama-kv-cache-unified.cpp src/llama-kv-cache-unified-iswa.cpp src/llama-kv-cache-recurrent.cpp src/llama-model-loader.cpp src/llama-model.cpp src/llama-quant.cpp src/llama-hparams.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml/include/ggml.h ggml/include/ggml-cpu.h ggml/include/ggml-cuda.h include/llama.h otherarch/llama-util.h
+GPTTYPE_ADAPTER = gpttype_adapter.cpp otherarch/llama_v2.cpp otherarch/llama_v3.cpp src/llama.cpp src/llama-impl.cpp src/llama-chat.cpp src/llama-mmap.cpp src/llama-context.cpp src/llama-adapter.cpp src/llama-arch.cpp src/llama-batch.cpp src/llama-vocab.cpp src/llama-grammar.cpp src/llama-sampling.cpp src/llama-kv-cache-unified.cpp src/llama-kv-cache-unified-iswa.cpp src/llama-memory-hybrid.cpp src/llama-memory-recurrent.cpp src/llama-model-loader.cpp src/llama-model.cpp src/llama-quant.cpp src/llama-hparams.cpp otherarch/gptj_v1.cpp otherarch/gptj_v2.cpp otherarch/gptj_v3.cpp otherarch/gpt2_v1.cpp otherarch/gpt2_v2.cpp otherarch/gpt2_v3.cpp otherarch/rwkv_v2.cpp otherarch/rwkv_v3.cpp otherarch/neox_v2.cpp otherarch/neox_v3.cpp otherarch/mpt_v3.cpp ggml/include/ggml.h ggml/include/ggml-cpu.h ggml/include/ggml-cuda.h include/llama.h otherarch/llama-util.h
 gpttype_adapter_failsafe.o: $(GPTTYPE_ADAPTER)
 	$(CXX) $(CXXFLAGS) $(FAILSAFE_FLAGS) -c $< -o $@
 gpttype_adapter.o: $(GPTTYPE_ADAPTER)
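As an illustration only (assuming the usual GNU make workflow in this fork), the objects that depend on the updated `GPTTYPE_ADAPTER` file list can be rebuilt on their own:

```bash
# Rebuild only the adapter objects affected by the GPTTYPE_ADAPTER change above.
make gpttype_adapter.o gpttype_adapter_failsafe.o -j"$(nproc)"
```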

common/common.cpp

Lines changed: 6 additions & 0 deletions
@@ -714,11 +714,17 @@ bool fs_validate_filename(const std::string & filename) {
     // disable C++17 deprecation warning for std::codecvt_utf8
 #    pragma clang diagnostic push
 #    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 #endif
+
     std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
 
 #if defined(__clang__)
 #    pragma clang diagnostic pop
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic pop
 #endif
 
     filename_utf32 = converter.from_bytes(filename);

convert_hf_to_gguf.py

Lines changed: 11 additions & 9 deletions
@@ -6389,8 +6389,8 @@ def parse_args() -> argparse.Namespace:
         help="model is executed on big endian machine",
     )
     parser.add_argument(
-        "model", type=Path,
-        help="directory containing model file",
+        "model", type=str,
+        help="directory containing model file or huggingface repository ID (if --remote)",
         nargs="?",
     )
     parser.add_argument(
@@ -6493,18 +6493,20 @@ def main() -> None:
     else:
         logging.basicConfig(level=logging.INFO)
 
-    dir_model = args.model
-
     if args.remote:
+        hf_repo_id = args.model
         from huggingface_hub import snapshot_download
         local_dir = snapshot_download(
-            repo_id=str(dir_model),
+            repo_id=hf_repo_id,
            allow_patterns=["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"])
         dir_model = Path(local_dir)
         logger.info(f"Downloaded config and tokenizer to {local_dir}")
+    else:
+        hf_repo_id = None
+        dir_model = Path(args.model)
 
     if not dir_model.is_dir():
-        logger.error(f'Error: {args.model} is not a directory')
+        logger.error(f'Error: {dir_model} is not a directory')
         sys.exit(1)
 
     ftype_map: dict[str, gguf.LlamaFileType] = {
@@ -6524,9 +6526,9 @@ def main() -> None:
 
     if args.outfile is not None:
         fname_out = args.outfile
-    elif args.remote:
+    elif hf_repo_id:
         # if remote, use the model ID as the output file name
-        fname_out = Path("./" + str(args.model).replace("/", "-") + "-{ftype}.gguf")
+        fname_out = Path("./" + hf_repo_id.replace("/", "-") + "-{ftype}.gguf")
     else:
         fname_out = dir_model
 
@@ -6555,7 +6557,7 @@ def main() -> None:
         split_max_tensors=args.split_max_tensors,
         split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
         small_first_shard=args.no_tensor_first_split,
-        remote_hf_model_id=str(args.model) if args.remote else None)
+        remote_hf_model_id=hf_repo_id)
 
     if args.vocab_only:
         logger.info("Exporting model vocab...")
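As an illustrative sketch of the behaviour changed above (the repository ID and directory below are placeholders): with `--remote`, the positional `model` argument is now interpreted as a Hugging Face repository ID, while local directories continue to work as before.

```bash
# Remote: only config/tokenizer files are downloaded, and the output
# file name is derived from the repository ID.
python3 convert_hf_to_gguf.py --remote --outtype f16 some-org/some-model

# Local: pass a directory containing the safetensors model, as before.
python3 convert_hf_to_gguf.py --outtype f16 ./some-model-directory/
```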

docs/build-s390x.md

Lines changed: 157 additions & 0 deletions
@@ -0,0 +1,157 @@
> [!IMPORTANT]
> This build documentation is specific only to IBM Z & LinuxONE mainframes (s390x). You can find the build documentation for other architectures in [build.md](build.md).

# Build llama.cpp locally (for s390x)

The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](../include/llama.h).

The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server.

**To get the code:**

```bash
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
```

## CPU Build with BLAS

Building llama.cpp with BLAS support is highly recommended, as it has been shown to provide performance improvements.

```bash
cmake -S . -B build \
    -DCMAKE_BUILD_TYPE=Release \
    -DGGML_BLAS=ON \
    -DGGML_BLAS_VENDOR=OpenBLAS

cmake --build build --config Release -j $(nproc)
```

**Notes**:
- For faster repeated compilation, install [ccache](https://ccache.dev/).
- By default, VXE/VXE2 is enabled. To disable it (not recommended):

  ```bash
  cmake -S . -B build \
      -DCMAKE_BUILD_TYPE=Release \
      -DGGML_BLAS=ON \
      -DGGML_BLAS_VENDOR=OpenBLAS \
      -DGGML_VXE=OFF

  cmake --build build --config Release -j $(nproc)
  ```

- For debug builds:

  ```bash
  cmake -S . -B build \
      -DCMAKE_BUILD_TYPE=Debug \
      -DGGML_BLAS=ON \
      -DGGML_BLAS_VENDOR=OpenBLAS

  cmake --build build --config Debug -j $(nproc)
  ```

- For static builds, add `-DBUILD_SHARED_LIBS=OFF`:

  ```bash
  cmake -S . -B build \
      -DCMAKE_BUILD_TYPE=Release \
      -DGGML_BLAS=ON \
      -DGGML_BLAS_VENDOR=OpenBLAS \
      -DBUILD_SHARED_LIBS=OFF

  cmake --build build --config Release -j $(nproc)
  ```

## Getting GGUF Models

All models need to be converted to Big-Endian. There are three ways to do this:

1. **Use pre-converted models verified for use on IBM Z & LinuxONE (easiest)**

   You can find popular models pre-converted and verified at [s390x Ready Models](hf.co/collections/taronaeo/s390x-ready-models-672765393af438d0ccb72a08).

   These models and their respective tokenizers are verified to run correctly on IBM Z & LinuxONE.

2. **Convert a safetensors model to GGUF Big-Endian directly (recommended)**

   ```bash
   python3 convert_hf_to_gguf.py \
       --outfile model-name-be.f16.gguf \
       --outtype f16 \
       --bigendian \
       model-directory/
   ```

   For example:

   ```bash
   python3 convert_hf_to_gguf.py \
       --outfile granite-3.3-2b-instruct-be.f16.gguf \
       --outtype f16 \
       --bigendian \
       granite-3.3-2b-instruct/
   ```

3. **Convert an existing GGUF Little-Endian model to Big-Endian**

   ```bash
   python3 gguf-py/gguf/scripts/gguf_convert_endian.py model-name.f16.gguf BIG
   ```

   For example:

   ```bash
   python3 gguf-py/gguf/scripts/gguf_convert_endian.py granite-3.3-2b-instruct-le.f16.gguf BIG
   mv granite-3.3-2b-instruct-le.f16.gguf granite-3.3-2b-instruct-be.f16.gguf
   ```

**Notes:**
- The GGUF endian conversion script may not support all data types at the moment and may fail for some models/quantizations. When that happens, please try converting the safetensors model to GGUF Big-Endian via Step 2 instead.
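As an additional, illustrative sanity check (the binary path assumes a default CMake build directory), a converted Big-Endian model can be loaded with the main CLI:

```bash
# Load the converted model and generate a few tokens to confirm it works end to end.
./build/bin/llama-cli -m granite-3.3-2b-instruct-be.f16.gguf -p "Hello" -n 32
```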
## IBM Accelerators

### 1. SIMD Acceleration

Only available on IBM z15 or later systems with the `-DGGML_VXE=ON` compile flag (enabled by default). No hardware acceleration is possible with llama.cpp on older systems such as IBM z14 or EC13; on those systems the APIs will still run, but they fall back to a scalar implementation.
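For reference, an explicit VXE-enabled BLAS build on a z15 or newer system looks as follows (this merely spells out the default from the CPU build section above):

```bash
cmake -S . -B build \
    -DCMAKE_BUILD_TYPE=Release \
    -DGGML_BLAS=ON \
    -DGGML_BLAS_VENDOR=OpenBLAS \
    -DGGML_VXE=ON

cmake --build build --config Release -j $(nproc)
```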
### 2. zDNN Accelerator

*Only available on IBM z16 or later systems. No direction at the moment.*

### 3. Spyre Accelerator

*No direction at the moment.*

## Performance Tuning

### 1. Virtualization Setup

It is strongly recommended to use only LPAR (Type-1) virtualization to get the best performance.

Note: Type-2 virtualization is not supported at the moment. While you can get it running, the performance will not be the best.

### 2. IFL (Core) Count

It is recommended to allocate a minimum of 8 shared IFLs to the LPAR. Increasing the IFL count past 8 shared IFLs improves Prompt Processing performance but not Token Generation.

Note: IFL count does not equate to vCPU count.
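One quick way to see how many logical CPUs Linux actually exposes inside the LPAR (an illustrative check; output fields vary by distribution):

```bash
# Logical CPUs visible to Linux; compare against the IFLs assigned to the LPAR.
lscpu | grep -E '^CPU\(s\):'
nproc
```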
### 3. SMT vs NOSMT (Simultaneous Multithreading)

It is strongly recommended to disable SMT via the kernel boot parameters, as it negatively affects performance. Please refer to your Linux distribution's guide on disabling SMT via kernel boot parameters.
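A minimal sketch of one way to do this, assuming a distribution that manages kernel parameters with `grubby` (on other setups, add `nosmt` to the zipl or GRUB configuration by hand):

```bash
# 'nosmt' disables simultaneous multithreading at the next boot.
sudo grubby --update-kernel=ALL --args="nosmt"
sudo reboot

# Verify after reboot: expect "Thread(s) per core: 1" when SMT is off.
lscpu | grep -i 'thread(s) per core'
```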
### 4. BLAS vs NOBLAS

IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongly recommended to use BLAS.

## Getting Help on IBM Z & LinuxONE

1. **Bugs, Feature Requests**

   Please file an issue in llama.cpp and ensure that the title contains "s390x".

2. **Other Questions**

   Please reach out directly to [[email protected]](mailto:[email protected]).

ggml/src/ggml-backend-reg.cpp

Lines changed: 5 additions & 0 deletions
@@ -69,6 +69,9 @@
 #if defined(__clang__)
 #    pragma clang diagnostic push
 #    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 #endif
 
 namespace fs = std::filesystem;
@@ -91,6 +94,8 @@ static std::string path_str(const fs::path & path) {
 
 #if defined(__clang__)
 #    pragma clang diagnostic pop
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic pop
 #endif
 
 #ifdef _WIN32

ggml/src/ggml-cpu/ggml-cpu-impl.h

Lines changed: 4 additions & 4 deletions
@@ -371,7 +371,7 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
 #define vec_xor(a, b) ((a) ^ (b)) // Vector XOR
 #endif
 
-typedef signed char char8x16_t __attribute__((vector_size(16)));
+typedef signed char char8x16_t __attribute__((vector_size(16)));
 typedef unsigned char uchar8x16_t __attribute__((vector_size(16)));
 
 typedef int8_t int8x16_t __attribute__((vector_size(16)));
@@ -382,10 +382,10 @@ typedef uint8_t uint8x16_t __attribute__((vector_size(16)));
 typedef uint16_t uint16x8_t __attribute__((vector_size(16)));
 typedef uint32_t uint32x4_t __attribute__((vector_size(16)));
 
-typedef float float32x4_t __attribute__((vector_size(16)));
-typedef double double64x2_t __attribute((vector_size(16)));
+typedef float float32x4_t __attribute__((vector_size(16)));
+typedef double double64x2_t __attribute__((vector_size(16)));
 
-typedef signed long long long64x2_t __attribute((vector_size(16)));
+typedef signed long long long64x2_t __attribute__((vector_size(16)));
 typedef unsigned long long ulong64x2_t __attribute__((vector_size(16)));
 
 typedef struct ggml_uint8x16x2_t {

ggml/src/ggml-cpu/llamafile/sgemm.cpp

Lines changed: 54 additions & 1 deletion
@@ -62,7 +62,7 @@
 #define NOINLINE __attribute__((__noinline__))
 #endif
 
-#if defined(__ARM_NEON) || defined(__AVX512F__)
+#if defined(__ARM_NEON) || defined(__AVX512F__) || defined(__VXE__) || defined(__VXE2__)
 #define VECTOR_REGISTERS 32
 #else
 #define VECTOR_REGISTERS 16
@@ -109,6 +109,12 @@ inline float16x8_t sub(float16x8_t x, float16x8_t y) { return vsubq_f16(x, y); }
 inline float16x8_t mul(float16x8_t x, float16x8_t y) { return vmulq_f16(x, y); }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
+#if defined(__VXE__) || defined(__VXE2__)
+inline float32x4_t add(float32x4_t x, float32x4_t y) { return vec_add(x, y); }
+inline float32x4_t sub(float32x4_t x, float32x4_t y) { return vec_sub(x, y); }
+inline float32x4_t mul(float32x4_t x, float32x4_t y) { return vec_mul(x, y); }
+#endif
+
 #if defined(__MMA__)
 typedef vector unsigned char vec_t;
 typedef __vector_quad acc_t;
@@ -162,6 +168,13 @@ inline float16x8_t madd(float16x8_t a, float16x8_t b, float16x8_t c) {
 #endif
 #endif
 
+#if defined(__VXE__) || defined(__VXE2__)
+template <>
+inline float32x4_t madd(float32x4_t a, float32x4_t b, float32x4_t c) {
+    return vec_madd(a, b, c);
+}
+#endif
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // VECTORIZED HORIZONTAL SUM
 
@@ -178,6 +191,13 @@ inline float hsum(float16x8_t x) {
 }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
+#if defined(__VXE__) || defined(__VXE2__)
+inline float hsum(float32x4_t x) {
+    float32x4_t tmp = x + vec_reve(x);
+    return tmp[0] + tmp[1];
+}
+#endif
+
 #if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
 inline float hsum(__m128 x) {
 #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
@@ -227,6 +247,21 @@ template <> inline float32x4_t load(const ggml_fp16_t *p) {
 #endif // _MSC_VER
 #endif // __ARM_NEON
 
+#if defined(__VXE__) || defined(__VXE2__)
+template <> inline float32x4_t load(const ggml_fp16_t * p) {
+    float tmp[4];
+
+    for (int i = 0; i < 4; i++) {
+        tmp[i] = GGML_FP16_TO_FP32(p[i]);
+    }
+
+    return vec_xl(0, (const float *)(tmp));
+}
+template <> inline float32x4_t load(const float * p) {
+    return vec_xl(0, p);
+}
+#endif
+
 #if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
 template <> inline __m128 load(const float *p) {
     return _mm_loadu_ps(p);
@@ -3319,6 +3354,14 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
                     (const float *)B, ldb,
                     (float *)C, ldc};
         return tb.matmul(m, n);
+#elif defined(__VXE__) || defined(__VXE2__)
+        if (n < 4)
+            return false;
+        tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{ params,
+            k, (const float *)A, lda,
+            (const float *)B, ldb,
+            (float *)C, ldc};
+        return tb.matmul(m, n);
 #elif defined(__MMA__)
         if (k % 8)
             return false;
@@ -3410,6 +3453,16 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
                     (float *)C, ldc};
             return tb.matmul(m, n);
         }
+#elif defined(__VXE__) || defined(__VXE2__)
+        if (n < 4)
+            return false;
+        if (Btype == GGML_TYPE_F16) {
+            tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, ggml_fp16_t, float> tb{ params,
+                k, (const ggml_fp16_t *)A, lda,
+                (const ggml_fp16_t *)B, ldb,
+                (float *)C, ldc};
+            return tb.matmul(m, n);
+        }
 #endif
     return false;
 }

ggml/src/ggml-cpu/llamafile/sgemm.h

Lines changed: 5 additions & 0 deletions
@@ -1,6 +1,11 @@
 #pragma once
 #include <stdint.h>
 #include <stdbool.h>
+
+#if defined(__VXE__) || defined(__VXE2__)
+#include <vecintrin.h>
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
