
Commit 4941bf0 (merge mainstream)

2 parents: 69b3d9a + 7db35a7

File tree: 11 files changed, +205 -14 lines

.devops/s390x.Dockerfile

Lines changed: 4 additions & 1 deletion

@@ -24,8 +24,9 @@ RUN --mount=type=cache,target=/root/.ccache \
         -DCMAKE_C_COMPILER_LAUNCHER=ccache \
         -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
         -DLLAMA_BUILD_TESTS=OFF \
-        -DGGML_BACKEND_DL=OFF \
         -DGGML_NATIVE=OFF \
+        -DGGML_BACKEND_DL=ON \
+        -DGGML_CPU_ALL_VARIANTS=ON \
         -DGGML_BLAS=ON \
         -DGGML_BLAS_VENDOR=OpenBLAS && \
     cmake --build build --config Release -j $(nproc) && \

@@ -103,6 +104,7 @@ FROM base AS light
 WORKDIR /llama.cpp/bin
 
 # Copy llama.cpp binaries and libraries
+COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
 COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin
 
 ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]

@@ -116,6 +118,7 @@ ENV LLAMA_ARG_HOST=0.0.0.0
 WORKDIR /llama.cpp/bin
 
 # Copy llama.cpp binaries and libraries
+COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
 COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin
 
 EXPOSE 8080
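
Note on the build flags: switching to GGML_BACKEND_DL=ON with GGML_CPU_ALL_VARIANTS=ON builds each s390x CPU variant as a shared library that is chosen at runtime, which is why the light and server stages now copy the *.so files next to the binaries. A minimal sketch of the runtime side, assuming ggml's public device API from ggml-backend.h:

#include "ggml-backend.h"   // backend loading / device enumeration
#include <cstdio>

int main() {
    // in GGML_BACKEND_DL builds this locates and loads the backend
    // shared libraries (the *.so files the Dockerfile copies above)
    ggml_backend_load_all();

    // list the devices registered by whichever variant was selected
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        printf("device %zu: %s\n", i, ggml_backend_dev_name(dev));
    }
    return 0;
}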

.github/workflows/release.yml

Lines changed: 2 additions & 2 deletions

@@ -134,8 +134,8 @@ jobs:
         include:
           - build: 'x64'
             os: ubuntu-22.04
-          #- build: 's390x-z15' # z15 because our CI runners are on z15
-          #  os: ubuntu-22.04-s390x
+          #- build: 's390x'
+          #  os: ubuntu-24.04-s390x
           # GGML_BACKEND_DL and GGML_CPU_ALL_VARIANTS are not currently supported on arm
           # - build: 'arm64'
           #   os: ubuntu-22.04-arm

docs/docker.md

Lines changed: 3 additions & 3 deletions

@@ -7,9 +7,9 @@
 ## Images
 We have three Docker images available for this project:
 
-1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
-2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
-3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
+1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
+2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
+3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
 
 Additionally, there the following images, similar to the above:

ggml/src/CMakeLists.txt

Lines changed: 6 additions & 3 deletions

@@ -308,6 +308,10 @@ function(ggml_add_cpu_backend_variant tag_name)
             set(GGML_INTERNAL_${feat} ON)
         endforeach()
     elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
+        foreach (feat VXE2 NNPA)
+            set(GGML_INTERNAL_${feat} OFF)
+        endforeach()
+
         foreach (feat ${ARGN})
             set(GGML_INTERNAL_${feat} ON)
         endforeach()

@@ -377,9 +381,8 @@ if (GGML_CPU_ALL_VARIANTS)
         endif()
     elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
         if (CMAKE_SYSTEM_NAME MATCHES "Linux")
-            ggml_add_cpu_backend_variant(s390x_z15 Z15 VXE)
-            # ggml_add_cpu_backend_variant(s390x_z16 Z16 VXE)
-            # ggml_add_cpu_backend_variant(s390x_z17 Z17 VXE)
+            ggml_add_cpu_backend_variant(z15 Z15 VXE2)
+            ggml_add_cpu_backend_variant(z16 Z16 VXE2 NNPA)
         else()
             message(FATAL_ERROR "Unsupported s390x target OS: ${CMAKE_SYSTEM_NAME}")
         endif()

ggml/src/ggml-cpu/CMakeLists.txt

Lines changed: 10 additions & 3 deletions

@@ -504,11 +504,18 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             endforeach()
         endif()
 
-        if (GGML_VXE OR GGML_INTERNAL_VXE)
-            message(STATUS "VX/VXE/VXE2 enabled")
+        if (GGML_VXE OR GGML_INTERNAL_VXE2)
+            message(STATUS "VXE2 enabled")
             list(APPEND ARCH_FLAGS -mvx -mzvector)
-            list(APPEND ARCH_DEFINITIONS GGML_VXE)
+            list(APPEND ARCH_DEFINITIONS GGML_USE_VXE2)
         endif()
+
+        if (GGML_INTERNAL_NNPA)
+            message(STATUS "NNPA enabled")
+            list(APPEND ARCH_DEFINITIONS GGML_USE_NNPA)
+        endif()
+
+        ggml_add_cpu_backend_features(${GGML_CPU_NAME} s390 ${ARCH_DEFINITIONS})
     elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
         message(STATUS "Wasm detected")
         list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)

ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp (new file; path inferred from the ggml_add_cpu_backend_features(... s390 ...) call above)

Lines changed: 50 additions & 0 deletions

@@ -0,0 +1,50 @@
+#include "ggml-backend-impl.h"
+
+#if defined(__s390x__)
+#include <sys/auxv.h>
+
+// find hwcap bits in asm/elf.h
+#ifndef HWCAP_VXRS_EXT2
+#define HWCAP_VXRS_EXT2 (1 << 15)
+#endif
+
+#ifndef HWCAP_NNPA
+#define HWCAP_NNPA (1 << 20)
+#endif
+
+struct s390x_features {
+    bool has_vxe2 = false;
+    bool has_nnpa = false;
+
+    s390x_features() {
+        uint32_t hwcap = getauxval(AT_HWCAP);
+        // NOTE: use hwcap2 with DFLT for z17 and later
+        // uint32_t hwcap2 = getauxval(AT_HWCAP2);
+
+        has_vxe2 = !!(hwcap & HWCAP_VXRS_EXT2);
+        has_nnpa = !!(hwcap & HWCAP_NNPA);
+    }
+};
+
+static int ggml_backend_cpu_s390x_score() {
+    int score = 1;
+    s390x_features sf;
+
+// IBM z15 / LinuxONE 3
+#ifdef GGML_USE_VXE2
+    if (!sf.has_vxe2) { return 0; }
+    score += 1 << 1;
+#endif
+
+// IBM z16 / LinuxONE 4 and z17 / LinuxONE 5
+#ifdef GGML_USE_NNPA
+    if (!sf.has_nnpa) { return 0; }
+    score += 1 << 2;
+#endif
+
+    return score;
+}
+
+GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_s390x_score)
+
+#endif // __s390x__
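
GGML_BACKEND_DL_SCORE_IMPL exports the function above so the backend loader can compare the z15 and z16 variant libraries on the running machine: a variant returns 0 when a compiled-in feature is missing from HWCAP, otherwise a score that grows with its feature set (here z15 scores 3 with VXE2, z16 scores 7 with VXE2+NNPA). An illustrative sketch of that selection idea, not the actual loader code (names are hypothetical):

#include <vector>

// each variant library exposes an entry point equivalent to score()
struct cpu_variant {
    const char * name;
    int (*score)(); // 0 => unusable on this host
};

// keep the highest-scoring usable variant: on hardware with NNPA,
// z16 (score 7) wins over z15 (score 3); without NNPA, z16 returns 0
static const cpu_variant * pick_best(const std::vector<cpu_variant> & vs) {
    const cpu_variant * best = nullptr;
    int best_score = 0;
    for (const auto & v : vs) {
        const int s = v.score();
        if (s > best_score) {
            best_score = s;
            best = &v;
        }
    }
    return best; // nullptr if no variant runs on this machine
}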

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 16 additions & 0 deletions

@@ -2499,6 +2499,18 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_UNARY_OP_XIELU:
             ggml_cuda_op_xielu(ctx, dst);
             break;
+        case GGML_UNARY_OP_FLOOR:
+            ggml_cuda_op_floor(ctx, dst);
+            break;
+        case GGML_UNARY_OP_CEIL:
+            ggml_cuda_op_ceil(ctx, dst);
+            break;
+        case GGML_UNARY_OP_ROUND:
+            ggml_cuda_op_round(ctx, dst);
+            break;
+        case GGML_UNARY_OP_TRUNC:
+            ggml_cuda_op_trunc(ctx, dst);
+            break;
         default:
             return false;
     }

@@ -3769,6 +3781,10 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_UNARY_OP_TANH:
         case GGML_UNARY_OP_EXP:
         case GGML_UNARY_OP_ELU:
+        case GGML_UNARY_OP_FLOOR:
+        case GGML_UNARY_OP_CEIL:
+        case GGML_UNARY_OP_ROUND:
+        case GGML_UNARY_OP_TRUNC:
            return ggml_is_contiguous(op->src[0]);
         default:
             return false;
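
These cases plug the four rounding ops into the CUDA dispatch and advertise support for them on contiguous inputs. For context, a hedged sketch of how a user graph reaches them, assuming the corresponding ggml.h wrappers (ggml_floor, ggml_ceil, ggml_round, ggml_trunc) that pair with these enum values:

#include "ggml.h"

// build a graph that applies all four rounding ops to one input tensor;
// on a CUDA device each node is routed through ggml_cuda_compute_forward
static struct ggml_cgraph * build_rounding_graph(struct ggml_context * ctx,
                                                 struct ggml_tensor * x) {
    // note: supports_op above requires the source to be contiguous
    struct ggml_tensor * f = ggml_floor(ctx, x); // assumed ggml.h wrapper
    struct ggml_tensor * c = ggml_ceil (ctx, x);
    struct ggml_tensor * r = ggml_round(ctx, x);
    struct ggml_tensor * t = ggml_trunc(ctx, x);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, f);
    ggml_build_forward_expand(gf, c);
    ggml_build_forward_expand(gf, r);
    ggml_build_forward_expand(gf, t);
    return gf;
}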

ggml/src/ggml-cuda/unary.cu

Lines changed: 32 additions & 0 deletions

@@ -85,6 +85,22 @@ static __device__ __forceinline__ float op_elu(float x) {
     return (x > 0.f) ? x : expm1f(x);
 }
 
+static __device__ __forceinline__ float op_floor(float x) {
+    return floorf(x);
+}
+
+static __device__ __forceinline__ float op_ceil(float x) {
+    return ceilf(x);
+}
+
+static __device__ __forceinline__ float op_round(float x) {
+    return round(x);
+}
+
+static __device__ __forceinline__ float op_trunc(float x) {
+    return trunc(x);
+}
+
 template <float (*op)(float), typename T>
 static __global__ void unary_op_kernel(const T * x, T * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -201,6 +217,22 @@ void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 void ggml_cuda_op_elu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     ggml_cuda_op_unary<op_elu>(ctx, dst);
 }
+
+void ggml_cuda_op_floor(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_floor>(ctx, dst);
+}
+
+void ggml_cuda_op_ceil(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_ceil>(ctx, dst);
+}
+
+void ggml_cuda_op_round(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_round>(ctx, dst);
+}
+
+void ggml_cuda_op_trunc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_trunc>(ctx, dst);
+}
 /* gated ops */
 
 template <float (*op)(float), typename T>
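
All four wrappers reuse the existing unary_op_kernel template, where the op is a compile-time function-pointer parameter, so each instantiation inlines its math function rather than making an indirect call per element. A small host-side C++ analogue of the same pattern (illustrative names, not ggml code):

#include <cmath>
#include <cstdio>

// op is a template parameter, so op(x[i]) compiles to a direct,
// inlinable call in each instantiation
template <float (*op)(float)>
static void apply_unary(const float * x, float * dst, int k) {
    for (int i = 0; i < k; ++i) {
        dst[i] = op(x[i]);
    }
}

float op_floor_host(float x) { return floorf(x); }

int main() {
    const float x[4] = { -1.5f, -0.5f, 0.5f, 1.5f };
    float y[4];
    apply_unary<op_floor_host>(x, y, 4);
    printf("%g %g %g %g\n", y[0], y[1], y[2], y[3]); // -2 -1 0 1
    return 0;
}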

ggml/src/ggml-cuda/unary.cuh

Lines changed: 8 additions & 0 deletions

@@ -63,6 +63,14 @@ void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
 void ggml_cuda_op_elu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
+void ggml_cuda_op_floor(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_ceil(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_round(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_trunc(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
 void ggml_cuda_op_reglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
 void ggml_cuda_op_geglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

scripts/bench-models.sh

Lines changed: 74 additions & 0 deletions

@@ -0,0 +1,74 @@
+#!/usr/bin/env bash
+
+RESULTS="bench-models-results.txt"
+: > "$RESULTS"
+
+ARGS_BB="-c 270336 -npp 512,4096,8192 -npl 1,2,4,8,16,32 -ntg 32"
+ARGS_B="-d 0,4096,8192,16384,32768 -p 2048 -n 32"
+
+QUICK=0
+while (( "$#" )); do
+    case "$1" in
+        --quick) QUICK=1; shift ;;
+        *) shift ;;
+    esac
+done
+
+if (( QUICK )); then
+    ARGS_BB="-c 20480 -npp 512,4096 -npl 1,2,4 -ntg 32"
+    ARGS_B="-d 0 -p 2048 -n 32"
+fi
+
+run_model() {
+    local HFR=$1
+    local HFF=$2
+
+    printf "## ${HFR}\n" | tee -a "$RESULTS"
+    printf "\n" | tee -a "$RESULTS"
+    printf "Model: https://huggingface.co/${HFR}\n" | tee -a "$RESULTS"
+    printf "\n" | tee -a "$RESULTS"
+
+    printf -- "- \`llama-batched-bench\`\n" | tee -a "$RESULTS"
+    printf "\n" | tee -a "$RESULTS"
+
+    ./bin/llama-batched-bench \
+        -hfr "${HFR}" -hff "${HFF}" \
+        -m "${HFF}" -fa 1 -ub 2048 --no-mmap \
+        ${ARGS_BB} | tee -a "$RESULTS"
+
+    printf "\n" | tee -a "$RESULTS"
+
+    printf -- "- \`llama-bench\`\n" | tee -a "$RESULTS"
+    printf "\n" | tee -a "$RESULTS"
+
+    ./bin/llama-bench \
+        -m "${HFF}" -fa 1 -ub 2048 -mmp 0 \
+        ${ARGS_B} | tee -a "$RESULTS"
+
+    printf "\n" | tee -a "$RESULTS"
+
+    printf "\n"
+}
+
+run_model "ggml-org/gpt-oss-20b-GGUF" "gpt-oss-20b-mxfp4.gguf"
+run_model "ggml-org/gpt-oss-120b-GGUF" "gpt-oss-120b-mxfp4-00001-of-00003.gguf"
+run_model "ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF" "qwen3-coder-30b-a3b-instruct-q8_0.gguf"
+run_model "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF" "qwen2.5-coder-7b-q8_0.gguf"
+run_model "ggml-org/gemma-3-4b-it-qat-GGUF" "gemma-3-4b-it-qat-Q4_0.gguf"
+
+if [[ -f models-extra.txt ]]; then
+    while read -r HFR HFF; do
+        [[ -z "$HFR" ]] && continue
+        run_model "$HFR" "$HFF"
+    done < models-extra.txt
+fi
+
+printf "\n=====================================\n"
+printf "\n"
+
+cat "$RESULTS"
+
+printf "\n"
+printf "Done! Results are written to $RESULTS\n"
+printf "\n"
