Commit c8139c2

Merge pull request #262 from menloresearch/update-dev-from-master-2025-09-26-00-33
Sync master with upstream release b6586
2 parents 00a21a5 + 835b2b9 commit c8139c2

48 files changed: +5877 −2429 lines

.devops/s390x.Dockerfile

Lines changed: 13 additions & 12 deletions
@@ -2,10 +2,10 @@ ARG GCC_VERSION=15.2.0
 ARG UBUNTU_VERSION=24.04
 
 ### Build Llama.cpp stage
-FROM --platform=linux/s390x gcc:${GCC_VERSION} AS build
+FROM gcc:${GCC_VERSION} AS build
 
-RUN --mount=type=cache,target=/var/cache/apt \
-    --mount=type=cache,target=/var/lib/apt/lists \
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
     apt update -y && \
     apt upgrade -y && \
     apt install -y --no-install-recommends \
@@ -40,7 +40,7 @@ COPY requirements /opt/llama.cpp/gguf-py/requirements
 
 
 ### Collect all llama.cpp binaries, libraries and distro libraries
-FROM --platform=linux/s390x scratch AS collector
+FROM scratch AS collector
 
 # Copy llama.cpp binaries and libraries
 COPY --from=build /opt/llama.cpp/bin /llama.cpp/bin
@@ -49,13 +49,14 @@ COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py
 
 
 ### Base image
-FROM --platform=linux/s390x ubuntu:${UBUNTU_VERSION} AS base
+FROM ubuntu:${UBUNTU_VERSION} AS base
 
-RUN --mount=type=cache,target=/var/cache/apt \
-    --mount=type=cache,target=/var/lib/apt/lists \
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
     apt update -y && \
     apt install -y --no-install-recommends \
     # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
+    # See: https://github.com/ggml-org/llama.cpp/pull/15915#issuecomment-3317166506
     curl libgomp1 libopenblas-dev && \
     apt autoremove -y && \
     apt clean -y && \
@@ -68,13 +69,13 @@ COPY --from=collector /llama.cpp/lib /usr/lib/s390x-linux-gnu
 
 
 ### Full
-FROM --platform=linux/s390x base AS full
+FROM base AS full
 
 ENV PATH="/root/.cargo/bin:${PATH}"
 WORKDIR /app
 
-RUN --mount=type=cache,target=/var/cache/apt \
-    --mount=type=cache,target=/var/lib/apt/lists \
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
     apt update -y && \
     apt install -y \
     git cmake libjpeg-dev \
@@ -97,7 +98,7 @@ ENTRYPOINT [ "/app/tools.sh" ]
 
 
 ### CLI Only
-FROM --platform=linux/s390x base AS light
+FROM base AS light
 
 WORKDIR /llama.cpp/bin
 
@@ -108,7 +109,7 @@ ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]
 
 
 ### Server
-FROM --platform=linux/s390x base AS server
+FROM base AS server
 
 ENV LLAMA_ARG_HOST=0.0.0.0
 
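
Note on the change above: with BuildKit cache mounts, `sharing=locked` serializes access to the shared apt cache so parallel stage builds cannot corrupt it, and dropping the hardcoded `--platform=linux/s390x` lets the platform come from the build invocation instead. A minimal sketch of building and running the `server` stage (the image tag and model path are illustrative, and passing the model flag after the image name assumes the stage's entrypoint is the server binary):

    # build only the server stage of the s390x image; tag name is illustrative
    docker build --platform=linux/s390x -f .devops/s390x.Dockerfile \
        --target server -t llama-cpp-s390x-server .

    # the server stage sets LLAMA_ARG_HOST=0.0.0.0, so publishing the port suffices;
    # a model still has to be provided (placeholder path below)
    docker run --rm -p 8080:8080 -v "$PWD/models:/models" \
        llama-cpp-s390x-server -m /models/model.gguf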

.github/workflows/build.yml

Lines changed: 81 additions & 8 deletions
@@ -1251,56 +1251,129 @@ jobs:
   # TODO: simplify the following workflows using a matrix
   # TODO: run lighter CI on PRs and the full CI only on master (if needed)
   ggml-ci-x64-cpu-low-perf:
-    runs-on: [self-hosted, Linux, X64, CPU, low-perf]
+    runs-on: ubuntu-22.04
 
     steps:
       - name: Clone
         id: checkout
         uses: actions/checkout@v4
 
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: ggml-ci-x64-cpu-low-perf
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential libcurl4-openssl-dev
+
       - name: Test
         id: ggml-ci
         run: |
-          bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 
   ggml-ci-arm64-cpu-low-perf:
-    runs-on: [self-hosted, Linux, ARM64, CPU, low-perf]
+    runs-on: ubuntu-22.04-arm
 
     steps:
       - name: Clone
         id: checkout
         uses: actions/checkout@v4
 
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: ggml-ci-arm64-cpu-low-perf
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential libcurl4-openssl-dev
+
       - name: Test
         id: ggml-ci
         run: |
-          bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 
   ggml-ci-x64-cpu-high-perf:
-    runs-on: [self-hosted, Linux, X64, CPU, high-perf]
+    runs-on: ubuntu-22.04
 
     steps:
       - name: Clone
         id: checkout
         uses: actions/checkout@v4
 
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: ggml-ci-x64-cpu-high-perf
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential libcurl4-openssl-dev
+
       - name: Test
         id: ggml-ci
         run: |
-          bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+          LLAMA_ARG_THREADS=$(nproc) bash ./ci/run.sh ./tmp/results ./tmp/mnt
 
   ggml-ci-arm64-cpu-high-perf:
-    runs-on: [self-hosted, Linux, ARM64, CPU, high-perf]
+    runs-on: ubuntu-22.04-arm
 
     steps:
       - name: Clone
         id: checkout
         uses: actions/checkout@v4
 
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: ggml-ci-arm64-cpu-high-perf
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential libcurl4-openssl-dev
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+
+  ggml-ci-arm64-cpu-high-perf-sve:
+    runs-on: ubuntu-22.04-arm
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: ggml-ci-arm64-cpu-high-perf-sve
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential libcurl4-openssl-dev
+
       - name: Test
         id: ggml-ci
         run: |
-          GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 
   ggml-ci-x64-nvidia-cuda:
     runs-on: [self-hosted, Linux, X64, NVIDIA]
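
Since these jobs now run on stock GitHub-hosted runners rather than self-hosted hardware, the same entrypoint can be reproduced locally: `./ci/run.sh` takes an output directory and a mount/cache directory as its two positional arguments. A rough local equivalent of the low-perf CPU job, assuming a Debian/Ubuntu host with the repo checked out at the current directory:

    # install the same build dependencies the workflow installs
    sudo apt-get update
    sudo apt-get install build-essential libcurl4-openssl-dev

    # run the lighter CI suite with all available threads
    LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt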

CODEOWNERS

Lines changed: 1 addition & 1 deletion
@@ -63,7 +63,7 @@
 /ggml/src/ggml-quants.* @ggerganov
 /ggml/src/ggml-threading.* @ggerganov @slaren
 /ggml/src/ggml-vulkan/ @0cc4m
-/ggml/src/ggml-zdnn/ @taronaeo
+/ggml/src/ggml-zdnn/ @taronaeo @Andreas-Krebbel @AlekseiNikiforovIBM
 /ggml/src/ggml.c @ggerganov @slaren
 /ggml/src/ggml.cpp @ggerganov @slaren
 /ggml/src/gguf.cpp @JohannesGaessler @Green-Sky

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ The project differentiates between 3 levels of contributors:
 - Squash-merge PRs
 - Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
 - Optionally pick a `<module>` from here: https://github.com/ggml-org/llama.cpp/wiki/Modules
-- Let other maintainers, merge their own PRs
+- Let other maintainers merge their own PRs
 - When merging a PR, make sure you have a good understanding of the changes
 - Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you)

README.md

Lines changed: 1 addition & 0 deletions
@@ -178,6 +178,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
 - React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
 - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
+- Java: [QuasarByte/llama-cpp-jna](https://github.com/QuasarByte/llama-cpp-jna)
 - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
 - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
 - Flutter: [xuegao-tzx/Fllama](https://github.com/xuegao-tzx/Fllama)

ci/run.sh

Lines changed: 16 additions & 11 deletions
@@ -109,6 +109,11 @@ if [ ! -z ${GG_BUILD_MUSA} ]; then
     MUSA_ARCH=${MUSA_ARCH:-21}
     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
 fi
+
+if [ ! -z ${GG_BUILD_NO_SVE} ]; then
+    # arm 9 and newer enables sve by default, adjust these flags depending on the cpu used
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm"
+fi
 ## helpers
 
 # download a file if it does not exist or if it is outdated
@@ -345,16 +350,16 @@ function gg_run_qwen3_0_6b {
 
     wiki_test="${path_wiki}/wiki.test.raw"
 
-    ./bin/llama-quantize ${model_bf16} ${model_q8_0} q8_0
-    ./bin/llama-quantize ${model_bf16} ${model_q4_0} q4_0
-    ./bin/llama-quantize ${model_bf16} ${model_q4_1} q4_1
-    ./bin/llama-quantize ${model_bf16} ${model_q5_0} q5_0
-    ./bin/llama-quantize ${model_bf16} ${model_q5_1} q5_1
-    ./bin/llama-quantize ${model_bf16} ${model_q2_k} q2_k
-    ./bin/llama-quantize ${model_bf16} ${model_q3_k} q3_k
-    ./bin/llama-quantize ${model_bf16} ${model_q4_k} q4_k
-    ./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k
-    ./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k
+    ./bin/llama-quantize ${model_bf16} ${model_q8_0} q8_0 $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q4_0} q4_0 $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q4_1} q4_1 $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q5_0} q5_0 $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q5_1} q5_1 $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q2_k} q2_k $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q3_k} q3_k $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q4_k} q4_k $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k $(nproc)
 
     (time ./bin/llama-cli -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
     (time ./bin/llama-cli -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
@@ -427,7 +432,7 @@ function gg_run_qwen3_0_6b {
 function gg_sum_qwen3_0_6b {
     gg_printf '### %s\n\n' "${ci}"
 
-    gg_printf 'Pythia 2.8B:\n'
+    gg_printf 'Qwen3 0.6B:\n'
     gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
     gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
     gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
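
The quantize calls above add an explicit thread count: as the diff shows, `llama-quantize` accepts an optional trailing `nthreads` positional argument after the quantization type. A standalone example of the same invocation outside the CI harness (model filenames are placeholders):

    # quantize a bf16 GGUF to q4_0 using all available cores
    ./bin/llama-quantize ./models/model-bf16.gguf ./models/model-q4_0.gguf q4_0 $(nproc)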

common/common.cpp

Lines changed: 3 additions & 5 deletions
@@ -961,15 +961,13 @@ struct common_init_result common_init_from_params(common_params & params) {
 
         bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
         bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
+        bool has_rerank_prompt = llama_model_chat_template(model, "rerank") != NULL;
 
-        if (!has_eos && !has_sep) {
-            LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
+        if (!has_eos && !has_sep && !has_rerank_prompt) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token, SEP token, or rerank prompt. Reranking will not work\n", __func__);
             ok = false;
         } else if (!has_eos) {
             LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
-        } else if (!has_sep) {
-            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
-            ok = false;
         }
 
         if (!ok) {
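
The reworked check means a missing SEP token is no longer fatal on its own: a model can instead ship a dedicated "rerank" chat template, detected here via `llama_model_chat_template(model, "rerank")`. A hedged sketch of exercising reranking end to end (the model path and scores are placeholders; the `--reranking` flag and `/v1/rerank` route exist in current llama-server to my knowledge, but verify against your build):

    # start a server with a reranker model (path is a placeholder)
    ./bin/llama-server -m ./models/reranker.gguf --reranking --port 8080 &

    # score candidate documents against a query
    curl http://localhost:8080/v1/rerank -H 'Content-Type: application/json' -d '{
        "query": "What is the capital of France?",
        "documents": ["Paris is the capital of France.", "Berlin is in Germany."]
    }'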

common/common.h

Lines changed: 1 addition & 1 deletion
@@ -738,7 +738,7 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 // MoE utils
 //
 
-const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps";
+const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";
 
 static std::string llm_ffn_exps_block_regex(int idx) {
     return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
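
The widened pattern now also matches `_chexps` expert-tensor names while the empty branch in `(ch|)` keeps plain `_exps` names matching as before. A quick sanity check of the pattern outside C++ (tensor names are illustrative; the empty alternation branch works in GNU grep's ERE):

    # the first two names match; the dense blk.0.ffn_up.weight does not
    printf '%s\n' blk.0.ffn_up_exps.weight blk.0.ffn_up_chexps.weight blk.0.ffn_up.weight \
        | grep -E 'blk\.0\.ffn_(up|down|gate)_(ch|)exps'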
