Commit bb46646

Merge branch 'ggml-org:master' into ci

2 parents a9a6809 + f505bd8

38 files changed: +847 −475 lines

.clang-tidy

Lines changed: 1 addition & 0 deletions

@@ -17,6 +17,7 @@ Checks: >
     clang-analyzer-*,
     -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
     performance-*,
+    -performance-enum-size,
     portability-*,
     -portability-simd-intrinsics,
     misc-*,

.devops/s390x.Dockerfile

Lines changed: 122 additions & 0 deletions

@@ -0,0 +1,122 @@
ARG GCC_VERSION=15.2.0
ARG UBUNTU_VERSION=24.04

### Build Llama.cpp stage
FROM --platform=linux/s390x gcc:${GCC_VERSION} AS build

RUN --mount=type=cache,target=/var/cache/apt \
    --mount=type=cache,target=/var/lib/apt/lists \
    apt update -y && \
    apt upgrade -y && \
    apt install -y --no-install-recommends \
        git cmake ccache ninja-build \
        # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
        libopenblas-dev libcurl4-openssl-dev && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /app
COPY . .

RUN --mount=type=cache,target=/root/.ccache \
    --mount=type=cache,target=/app/build \
    cmake -S . -B build -G Ninja \
        -DCMAKE_BUILD_TYPE=Release \
        -DCMAKE_C_COMPILER_LAUNCHER=ccache \
        -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
        -DLLAMA_BUILD_TESTS=OFF \
        -DGGML_BACKEND_DL=OFF \
        -DGGML_NATIVE=OFF \
        -DGGML_BLAS=ON \
        -DGGML_BLAS_VENDOR=OpenBLAS && \
    cmake --build build --config Release -j $(nproc) && \
    cmake --install build --prefix /opt/llama.cpp

COPY *.py /opt/llama.cpp/bin
COPY .devops/tools.sh /opt/llama.cpp/bin

COPY gguf-py /opt/llama.cpp/gguf-py
COPY requirements.txt /opt/llama.cpp/gguf-py
COPY requirements /opt/llama.cpp/gguf-py/requirements


### Collect all llama.cpp binaries, libraries and distro libraries
FROM --platform=linux/s390x scratch AS collector

# Copy llama.cpp binaries and libraries
COPY --from=build /opt/llama.cpp/bin /llama.cpp/bin
COPY --from=build /opt/llama.cpp/lib /llama.cpp/lib
COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py


### Base image
FROM --platform=linux/s390x ubuntu:${UBUNTU_VERSION} AS base

RUN --mount=type=cache,target=/var/cache/apt \
    --mount=type=cache,target=/var/lib/apt/lists \
    apt update -y && \
    apt install -y --no-install-recommends \
        # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
        curl libgomp1 libopenblas-dev && \
    apt autoremove -y && \
    apt clean -y && \
    rm -rf /tmp/* /var/tmp/* && \
    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
    find /var/cache -type f -delete

# Copy llama.cpp libraries
COPY --from=collector /llama.cpp/lib /usr/lib/s390x-linux-gnu


### Full
FROM --platform=linux/s390x base AS full

ENV PATH="/root/.cargo/bin:${PATH}"
WORKDIR /app

RUN --mount=type=cache,target=/var/cache/apt \
    --mount=type=cache,target=/var/lib/apt/lists \
    apt update -y && \
    apt install -y \
        git cmake libjpeg-dev \
        python3 python3-pip python3-dev && \
    apt autoremove -y && \
    apt clean -y && \
    rm -rf /tmp/* /var/tmp/* && \
    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
    find /var/cache -type f -delete

RUN curl https://sh.rustup.rs -sSf | bash -s -- -y

COPY --from=collector /llama.cpp/bin /app
COPY --from=collector /llama.cpp/gguf-py /app/gguf-py

RUN pip install --no-cache-dir --break-system-packages \
        -r /app/gguf-py/requirements.txt

ENTRYPOINT [ "/app/tools.sh" ]


### CLI Only
FROM --platform=linux/s390x base AS light

WORKDIR /llama.cpp/bin

# Copy llama.cpp binaries and libraries
COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin

ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]


### Server
FROM --platform=linux/s390x base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

WORKDIR /llama.cpp/bin

# Copy llama.cpp binaries and libraries
COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin

EXPOSE 8080

ENTRYPOINT [ "/llama.cpp/bin/llama-server" ]
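
For orientation, a minimal local sketch of using this Dockerfile; the image tag, model path, and reliance on a buildx/QEMU s390x setup are assumptions for illustration, not part of the change:

    # build the server-only stage for s390x
    docker build --platform=linux/s390x \
        -f .devops/s390x.Dockerfile --target server \
        -t llama.cpp:server-s390x .

    # run it, mounting a local GGUF model (placeholder path)
    docker run --rm -p 8080:8080 -v "$PWD/models:/models" \
        llama.cpp:server-s390x -m /models/model.gguf

The same build command with --target light or --target full selects the CLI-only or full image defined above.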

.github/workflows/build.yml

Lines changed: 42 additions & 68 deletions

@@ -1350,8 +1350,8 @@ jobs:
         run: |
           LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

-  ggml-ci-x64-nvidia-v100-cuda:
-    runs-on: [self-hosted, Linux, X64, NVIDIA, V100]
+  ggml-ci-x64-nvidia-cuda:
+    runs-on: [self-hosted, Linux, X64, NVIDIA]

     steps:
       - name: Clone

@@ -1364,8 +1364,8 @@
           nvidia-smi
           GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

-  ggml-ci-x64-nvidia-v100-vulkan:
-    runs-on: [self-hosted, Linux, X64, NVIDIA, V100]
+  ggml-ci-x64-nvidia-vulkan-cm:
+    runs-on: [self-hosted, Linux, X64, NVIDIA]

     steps:
       - name: Clone

@@ -1375,25 +1375,11 @@
       - name: Test
         id: ggml-ci
         run: |
-          vulkaninfo
-          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
-
-  ggml-ci-x64-nvidia-t4-cuda:
-    runs-on: [self-hosted, Linux, X64, NVIDIA, T4]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          nvidia-smi
-          GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+          vulkaninfo --summary
+          GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

-  ggml-ci-x64-nvidia-t4-vulkan:
-    runs-on: [self-hosted, Linux, X64, NVIDIA, T4]
+  ggml-ci-x64-nvidia-vulkan-cm2:
+    runs-on: [self-hosted, Linux, X64, NVIDIA, COOPMAT2]

     steps:
       - name: Clone

@@ -1403,23 +1389,9 @@
       - name: Test
         id: ggml-ci
         run: |
-          vulkaninfo
+          vulkaninfo --summary
           GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

-  ggml-ci-x64-nvidia-t4-vulkan-coopmat1:
-    runs-on: [self-hosted, Linux, X64, NVIDIA, T4]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          vulkaninfo
-          GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
-
   ggml-ci-x64-cpu-amx:
     runs-on: [self-hosted, Linux, X64, CPU, AMX]


@@ -1433,21 +1405,36 @@
         run: |
           bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

-  ggml-ci-x64-amd-v710-vulkan:
-    runs-on: [self-hosted, Linux, X64, AMD, V710]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+  # ggml-ci-x64-amd-vulkan:
+  #   runs-on: [self-hosted, Linux, X64, AMD]
+  #
+  #   steps:
+  #     - name: Clone
+  #       id: checkout
+  #       uses: actions/checkout@v4
+  #
+  #     - name: Test
+  #       id: ggml-ci
+  #       run: |
+  #         vulkaninfo --summary
+  #         GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+  #
+  # ggml-ci-x64-amd-rocm:
+  #   runs-on: [self-hosted, Linux, X64, AMD]
+  #
+  #   steps:
+  #     - name: Clone
+  #       id: checkout
+  #       uses: actions/checkout@v4
+  #
+  #     - name: Test
+  #       id: ggml-ci
+  #       run: |
+  #         amd-smi static
+  #         GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

-  ggml-ci-x64-amd-v710-rocm:
-    runs-on: [self-hosted, Linux, X64, AMD, V710]
+  ggml-ci-mac-metal:
+    runs-on: [self-hosted, macOS, ARM64]

     steps:
       - name: Clone

@@ -1457,9 +1444,9 @@
       - name: Test
         id: ggml-ci
         run: |
-          GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+          GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

-  ggml-ci-mac-metal:
+  ggml-ci-mac-vulkan:
     runs-on: [self-hosted, macOS, ARM64]

     steps:

@@ -1470,18 +1457,5 @@
       - name: Test
         id: ggml-ci
         run: |
-          GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-
-  # TODO: install vulkan drivers
-  # ggml-ci-mac-vulkan:
-  #   runs-on: [self-hosted, macOS, ARM64]
-  #
-  #   steps:
-  #     - name: Clone
-  #       id: checkout
-  #       uses: actions/checkout@v4
-  #
-  #     - name: Test
-  #       id: ggml-ci
-  #       run: |
-  #         GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+          vulkaninfo --summary
+          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
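
As a reference, the renamed ggml-ci-x64-nvidia-vulkan-cm job boils down to the two commands below; running them from a llama.cpp checkout should approximate the CI run (the result and mount paths here are placeholders for the self-hosted runner's ~/results and /mnt directories):

    vulkaninfo --summary
    GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt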

.github/workflows/docker.yml

Lines changed: 1 addition & 0 deletions

@@ -44,6 +44,7 @@ jobs:
           - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
           - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
           - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
+          - { tag: "s390x", dockerfile: ".devops/s390x.Dockerfile", platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false }
           # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
           #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }
     steps:
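
Once the workflow picks up this matrix entry, the published images would presumably follow the existing variant-plus-tag naming, e.g. a server-s390x tag next to full-s390x and light-s390x; the exact tag name is an assumption here, not confirmed by this diff:

    # assumed tag; check the registry for what the workflow actually publishes
    docker pull ghcr.io/ggml-org/llama.cpp:server-s390x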

CODEOWNERS

Lines changed: 1 addition & 0 deletions

@@ -34,6 +34,7 @@
 /examples/llama.vim @ggerganov
 /examples/lookahead/ @ggerganov
 /examples/lookup/ @JohannesGaessler
+/examples/model-conversion/ @danbev
 /examples/parallel/ @ggerganov
 /examples/passkey/ @ggerganov
 /examples/retrieval/ @ggerganov

README.md

Lines changed: 1 addition & 0 deletions

@@ -274,6 +274,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [Vulkan](docs/build.md#vulkan) | GPU |
 | [CANN](docs/build.md#cann) | Ascend NPU |
 | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
+| [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE |
 | [WebGPU [In Progress]](docs/build.md#webgpu) | All |
 | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |

ci/run.sh

Lines changed: 6 additions & 0 deletions

@@ -92,6 +92,12 @@ fi

 if [ ! -z ${GG_BUILD_VULKAN} ]; then
     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
+
+    # if on Mac, disable METAL
+    if [[ "$OSTYPE" == "darwin"* ]]; then
+        CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF -DGGML_BLAS=OFF"
+    fi
+
 fi

 if [ ! -z ${GG_BUILD_WEBGPU} ]; then
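
This matches the new ggml-ci-mac-vulkan job above: when Vulkan builds run on macOS, the script now forces -DGGML_METAL=OFF -DGGML_BLAS=OFF so the Vulkan backend is the one exercised. A rough local equivalent on an Apple Silicon machine with a Vulkan driver installed (paths are placeholders):

    vulkaninfo --summary
    GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp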

convert_hf_to_gguf.py

Lines changed: 20 additions & 2 deletions

@@ -7656,6 +7656,21 @@ def __init__(self, *args, **kwargs):
             if i not in self._attn_layers
         ]

+        # There are some models in this family that are non-hybrid, but keep the
+        # same parent class by setting all layers to "attention." If this is the
+        # case, the model architecture needs to be updated to a standard
+        # "granite" or "granitemoe" model
+        if not self._ssm_layers:
+            has_experts = self.find_hparam(["num_experts_per_tok"], optional=True)
+            new_arch = (
+                gguf.MODEL_ARCH.GRANITE_MOE
+                if has_experts else
+                gguf.MODEL_ARCH.GRANITE
+            )
+            self.model_arch = new_arch
+            self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[new_arch]
+            self.gguf_writer.add_architecture()
+
         # n_group and d_inner are used during reshape_tensors for mamba2
         # NOTE: Explicitly include hparam prefix prefix for d_model to
         # disambiguate with top-level head_dim

@@ -7740,8 +7755,11 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_rope_dimension_count(rope_dim)
         self.gguf_writer.add_head_count_kv(head_count_kv_vec)

-        ## If Bamba, use rope, otherwise don't
-        use_rope = "BambaForCausalLM" in self.hparams["architectures"]
+        ## If Bamba or non-hybrid, use rope, otherwise don't
+        use_rope = (
+            "BambaForCausalLM" in self.hparams["architectures"]
+            or not self._ssm_layers
+        )
         self.gguf_writer.add_rope_scaling_finetuned(use_rope)
         if not use_rope:
             self.gguf_writer.add_context_length(2**20)
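
In practice this means a Granite-family checkpoint whose layers are all attention (no SSM layers) now converts to a plain "granite" or "granitemoe" GGUF instead of the hybrid architecture, and RoPE is enabled for it. A rough invocation sketch, with a placeholder model path and output name:

    python3 convert_hf_to_gguf.py /path/to/granite-checkpoint \
        --outfile granite-f16.gguf --outtype f16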
