Commit bb46646

Merge branch 'ggml-org:master' into ci

2 parents a9a6809 + f505bd8

38 files changed: +847 −475 lines

.clang-tidy

Lines changed: 1 addition & 0 deletions

@@ -17,6 +17,7 @@ Checks: >
     clang-analyzer-*,
     -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
     performance-*,
+    -performance-enum-size,
     portability-*,
     -portability-simd-intrinsics,
     misc-*,

.devops/s390x.Dockerfile

Lines changed: 122 additions & 0 deletions

@@ -0,0 +1,122 @@
ARG GCC_VERSION=15.2.0
ARG UBUNTU_VERSION=24.04

### Build Llama.cpp stage
FROM --platform=linux/s390x gcc:${GCC_VERSION} AS build

RUN --mount=type=cache,target=/var/cache/apt \
    --mount=type=cache,target=/var/lib/apt/lists \
    apt update -y && \
    apt upgrade -y && \
    apt install -y --no-install-recommends \
        git cmake ccache ninja-build \
        # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
        libopenblas-dev libcurl4-openssl-dev && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /app
COPY . .

RUN --mount=type=cache,target=/root/.ccache \
    --mount=type=cache,target=/app/build \
    cmake -S . -B build -G Ninja \
        -DCMAKE_BUILD_TYPE=Release \
        -DCMAKE_C_COMPILER_LAUNCHER=ccache \
        -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
        -DLLAMA_BUILD_TESTS=OFF \
        -DGGML_BACKEND_DL=OFF \
        -DGGML_NATIVE=OFF \
        -DGGML_BLAS=ON \
        -DGGML_BLAS_VENDOR=OpenBLAS && \
    cmake --build build --config Release -j $(nproc) && \
    cmake --install build --prefix /opt/llama.cpp

COPY *.py /opt/llama.cpp/bin
COPY .devops/tools.sh /opt/llama.cpp/bin

COPY gguf-py /opt/llama.cpp/gguf-py
COPY requirements.txt /opt/llama.cpp/gguf-py
COPY requirements /opt/llama.cpp/gguf-py/requirements


### Collect all llama.cpp binaries, libraries and distro libraries
FROM --platform=linux/s390x scratch AS collector

# Copy llama.cpp binaries and libraries
COPY --from=build /opt/llama.cpp/bin /llama.cpp/bin
COPY --from=build /opt/llama.cpp/lib /llama.cpp/lib
COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py


### Base image
FROM --platform=linux/s390x ubuntu:${UBUNTU_VERSION} AS base

RUN --mount=type=cache,target=/var/cache/apt \
    --mount=type=cache,target=/var/lib/apt/lists \
    apt update -y && \
    apt install -y --no-install-recommends \
        # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
        curl libgomp1 libopenblas-dev && \
    apt autoremove -y && \
    apt clean -y && \
    rm -rf /tmp/* /var/tmp/* && \
    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
    find /var/cache -type f -delete

# Copy llama.cpp libraries
COPY --from=collector /llama.cpp/lib /usr/lib/s390x-linux-gnu


### Full
FROM --platform=linux/s390x base AS full

ENV PATH="/root/.cargo/bin:${PATH}"
WORKDIR /app

RUN --mount=type=cache,target=/var/cache/apt \
    --mount=type=cache,target=/var/lib/apt/lists \
    apt update -y && \
    apt install -y \
        git cmake libjpeg-dev \
        python3 python3-pip python3-dev && \
    apt autoremove -y && \
    apt clean -y && \
    rm -rf /tmp/* /var/tmp/* && \
    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
    find /var/cache -type f -delete

RUN curl https://sh.rustup.rs -sSf | bash -s -- -y

COPY --from=collector /llama.cpp/bin /app
COPY --from=collector /llama.cpp/gguf-py /app/gguf-py

RUN pip install --no-cache-dir --break-system-packages \
        -r /app/gguf-py/requirements.txt

ENTRYPOINT [ "/app/tools.sh" ]


### CLI Only
FROM --platform=linux/s390x base AS light

WORKDIR /llama.cpp/bin

# Copy llama.cpp binaries and libraries
COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin

ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]


### Server
FROM --platform=linux/s390x base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

WORKDIR /llama.cpp/bin

# Copy llama.cpp binaries and libraries
COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin

EXPOSE 8080

ENTRYPOINT [ "/llama.cpp/bin/llama-server" ]
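
For orientation, a minimal local sketch of using this Dockerfile; the image tag, model path, and reliance on a buildx/QEMU s390x setup are assumptions for illustration, not part of the change:

    # build the server-only stage for s390x
    docker build --platform=linux/s390x \
        -f .devops/s390x.Dockerfile --target server \
        -t llama.cpp:server-s390x .

    # run it, mounting a local GGUF model (placeholder path)
    docker run --rm -p 8080:8080 -v "$PWD/models:/models" \
        llama.cpp:server-s390x -m /models/model.gguf

The same build command with --target light or --target full selects the CLI-only or full image defined above.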

.github/workflows/build.yml

Lines changed: 42 additions & 68 deletions

@@ -1350,8 +1350,8 @@ jobs:
         run: |
           LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

-  ggml-ci-x64-nvidia-v100-cuda:
-    runs-on: [self-hosted, Linux, X64, NVIDIA, V100]
+  ggml-ci-x64-nvidia-cuda:
+    runs-on: [self-hosted, Linux, X64, NVIDIA]

     steps:
       - name: Clone

@@ -1364,8 +1364,8 @@
           nvidia-smi
           GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

-  ggml-ci-x64-nvidia-v100-vulkan:
-    runs-on: [self-hosted, Linux, X64, NVIDIA, V100]
+  ggml-ci-x64-nvidia-vulkan-cm:
+    runs-on: [self-hosted, Linux, X64, NVIDIA]

     steps:
       - name: Clone

@@ -1375,25 +1375,11 @@
       - name: Test
         id: ggml-ci
         run: |
-          vulkaninfo
-          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
-
-  ggml-ci-x64-nvidia-t4-cuda:
-    runs-on: [self-hosted, Linux, X64, NVIDIA, T4]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          nvidia-smi
-          GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+          vulkaninfo --summary
+          GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

-  ggml-ci-x64-nvidia-t4-vulkan:
-    runs-on: [self-hosted, Linux, X64, NVIDIA, T4]
+  ggml-ci-x64-nvidia-vulkan-cm2:
+    runs-on: [self-hosted, Linux, X64, NVIDIA, COOPMAT2]

     steps:
       - name: Clone

@@ -1403,23 +1389,9 @@
       - name: Test
         id: ggml-ci
         run: |
-          vulkaninfo
+          vulkaninfo --summary
           GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

-  ggml-ci-x64-nvidia-t4-vulkan-coopmat1:
-    runs-on: [self-hosted, Linux, X64, NVIDIA, T4]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          vulkaninfo
-          GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
-
   ggml-ci-x64-cpu-amx:
     runs-on: [self-hosted, Linux, X64, CPU, AMX]


@@ -1433,21 +1405,36 @@
         run: |
           bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

-  ggml-ci-x64-amd-v710-vulkan:
-    runs-on: [self-hosted, Linux, X64, AMD, V710]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+  # ggml-ci-x64-amd-vulkan:
+  #   runs-on: [self-hosted, Linux, X64, AMD]
+  #
+  #   steps:
+  #     - name: Clone
+  #       id: checkout
+  #       uses: actions/checkout@v4
+  #
+  #     - name: Test
+  #       id: ggml-ci
+  #       run: |
+  #         vulkaninfo --summary
+  #         GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+  #
+  # ggml-ci-x64-amd-rocm:
+  #   runs-on: [self-hosted, Linux, X64, AMD]
+  #
+  #   steps:
+  #     - name: Clone
+  #       id: checkout
+  #       uses: actions/checkout@v4
+  #
+  #     - name: Test
+  #       id: ggml-ci
+  #       run: |
+  #         amd-smi static
+  #         GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

-  ggml-ci-x64-amd-v710-rocm:
-    runs-on: [self-hosted, Linux, X64, AMD, V710]
+  ggml-ci-mac-metal:
+    runs-on: [self-hosted, macOS, ARM64]

     steps:
       - name: Clone

@@ -1457,9 +1444,9 @@
       - name: Test
         id: ggml-ci
         run: |
-          GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+          GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

-  ggml-ci-mac-metal:
+  ggml-ci-mac-vulkan:
     runs-on: [self-hosted, macOS, ARM64]

     steps:

@@ -1470,18 +1457,5 @@
       - name: Test
         id: ggml-ci
         run: |
-          GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-
-  # TODO: install vulkan drivers
-  # ggml-ci-mac-vulkan:
-  #   runs-on: [self-hosted, macOS, ARM64]
-  #
-  #   steps:
-  #     - name: Clone
-  #       id: checkout
-  #       uses: actions/checkout@v4
-  #
-  #     - name: Test
-  #       id: ggml-ci
-  #       run: |
-  #         GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+          vulkaninfo --summary
+          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
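
As a reference, the renamed ggml-ci-x64-nvidia-vulkan-cm job boils down to the two commands below; running them from a llama.cpp checkout should approximate the CI run (the result and mount paths here are placeholders for the self-hosted runner's ~/results and /mnt directories):

    vulkaninfo --summary
    GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt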

.github/workflows/docker.yml

Lines changed: 1 addition & 0 deletions

@@ -44,6 +44,7 @@ jobs:
           - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
           - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
           - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
+          - { tag: "s390x", dockerfile: ".devops/s390x.Dockerfile", platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false }
           # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
           #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }
     steps:
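
Once the workflow picks up this matrix entry, the published images would presumably follow the existing variant-plus-tag naming, e.g. a server-s390x tag next to full-s390x and light-s390x; the exact tag name is an assumption here, not confirmed by this diff:

    # assumed tag; check the registry for what the workflow actually publishes
    docker pull ghcr.io/ggml-org/llama.cpp:server-s390x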

CODEOWNERS

Lines changed: 1 addition & 0 deletions

@@ -34,6 +34,7 @@
 /examples/llama.vim @ggerganov
 /examples/lookahead/ @ggerganov
 /examples/lookup/ @JohannesGaessler
+/examples/model-conversion/ @danbev
 /examples/parallel/ @ggerganov
 /examples/passkey/ @ggerganov
 /examples/retrieval/ @ggerganov

README.md

Lines changed: 1 addition & 0 deletions

@@ -274,6 +274,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [Vulkan](docs/build.md#vulkan) | GPU |
 | [CANN](docs/build.md#cann) | Ascend NPU |
 | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
+| [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE |
 | [WebGPU [In Progress]](docs/build.md#webgpu) | All |
 | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |

ci/run.sh

Lines changed: 6 additions & 0 deletions

@@ -92,6 +92,12 @@ fi

 if [ ! -z ${GG_BUILD_VULKAN} ]; then
     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
+
+    # if on Mac, disable METAL
+    if [[ "$OSTYPE" == "darwin"* ]]; then
+        CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF -DGGML_BLAS=OFF"
+    fi
+
 fi

 if [ ! -z ${GG_BUILD_WEBGPU} ]; then
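
This matches the new ggml-ci-mac-vulkan job above: when Vulkan builds run on macOS, the script now forces -DGGML_METAL=OFF -DGGML_BLAS=OFF so the Vulkan backend is the one exercised. A rough local equivalent on an Apple Silicon machine with a Vulkan driver installed (paths are placeholders):

    vulkaninfo --summary
    GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp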

convert_hf_to_gguf.py

Lines changed: 20 additions & 2 deletions

@@ -7656,6 +7656,21 @@ def __init__(self, *args, **kwargs):
             if i not in self._attn_layers
         ]

+        # There are some models in this family that are non-hybrid, but keep the
+        # same parent class by setting all layers to "attention." If this is the
+        # case, the model architecture needs to be updated to a standard
+        # "granite" or "granitemoe" model
+        if not self._ssm_layers:
+            has_experts = self.find_hparam(["num_experts_per_tok"], optional=True)
+            new_arch = (
+                gguf.MODEL_ARCH.GRANITE_MOE
+                if has_experts else
+                gguf.MODEL_ARCH.GRANITE
+            )
+            self.model_arch = new_arch
+            self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[new_arch]
+            self.gguf_writer.add_architecture()
+
         # n_group and d_inner are used during reshape_tensors for mamba2
         # NOTE: Explicitly include hparam prefix prefix for d_model to
         # disambiguate with top-level head_dim

@@ -7740,8 +7755,11 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_rope_dimension_count(rope_dim)
         self.gguf_writer.add_head_count_kv(head_count_kv_vec)

-        ## If Bamba, use rope, otherwise don't
-        use_rope = "BambaForCausalLM" in self.hparams["architectures"]
+        ## If Bamba or non-hybrid, use rope, otherwise don't
+        use_rope = (
+            "BambaForCausalLM" in self.hparams["architectures"]
+            or not self._ssm_layers
+        )
         self.gguf_writer.add_rope_scaling_finetuned(use_rope)
         if not use_rope:
             self.gguf_writer.add_context_length(2**20)
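
In practice this means a Granite-family checkpoint whose layers are all attention (no SSM layers) now converts to a plain "granite" or "granitemoe" GGUF instead of the hybrid architecture, and RoPE is enabled for it. A rough invocation sketch, with a placeholder model path and output name:

    python3 convert_hf_to_gguf.py /path/to/granite-checkpoint \
        --outfile granite-f16.gguf --outtype f16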
