
Commit 1530ffb

Merge branch 'master' into esocrok
2 parents ab1a6a7 + 0889589 commit 1530ffb

23 files changed, +591 -87 lines changed

.devops/s390x.Dockerfile

Lines changed: 122 additions & 0 deletions
@@ -0,0 +1,122 @@
+ARG GCC_VERSION=15.2.0
+ARG UBUNTU_VERSION=24.04
+
+### Build Llama.cpp stage
+FROM --platform=linux/s390x gcc:${GCC_VERSION} AS build
+
+RUN --mount=type=cache,target=/var/cache/apt \
+    --mount=type=cache,target=/var/lib/apt/lists \
+    apt update -y && \
+    apt upgrade -y && \
+    apt install -y --no-install-recommends \
+        git cmake ccache ninja-build \
+        # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
+        libopenblas-dev libcurl4-openssl-dev && \
+    rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+COPY . .
+
+RUN --mount=type=cache,target=/root/.ccache \
+    --mount=type=cache,target=/app/build \
+    cmake -S . -B build -G Ninja \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+        -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+        -DLLAMA_BUILD_TESTS=OFF \
+        -DGGML_BACKEND_DL=OFF \
+        -DGGML_NATIVE=OFF \
+        -DGGML_BLAS=ON \
+        -DGGML_BLAS_VENDOR=OpenBLAS && \
+    cmake --build build --config Release -j $(nproc) && \
+    cmake --install build --prefix /opt/llama.cpp
+
+COPY *.py /opt/llama.cpp/bin
+COPY .devops/tools.sh /opt/llama.cpp/bin
+
+COPY gguf-py /opt/llama.cpp/gguf-py
+COPY requirements.txt /opt/llama.cpp/gguf-py
+COPY requirements /opt/llama.cpp/gguf-py/requirements
+
+
+### Collect all llama.cpp binaries, libraries and distro libraries
+FROM --platform=linux/s390x scratch AS collector
+
+# Copy llama.cpp binaries and libraries
+COPY --from=build /opt/llama.cpp/bin /llama.cpp/bin
+COPY --from=build /opt/llama.cpp/lib /llama.cpp/lib
+COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py
+
+
+### Base image
+FROM --platform=linux/s390x ubuntu:${UBUNTU_VERSION} AS base
+
+RUN --mount=type=cache,target=/var/cache/apt \
+    --mount=type=cache,target=/var/lib/apt/lists \
+    apt update -y && \
+    apt install -y --no-install-recommends \
+        # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
+        curl libgomp1 libopenblas-dev && \
+    apt autoremove -y && \
+    apt clean -y && \
+    rm -rf /tmp/* /var/tmp/* && \
+    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
+    find /var/cache -type f -delete
+
+# Copy llama.cpp libraries
+COPY --from=collector /llama.cpp/lib /usr/lib/s390x-linux-gnu
+
+
+### Full
+FROM --platform=linux/s390x base AS full
+
+ENV PATH="/root/.cargo/bin:${PATH}"
+WORKDIR /app
+
+RUN --mount=type=cache,target=/var/cache/apt \
+    --mount=type=cache,target=/var/lib/apt/lists \
+    apt update -y && \
+    apt install -y \
+        git cmake libjpeg-dev \
+        python3 python3-pip python3-dev && \
+    apt autoremove -y && \
+    apt clean -y && \
+    rm -rf /tmp/* /var/tmp/* && \
+    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
+    find /var/cache -type f -delete
+
+RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
+
+COPY --from=collector /llama.cpp/bin /app
+COPY --from=collector /llama.cpp/gguf-py /app/gguf-py
+
+RUN pip install --no-cache-dir --break-system-packages \
+    -r /app/gguf-py/requirements.txt
+
+ENTRYPOINT [ "/app/tools.sh" ]
+
+
+### CLI Only
+FROM --platform=linux/s390x base AS light
+
+WORKDIR /llama.cpp/bin
+
+# Copy llama.cpp binaries and libraries
+COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin
+
+ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]
+
+
+### Server
+FROM --platform=linux/s390x base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+WORKDIR /llama.cpp/bin
+
+# Copy llama.cpp binaries and libraries
+COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin
+
+EXPOSE 8080
+
+ENTRYPOINT [ "/llama.cpp/bin/llama-server" ]

convert_hf_to_gguf.py

Lines changed: 20 additions & 2 deletions
@@ -7661,6 +7661,21 @@ def __init__(self, *args, **kwargs):
             if i not in self._attn_layers
         ]
 
+        # There are some models in this family that are non-hybrid, but keep the
+        # same parent class by setting all layers to "attention." If this is the
+        # case, the model architecture needs to be updated to a standard
+        # "granite" or "granitemoe" model
+        if not self._ssm_layers:
+            has_experts = self.find_hparam(["num_experts_per_tok"], optional=True)
+            new_arch = (
+                gguf.MODEL_ARCH.GRANITE_MOE
+                if has_experts else
+                gguf.MODEL_ARCH.GRANITE
+            )
+            self.model_arch = new_arch
+            self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[new_arch]
+            self.gguf_writer.add_architecture()
+
         # n_group and d_inner are used during reshape_tensors for mamba2
         # NOTE: Explicitly include hparam prefix prefix for d_model to
         # disambiguate with top-level head_dim
@@ -7745,8 +7760,11 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_rope_dimension_count(rope_dim)
         self.gguf_writer.add_head_count_kv(head_count_kv_vec)
 
-        ## If Bamba, use rope, otherwise don't
-        use_rope = "BambaForCausalLM" in self.hparams["architectures"]
+        ## If Bamba or non-hybrid, use rope, otherwise don't
+        use_rope = (
+            "BambaForCausalLM" in self.hparams["architectures"]
+            or not self._ssm_layers
+        )
         self.gguf_writer.add_rope_scaling_finetuned(use_rope)
         if not use_rope:
             self.gguf_writer.add_context_length(2**20)
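As a hedged illustration, a non-hybrid checkpoint in this family would be converted the same way as any other model; the architecture remap above happens during conversion. The checkpoint directory and output file name here are hypothetical:

```sh
# Convert a local HF checkpoint to GGUF (paths and names are placeholders)
python3 convert_hf_to_gguf.py /path/to/granite-checkpoint \
    --outfile granite.gguf --outtype f16
```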

docs/backend/zDNN.md

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
+# llama.cpp for IBM zDNN Accelerator
+
+## Background
+
+IBM zDNN (Z Deep Neural Network) is a hardware acceleration library designed specifically to leverage the IBM NNPA (Neural Network Processor Assist) accelerator located within IBM Telum I and II processors. It provides significant performance improvements for neural network inference operations.
+
+### Llama.cpp + IBM zDNN
+
+The llama.cpp zDNN backend is designed to enable llama.cpp on IBM z17 and later systems via the IBM zDNN hardware acceleration library.
+
+## Software & Hardware Support
+
+| Hardware Level       | Status        | Verified                   |
+| -------------------- | ------------- | -------------------------- |
+| IBM z17 / LinuxONE 5 | Supported     | RHEL 9.6, IBM z17, 40 IFLs |
+| IBM z16 / LinuxONE 4 | Not Supported |                            |
+
+## Data Types Supported
+
+| Data Type | Status    |
+| --------- | --------- |
+| F32       | Supported |
+| F16       | Supported |
+| BF16      | Supported |
+
+## CMake Options
+
+The IBM zDNN backend has the following CMake options that control the behaviour of the backend.
+
+| CMake Option | Default Value | Description                         |
+| ------------ | ------------- | ----------------------------------- |
+| `GGML_ZDNN`  | `OFF`         | Compile llama.cpp with zDNN support |
+| `ZDNN_ROOT`  | `""`          | Override zDNN library lookup        |
+
+## 1. Install zDNN Library
+
+Note: Using the zDNN library provided via `apt` or `yum` may not work correctly as reported in [#15772](https://github.com/ggml-org/llama.cpp/issues/15772). It is preferred that you compile from source.
+
+```sh
+git clone --recurse-submodules https://github.com/IBM/zDNN
+cd zDNN
+
+autoreconf .
+./configure --prefix=/opt/zdnn-libs
+
+make build
+sudo make install
+```
+
+## 2. Build llama.cpp
+
+```sh
+git clone https://github.com/ggml-org/llama.cpp
+cd llama.cpp
+
+cmake -S . -G Ninja -B build \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DGGML_ZDNN=ON \
+  -DZDNN_ROOT=/opt/zdnn-libs
+cmake --build build --config Release -j$(nproc)
+```
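A minimal usage sketch after the build above, assuming a GGUF model is already available locally; the model path is a placeholder, not something added by this commit:

```sh
# Run a quick generation on the zDNN-enabled build (model path is hypothetical)
./build/bin/llama-cli -m /path/to/model.gguf -p "Hello from s390x" -n 64
```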

ggml/src/ggml-cpu/arch/x86/repack.cpp

Lines changed: 2 additions & 2 deletions
@@ -878,7 +878,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
             const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 64));
             const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 96));
 
-            // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess
+            // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
             const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
             const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
             const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
@@ -1231,7 +1231,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
             const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
             const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));
 
-            // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess
+            // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
             const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
             const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
             const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 17 additions & 3 deletions
@@ -477,10 +477,10 @@ struct ggml_threadpool {
 struct ggml_compute_state {
 #ifndef GGML_USE_OPENMP
     ggml_thread_t thrd;
-    bool cpumask[GGML_MAX_N_THREADS];
     int last_graph;
     bool pending;
 #endif
+    bool cpumask[GGML_MAX_N_THREADS];
     struct ggml_threadpool * threadpool;
     int ith;
 };
@@ -3947,7 +3947,14 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
 
     threadpool->workers = workers;
 
-#ifndef GGML_USE_OPENMP
+#ifdef GGML_USE_OPENMP
+    int32_t cpumask_iter = 0;
+
+    // Compute CPU masks for each thread
+    for (int j = 0; j < tpp->n_threads; j++) {
+        ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
+    }
+#else // GGML_USE_OPENMP
     ggml_mutex_init(&threadpool->mutex);
     ggml_cond_init(&threadpool->cond);
 
@@ -4020,7 +4027,14 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
                 atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
             }
 
-            ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]);
+            // Apply thread CPU mask and priority
+            int ith = omp_get_thread_num();
+
+            ggml_thread_apply_priority(threadpool->prio);
+            if (ggml_thread_cpumask_is_valid(threadpool->workers[ith].cpumask)) {
+                ggml_thread_apply_affinity(threadpool->workers[ith].cpumask);
+            }
+            ggml_graph_compute_thread(&threadpool->workers[ith]);
         }
     } else {
         atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
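With this change, per-thread CPU masks and priority are also honoured in OpenMP builds. As a hedged sketch of how the affected path is exercised from the command line (assuming the existing `--cpu-mask`, `--cpu-strict`, and `--prio` options in the common CLI; the model path is hypothetical):

```sh
# Pin worker threads to the first 8 logical CPUs, enforce strict placement,
# and raise thread priority; the model path is a placeholder
./build/bin/llama-cli -m /path/to/model.gguf -p "test" -n 32 \
    --cpu-mask 0xFF --cpu-strict 1 --prio 2
```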

ggml/src/ggml-cpu/ops.cpp

Lines changed: 10 additions & 2 deletions
@@ -4739,6 +4739,7 @@ void ggml_compute_forward_get_rows(
     //}
 }
 
+template<typename idx_t>
 static void ggml_compute_forward_set_rows_f32(
         const ggml_compute_params * params,
         ggml_tensor * dst) {
@@ -4777,7 +4778,7 @@ static void ggml_compute_forward_set_rows_f32(
             const int64_t i11 = i02%ne11;
             const int64_t i10 = i;
 
-            const int64_t i1 = *(int64_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+            const int64_t i1 = *(idx_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
 
             GGML_ASSERT(i1 >= 0 && i1 < ne1);
 
@@ -4794,11 +4795,18 @@ void ggml_compute_forward_set_rows(
         ggml_tensor * dst) {
 
     const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
 
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_set_rows_f32(params, dst);
+                if (src1->type == GGML_TYPE_I64) {
+                    ggml_compute_forward_set_rows_f32<int64_t>(params, dst);
+                } else if (src1->type == GGML_TYPE_I32) {
+                    ggml_compute_forward_set_rows_f32<int32_t>(params, dst);
+                } else {
+                    GGML_ABORT("src1->type = %d (%s) not supported", src1->type, ggml_type_name(src1->type));
+                }
             } break;
         default:
             {

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 1 addition & 1 deletion
@@ -3440,7 +3440,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                     op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q5_0 ||
                     op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL) &&
                 op->src[0]->type == GGML_TYPE_F32 &&
-                op->src[1]->type == GGML_TYPE_I64;
+                (op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32);
             } break;
         case GGML_OP_CPY:
             {
