1 change: 1 addition & 0 deletions .clang-tidy
@@ -17,6 +17,7 @@ Checks: >
clang-analyzer-*,
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
performance-*,
-performance-enum-size,
portability-*,
-portability-simd-intrinsics,
misc-*,
122 changes: 122 additions & 0 deletions .devops/s390x.Dockerfile
@@ -0,0 +1,122 @@
ARG GCC_VERSION=15.2.0
ARG UBUNTU_VERSION=24.04

### Build Llama.cpp stage
FROM --platform=linux/s390x gcc:${GCC_VERSION} AS build

RUN --mount=type=cache,target=/var/cache/apt \
--mount=type=cache,target=/var/lib/apt/lists \
apt update -y && \
apt upgrade -y && \
apt install -y --no-install-recommends \
git cmake ccache ninja-build \
# WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
libopenblas-dev libcurl4-openssl-dev && \
rm -rf /var/lib/apt/lists/*

WORKDIR /app
COPY . .

RUN --mount=type=cache,target=/root/.ccache \
--mount=type=cache,target=/app/build \
cmake -S . -B build -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DLLAMA_BUILD_TESTS=OFF \
-DGGML_BACKEND_DL=OFF \
-DGGML_NATIVE=OFF \
-DGGML_BLAS=ON \
-DGGML_BLAS_VENDOR=OpenBLAS && \
cmake --build build --config Release -j $(nproc) && \
cmake --install build --prefix /opt/llama.cpp

COPY *.py /opt/llama.cpp/bin
COPY .devops/tools.sh /opt/llama.cpp/bin

COPY gguf-py /opt/llama.cpp/gguf-py
COPY requirements.txt /opt/llama.cpp/gguf-py
COPY requirements /opt/llama.cpp/gguf-py/requirements


### Collect all llama.cpp binaries, libraries and distro libraries
FROM --platform=linux/s390x scratch AS collector

# Copy llama.cpp binaries and libraries
COPY --from=build /opt/llama.cpp/bin /llama.cpp/bin
COPY --from=build /opt/llama.cpp/lib /llama.cpp/lib
COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py


### Base image
FROM --platform=linux/s390x ubuntu:${UBUNTU_VERSION} AS base

RUN --mount=type=cache,target=/var/cache/apt \
--mount=type=cache,target=/var/lib/apt/lists \
apt update -y && \
apt install -y --no-install-recommends \
# WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
curl libgomp1 libopenblas-dev && \
apt autoremove -y && \
apt clean -y && \
rm -rf /tmp/* /var/tmp/* && \
find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
find /var/cache -type f -delete

# Copy llama.cpp libraries
COPY --from=collector /llama.cpp/lib /usr/lib/s390x-linux-gnu


### Full
FROM --platform=linux/s390x base AS full

ENV PATH="/root/.cargo/bin:${PATH}"
WORKDIR /app

RUN --mount=type=cache,target=/var/cache/apt \
--mount=type=cache,target=/var/lib/apt/lists \
apt update -y && \
apt install -y \
git cmake libjpeg-dev \
python3 python3-pip python3-dev && \
apt autoremove -y && \
apt clean -y && \
rm -rf /tmp/* /var/tmp/* && \
find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
find /var/cache -type f -delete

RUN curl https://sh.rustup.rs -sSf | bash -s -- -y

COPY --from=collector /llama.cpp/bin /app
COPY --from=collector /llama.cpp/gguf-py /app/gguf-py

RUN pip install --no-cache-dir --break-system-packages \
-r /app/gguf-py/requirements.txt

ENTRYPOINT [ "/app/tools.sh" ]


### CLI Only
FROM --platform=linux/s390x base AS light

WORKDIR /llama.cpp/bin

# Copy llama.cpp binaries and libraries
COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin

ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]


### Server
FROM --platform=linux/s390x base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

WORKDIR /llama.cpp/bin

# Copy llama.cpp binaries and libraries
COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin

EXPOSE 8080

ENTRYPOINT [ "/llama.cpp/bin/llama-server" ]
1 change: 1 addition & 0 deletions .github/workflows/docker.yml
@@ -44,6 +44,7 @@ jobs:
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
- { tag: "s390x", dockerfile: ".devops/s390x.Dockerfile", platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false }
# Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
#- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }
steps:
1 change: 1 addition & 0 deletions CODEOWNERS
@@ -34,6 +34,7 @@
/examples/llama.vim @ggerganov
/examples/lookahead/ @ggerganov
/examples/lookup/ @JohannesGaessler
/examples/model-conversion/ @danbev
/examples/parallel/ @ggerganov
/examples/passkey/ @ggerganov
/examples/retrieval/ @ggerganov
1 change: 1 addition & 0 deletions README.md
@@ -274,6 +274,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
| [Vulkan](docs/build.md#vulkan) | GPU |
| [CANN](docs/build.md#cann) | Ascend NPU |
| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
| [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE |
| [WebGPU [In Progress]](docs/build.md#webgpu) | All |
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |

22 changes: 20 additions & 2 deletions convert_hf_to_gguf.py
@@ -7656,6 +7656,21 @@ def __init__(self, *args, **kwargs):
if i not in self._attn_layers
]

# There are some models in this family that are non-hybrid, but keep the
# same parent class by setting all layers to "attention." If this is the
# case, the model architecture needs to be updated to a standard
# "granite" or "granitemoe" model
if not self._ssm_layers:
has_experts = self.find_hparam(["num_experts_per_tok"], optional=True)
new_arch = (
gguf.MODEL_ARCH.GRANITE_MOE
if has_experts else
gguf.MODEL_ARCH.GRANITE
)
self.model_arch = new_arch
self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[new_arch]
self.gguf_writer.add_architecture()

# n_group and d_inner are used during reshape_tensors for mamba2
# NOTE: Explicitly include hparam prefix prefix for d_model to
# disambiguate with top-level head_dim
@@ -7740,8 +7755,11 @@ def set_gguf_parameters(self):
self.gguf_writer.add_rope_dimension_count(rope_dim)
self.gguf_writer.add_head_count_kv(head_count_kv_vec)

## If Bamba, use rope, otherwise don't
use_rope = "BambaForCausalLM" in self.hparams["architectures"]
## If Bamba or non-hybrid, use rope, otherwise don't
use_rope = (
"BambaForCausalLM" in self.hparams["architectures"]
or not self._ssm_layers
)
self.gguf_writer.add_rope_scaling_finetuned(use_rope)
if not use_rope:
self.gguf_writer.add_context_length(2**20)
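For context on the converter change above, a hedged sketch of invoking the script on a local checkpoint (the input path, output file, and output type are placeholders); with this change, a non-hybrid checkpoint in this family should be written out under the standard granite / granitemoe architecture instead of the hybrid one.

```sh
# Convert a local Hugging Face checkpoint to GGUF (paths are placeholders)
python3 convert_hf_to_gguf.py /path/to/granite-checkpoint \
    --outfile granite.gguf \
    --outtype f16
```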
61 changes: 61 additions & 0 deletions docs/backend/zDNN.md
@@ -0,0 +1,61 @@
# llama.cpp for IBM zDNN Accelerator

## Background

IBM zDNN (Z Deep Neural Network) is a hardware acceleration library designed specifically to leverage the IBM NNPA (Neural Network Processing Assist) accelerator located within IBM Telum I and II processors. It provides significant performance improvements for neural network inference operations.

### Llama.cpp + IBM zDNN

The llama.cpp zDNN backend is designed to enable llama.cpp on IBM z17 and later systems via the IBM zDNN hardware acceleration library.

## Software & Hardware Support

| Hardware Level | Status | Verified |
| -------------------- | ------------- | -------------------------- |
| IBM z17 / LinuxONE 5 | Supported | RHEL 9.6, IBM z17, 40 IFLs |
| IBM z16 / LinuxONE 4 | Not Supported | |

## Data Types Supported

| Data Type | Status |
| --------- | --------- |
| F32 | Supported |
| F16 | Supported |
| BF16 | Supported |

## CMake Options

The IBM zDNN backend exposes the following CMake options to control its behaviour.

| CMake Option | Default Value | Description |
| ------------ | ------------- | ----------------------------------- |
| `GGML_ZDNN` | `OFF` | Compile llama.cpp with zDNN support |
| `ZDNN_ROOT` | `""` | Override zDNN library lookup |

## 1. Install zDNN Library

Note: Using the zDNN library provided via `apt` or `yum` may not work correctly, as reported in [#15772](https://github.com/ggml-org/llama.cpp/issues/15772). Compiling from source is preferred.

```sh
git clone --recurse-submodules https://github.com/IBM/zDNN
cd zDNN

autoreconf .
./configure --prefix=/opt/zdnn-libs

make build
sudo make install
```

## 2. Build llama.cpp

```sh
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp

cmake -S . -G Ninja -B build \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_ZDNN=ON \
-DZDNN_ROOT=/opt/zdnn-libs
cmake --build build --config Release -j$(nproc)
```
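As a follow-up to the build steps, a minimal smoke test of the resulting binary; the model path is only a placeholder, and per the data-type table above the weights should be F32, F16, or BF16 for the zDNN path to be exercised.

```sh
# Quick inference check with the zDNN-enabled build (model path is a placeholder)
./build/bin/llama-cli -m /path/to/model-f16.gguf -p "Hello from IBM Z" -n 32
```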
3 changes: 3 additions & 0 deletions ggml/include/ggml-zdnn.h
@@ -7,6 +7,9 @@
extern "C" {
#endif

// device buffer
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_zdnn_buffer_type(void);

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zdnn_reg(void);

#ifdef __cplusplus
4 changes: 2 additions & 2 deletions ggml/src/ggml-cpu/arch/x86/repack.cpp
@@ -878,7 +878,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 64));
const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 96));

// Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess
// Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
@@ -1231,7 +1231,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));

// Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess
// Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
20 changes: 17 additions & 3 deletions ggml/src/ggml-cpu/ggml-cpu.c
@@ -473,10 +473,10 @@ struct ggml_threadpool {
struct ggml_compute_state {
#ifndef GGML_USE_OPENMP
ggml_thread_t thrd;
bool cpumask[GGML_MAX_N_THREADS];
int last_graph;
bool pending;
#endif
bool cpumask[GGML_MAX_N_THREADS];
struct ggml_threadpool * threadpool;
int ith;
};
@@ -3081,7 +3081,14 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(

threadpool->workers = workers;

#ifndef GGML_USE_OPENMP
#ifdef GGML_USE_OPENMP
int32_t cpumask_iter = 0;

// Compute CPU masks for each thread
for (int j = 0; j < tpp->n_threads; j++) {
ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
}
#else // GGML_USE_OPENMP
ggml_mutex_init(&threadpool->mutex);
ggml_cond_init(&threadpool->cond);

@@ -3154,7 +3161,14 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
}

ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]);
// Apply thread CPU mask and priority
int ith = omp_get_thread_num();

ggml_thread_apply_priority(threadpool->prio);
if (ggml_thread_cpumask_is_valid(threadpool->workers[ith].cpumask)) {
ggml_thread_apply_affinity(threadpool->workers[ith].cpumask);
}
ggml_graph_compute_thread(&threadpool->workers[ith]);
}
} else {
atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
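The hunk above makes the OpenMP path honour per-thread CPU masks and priority as well; a hedged example of exercising it from the CLI, assuming the threadpool flags (`--cpu-mask`, `--prio`) available in recent llama.cpp builds. Exact flag names and accepted values may differ across versions, so check `--help` on your build.

```sh
# Pin compute threads to the first 8 cores and raise scheduling priority
# (flag names follow llama.cpp's threadpool options; model path is a placeholder)
./build/bin/llama-cli -m /path/to/model.gguf -t 8 \
    --cpu-mask 0xFF --prio 2
```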
1 change: 1 addition & 0 deletions ggml/src/ggml-quants.c
@@ -3721,6 +3721,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * GGML_RESTRICT
}
float best = 0;
float scale = max/(2*kMaxQ-1);
for (int k = 0; k < 8; ++k) is_on_grid[k] = true;
for (int is = -15; is <= 15; ++is) {
float id = (2*kMaxQ-1+is*0.2f)/max;
float this_scale = 1/id;
1 change: 1 addition & 0 deletions ggml/src/ggml-zdnn/.gitignore
@@ -0,0 +1 @@
zdnn.h
59 changes: 59 additions & 0 deletions ggml/src/ggml-zdnn/common.hpp
@@ -0,0 +1,59 @@
#ifndef GGML_ZDNN_COMMON_HPP
#define GGML_ZDNN_COMMON_HPP

#include "ggml.h"
#include "ggml-impl.h"

#include "zdnn.h"

#include <vector>
#include <memory>

#define GGML_ZDNN_NAME "zDNN"
#define GGML_ZDNN_VERSION ZDNN_VERNUM

#define ZDNN_CHECK(stmt) \
do { \
zdnn_status status = (stmt); \
GGML_ASSERT(status == ZDNN_OK); \
} while (0);

struct ggml_backend_zdnn_device_context {
int zdnn_device;
int zdnn_device_ref_count;

bool has_parmblkformat_0;
bool has_parmblkformat_1; // checks for z17

size_t max_size;

char name[128];
};

struct ggml_backend_zdnn_context {
int device;
ggml_cgraph * gf;
};

struct ggml_backend_zdnn_buffer {
void * data;
ggml_backend_zdnn_buffer * extra; // for bias, etc.
size_t size;

zdnn_tensor_desc pre_tfm_desc;
zdnn_tensor_desc tfm_desc;
zdnn_ztensor ztensor;

char name[GGML_MAX_NAME];
};

struct ggml_backend_zdnn_buffer_context {
void * all_data;
size_t all_size;
bool owned;

int n_buffers;
std::vector<std::unique_ptr<ggml_backend_zdnn_buffer>> buffers;
};

#endif // GGML_ZDNN_COMMON_HPP