
Commit 464ee11

Merge branch 'master' into esocrok

2 parents: 7e6e833 + 4ae88d0


44 files changed: +1682 -230 lines

.devops/s390x.Dockerfile

Lines changed: 122 additions & 0 deletions
New file:

ARG GCC_VERSION=15.2.0
ARG UBUNTU_VERSION=24.04

### Build Llama.cpp stage
FROM --platform=linux/s390x gcc:${GCC_VERSION} AS build

RUN --mount=type=cache,target=/var/cache/apt \
    --mount=type=cache,target=/var/lib/apt/lists \
    apt update -y && \
    apt upgrade -y && \
    apt install -y --no-install-recommends \
        git cmake ccache ninja-build \
        # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
        libopenblas-dev libcurl4-openssl-dev && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /app
COPY . .

RUN --mount=type=cache,target=/root/.ccache \
    --mount=type=cache,target=/app/build \
    cmake -S . -B build -G Ninja \
        -DCMAKE_BUILD_TYPE=Release \
        -DCMAKE_C_COMPILER_LAUNCHER=ccache \
        -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
        -DLLAMA_BUILD_TESTS=OFF \
        -DGGML_BACKEND_DL=OFF \
        -DGGML_NATIVE=OFF \
        -DGGML_BLAS=ON \
        -DGGML_BLAS_VENDOR=OpenBLAS && \
    cmake --build build --config Release -j $(nproc) && \
    cmake --install build --prefix /opt/llama.cpp

COPY *.py /opt/llama.cpp/bin
COPY .devops/tools.sh /opt/llama.cpp/bin

COPY gguf-py /opt/llama.cpp/gguf-py
COPY requirements.txt /opt/llama.cpp/gguf-py
COPY requirements /opt/llama.cpp/gguf-py/requirements


### Collect all llama.cpp binaries, libraries and distro libraries
FROM --platform=linux/s390x scratch AS collector

# Copy llama.cpp binaries and libraries
COPY --from=build /opt/llama.cpp/bin /llama.cpp/bin
COPY --from=build /opt/llama.cpp/lib /llama.cpp/lib
COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py


### Base image
FROM --platform=linux/s390x ubuntu:${UBUNTU_VERSION} AS base

RUN --mount=type=cache,target=/var/cache/apt \
    --mount=type=cache,target=/var/lib/apt/lists \
    apt update -y && \
    apt install -y --no-install-recommends \
        # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
        curl libgomp1 libopenblas-dev && \
    apt autoremove -y && \
    apt clean -y && \
    rm -rf /tmp/* /var/tmp/* && \
    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
    find /var/cache -type f -delete

# Copy llama.cpp libraries
COPY --from=collector /llama.cpp/lib /usr/lib/s390x-linux-gnu


### Full
FROM --platform=linux/s390x base AS full

ENV PATH="/root/.cargo/bin:${PATH}"
WORKDIR /app

RUN --mount=type=cache,target=/var/cache/apt \
    --mount=type=cache,target=/var/lib/apt/lists \
    apt update -y && \
    apt install -y \
        git cmake libjpeg-dev \
        python3 python3-pip python3-dev && \
    apt autoremove -y && \
    apt clean -y && \
    rm -rf /tmp/* /var/tmp/* && \
    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
    find /var/cache -type f -delete

RUN curl https://sh.rustup.rs -sSf | bash -s -- -y

COPY --from=collector /llama.cpp/bin /app
COPY --from=collector /llama.cpp/gguf-py /app/gguf-py

RUN pip install --no-cache-dir --break-system-packages \
    -r /app/gguf-py/requirements.txt

ENTRYPOINT [ "/app/tools.sh" ]


### CLI Only
FROM --platform=linux/s390x base AS light

WORKDIR /llama.cpp/bin

# Copy llama.cpp binaries and libraries
COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin

ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]


### Server
FROM --platform=linux/s390x base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

WORKDIR /llama.cpp/bin

# Copy llama.cpp binaries and libraries
COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin

EXPOSE 8080

ENTRYPOINT [ "/llama.cpp/bin/llama-server" ]
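
For orientation only (not part of the commit): one plausible way to build and run the new s390x image stages, assuming Docker with buildx/QEMU s390x emulation or a native s390x host. The image tags and model path below are placeholders.

```sh
# Build the CLI-only and server stages from the repository root
# (tags are placeholders chosen for this sketch).
docker build --platform linux/s390x -f .devops/s390x.Dockerfile --target light  -t llamacpp:s390x-cli .
docker build --platform linux/s390x -f .devops/s390x.Dockerfile --target server -t llamacpp:s390x-server .

# Run the server stage; it listens on 0.0.0.0:8080 inside the container.
docker run --platform linux/s390x -p 8080:8080 -v "$PWD/models:/models" \
    llamacpp:s390x-server -m /models/model.gguf
```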

common/common.cpp

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@
 #include <climits>
 #include <cmath>
 #include <codecvt>
+#include <chrono>
 #include <cstdarg>
 #include <cstring>
 #include <ctime>

common/sampling.cpp

Lines changed: 1 addition & 0 deletions
@@ -332,6 +332,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
     }
     if (ctx) {
         llama_perf_context_print(ctx);
+        llama_memory_breakdown_print(ctx);
     }
 }

convert_hf_to_gguf.py

Lines changed: 20 additions & 2 deletions
@@ -7661,6 +7661,21 @@ def __init__(self, *args, **kwargs):
             if i not in self._attn_layers
         ]

+        # There are some models in this family that are non-hybrid, but keep the
+        # same parent class by setting all layers to "attention." If this is the
+        # case, the model architecture needs to be updated to a standard
+        # "granite" or "granitemoe" model
+        if not self._ssm_layers:
+            has_experts = self.find_hparam(["num_experts_per_tok"], optional=True)
+            new_arch = (
+                gguf.MODEL_ARCH.GRANITE_MOE
+                if has_experts else
+                gguf.MODEL_ARCH.GRANITE
+            )
+            self.model_arch = new_arch
+            self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[new_arch]
+            self.gguf_writer.add_architecture()
+
         # n_group and d_inner are used during reshape_tensors for mamba2
         # NOTE: Explicitly include hparam prefix prefix for d_model to
         #       disambiguate with top-level head_dim

@@ -7745,8 +7760,11 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_rope_dimension_count(rope_dim)
         self.gguf_writer.add_head_count_kv(head_count_kv_vec)

-        ## If Bamba, use rope, otherwise don't
-        use_rope = "BambaForCausalLM" in self.hparams["architectures"]
+        ## If Bamba or non-hybrid, use rope, otherwise don't
+        use_rope = (
+            "BambaForCausalLM" in self.hparams["architectures"]
+            or not self._ssm_layers
+        )
         self.gguf_writer.add_rope_scaling_finetuned(use_rope)
         if not use_rope:
             self.gguf_writer.add_context_length(2**20)
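
As a hedged usage sketch (paths and file names below are placeholders, not taken from the diff): with this change, converting a non-hybrid checkpoint from this Granite family should produce a standard granite or granitemoe GGUF via the usual converter invocation.

```sh
# Placeholder model path and output name; converter dependencies live in the
# repository's requirements files.
pip install -r requirements.txt
python convert_hf_to_gguf.py /path/to/granite-checkpoint \
    --outfile granite-f16.gguf --outtype f16
```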

docs/backend/zDNN.md

Lines changed: 61 additions & 0 deletions
New file:

# llama.cpp for IBM zDNN Accelerator

## Background

IBM zDNN (Z Deep Neural Network) is a hardware acceleration library designed specifically to leverage the IBM NNPA (Neural Network Processor Assist) accelerator located within IBM Telum I and II processors. It provides significant performance improvements for neural network inference operations.

### Llama.cpp + IBM zDNN

The llama.cpp zDNN backend is designed to enable llama.cpp on IBM z17 and later systems via the IBM zDNN hardware acceleration library.

## Software & Hardware Support

| Hardware Level       | Status        | Verified                   |
| -------------------- | ------------- | -------------------------- |
| IBM z17 / LinuxONE 5 | Supported     | RHEL 9.6, IBM z17, 40 IFLs |
| IBM z16 / LinuxONE 4 | Not Supported |                            |

## Data Types Supported

| Data Type | Status    |
| --------- | --------- |
| F32       | Supported |
| F16       | Supported |
| BF16      | Supported |

## CMake Options

The IBM zDNN backend has the following CMake options that control the behaviour of the backend.

| CMake Option | Default Value | Description                         |
| ------------ | ------------- | ----------------------------------- |
| `GGML_ZDNN`  | `OFF`         | Compile llama.cpp with zDNN support |
| `ZDNN_ROOT`  | `""`          | Override zDNN library lookup        |

## 1. Install zDNN Library

Note: Using the zDNN library provided via `apt` or `yum` may not work correctly as reported in [#15772](https://github.com/ggml-org/llama.cpp/issues/15772). It is preferred that you compile from source.

```sh
git clone --recurse-submodules https://github.com/IBM/zDNN
cd zDNN

autoreconf .
./configure --prefix=/opt/zdnn-libs

make build
sudo make install
```

## 2. Build llama.cpp

```sh
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp

cmake -S . -G Ninja -B build \
    -DCMAKE_BUILD_TYPE=Release \
    -DGGML_ZDNN=ON \
    -DZDNN_ROOT=/opt/zdnn-libs
cmake --build build --config Release -j$(nproc)
```
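
As a quick smoke test of the resulting build (a sketch outside the committed document; the GGUF path is a placeholder):

```sh
# Run the CLI once, then start the HTTP server on the default port.
./build/bin/llama-cli -m /path/to/model.gguf -p "Hello" -n 32
./build/bin/llama-server -m /path/to/model.gguf --host 0.0.0.0 --port 8080
```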

ggml/include/ggml-backend.h

Lines changed: 2 additions & 1 deletion
@@ -314,7 +314,8 @@ extern "C" {
     GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
     GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);

-    GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+    GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend);
+    GGML_API size_t                     ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);

     GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
     GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
