
Commit 1530ffb

Merge branch 'master' into esocrok
2 parents ab1a6a7 + 0889589 commit 1530ffb

23 files changed, +591 -87 lines changed

.devops/s390x.Dockerfile

Lines changed: 122 additions & 0 deletions
@@ -0,0 +1,122 @@
+ARG GCC_VERSION=15.2.0
+ARG UBUNTU_VERSION=24.04
+
+### Build Llama.cpp stage
+FROM --platform=linux/s390x gcc:${GCC_VERSION} AS build
+
+RUN --mount=type=cache,target=/var/cache/apt \
+    --mount=type=cache,target=/var/lib/apt/lists \
+    apt update -y && \
+    apt upgrade -y && \
+    apt install -y --no-install-recommends \
+        git cmake ccache ninja-build \
+        # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
+        libopenblas-dev libcurl4-openssl-dev && \
+    rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+COPY . .
+
+RUN --mount=type=cache,target=/root/.ccache \
+    --mount=type=cache,target=/app/build \
+    cmake -S . -B build -G Ninja \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+        -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+        -DLLAMA_BUILD_TESTS=OFF \
+        -DGGML_BACKEND_DL=OFF \
+        -DGGML_NATIVE=OFF \
+        -DGGML_BLAS=ON \
+        -DGGML_BLAS_VENDOR=OpenBLAS && \
+    cmake --build build --config Release -j $(nproc) && \
+    cmake --install build --prefix /opt/llama.cpp
+
+COPY *.py /opt/llama.cpp/bin
+COPY .devops/tools.sh /opt/llama.cpp/bin
+
+COPY gguf-py /opt/llama.cpp/gguf-py
+COPY requirements.txt /opt/llama.cpp/gguf-py
+COPY requirements /opt/llama.cpp/gguf-py/requirements
+
+
+### Collect all llama.cpp binaries, libraries and distro libraries
+FROM --platform=linux/s390x scratch AS collector
+
+# Copy llama.cpp binaries and libraries
+COPY --from=build /opt/llama.cpp/bin /llama.cpp/bin
+COPY --from=build /opt/llama.cpp/lib /llama.cpp/lib
+COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py
+
+
+### Base image
+FROM --platform=linux/s390x ubuntu:${UBUNTU_VERSION} AS base
+
+RUN --mount=type=cache,target=/var/cache/apt \
+    --mount=type=cache,target=/var/lib/apt/lists \
+    apt update -y && \
+    apt install -y --no-install-recommends \
+        # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
+        curl libgomp1 libopenblas-dev && \
+    apt autoremove -y && \
+    apt clean -y && \
+    rm -rf /tmp/* /var/tmp/* && \
+    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
+    find /var/cache -type f -delete
+
+# Copy llama.cpp libraries
+COPY --from=collector /llama.cpp/lib /usr/lib/s390x-linux-gnu
+
+
+### Full
+FROM --platform=linux/s390x base AS full
+
+ENV PATH="/root/.cargo/bin:${PATH}"
+WORKDIR /app
+
+RUN --mount=type=cache,target=/var/cache/apt \
+    --mount=type=cache,target=/var/lib/apt/lists \
+    apt update -y && \
+    apt install -y \
+        git cmake libjpeg-dev \
+        python3 python3-pip python3-dev && \
+    apt autoremove -y && \
+    apt clean -y && \
+    rm -rf /tmp/* /var/tmp/* && \
+    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
+    find /var/cache -type f -delete
+
+RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
+
+COPY --from=collector /llama.cpp/bin /app
+COPY --from=collector /llama.cpp/gguf-py /app/gguf-py
+
+RUN pip install --no-cache-dir --break-system-packages \
+    -r /app/gguf-py/requirements.txt
+
+ENTRYPOINT [ "/app/tools.sh" ]
+
+
+### CLI Only
+FROM --platform=linux/s390x base AS light
+
+WORKDIR /llama.cpp/bin
+
+# Copy llama.cpp binaries and libraries
+COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin
+
+ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]
+
+
+### Server
+FROM --platform=linux/s390x base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+WORKDIR /llama.cpp/bin
+
+# Copy llama.cpp binaries and libraries
+COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin
+
+EXPOSE 8080
+
+ENTRYPOINT [ "/llama.cpp/bin/llama-server" ]

convert_hf_to_gguf.py

Lines changed: 20 additions & 2 deletions
@@ -7661,6 +7661,21 @@ def __init__(self, *args, **kwargs):
             if i not in self._attn_layers
         ]
 
+        # There are some models in this family that are non-hybrid, but keep the
+        # same parent class by setting all layers to "attention." If this is the
+        # case, the model architecture needs to be updated to a standard
+        # "granite" or "granitemoe" model
+        if not self._ssm_layers:
+            has_experts = self.find_hparam(["num_experts_per_tok"], optional=True)
+            new_arch = (
+                gguf.MODEL_ARCH.GRANITE_MOE
+                if has_experts else
+                gguf.MODEL_ARCH.GRANITE
+            )
+            self.model_arch = new_arch
+            self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[new_arch]
+            self.gguf_writer.add_architecture()
+
         # n_group and d_inner are used during reshape_tensors for mamba2
         # NOTE: Explicitly include hparam prefix prefix for d_model to
         # disambiguate with top-level head_dim
@@ -7745,8 +7760,11 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_rope_dimension_count(rope_dim)
         self.gguf_writer.add_head_count_kv(head_count_kv_vec)
 
-        ## If Bamba, use rope, otherwise don't
-        use_rope = "BambaForCausalLM" in self.hparams["architectures"]
+        ## If Bamba or non-hybrid, use rope, otherwise don't
+        use_rope = (
+            "BambaForCausalLM" in self.hparams["architectures"]
+            or not self._ssm_layers
+        )
         self.gguf_writer.add_rope_scaling_finetuned(use_rope)
         if not use_rope:
             self.gguf_writer.add_context_length(2**20)
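As a hedged illustration, a non-hybrid checkpoint in this family would be converted the same way as any other model; the architecture remap above happens during conversion. The checkpoint directory and output file name here are hypothetical:

```sh
# Convert a local HF checkpoint to GGUF (paths and names are placeholders)
python3 convert_hf_to_gguf.py /path/to/granite-checkpoint \
    --outfile granite.gguf --outtype f16
```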

docs/backend/zDNN.md

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
+# llama.cpp for IBM zDNN Accelerator
+
+## Background
+
+IBM zDNN (Z Deep Neural Network) is a hardware acceleration library designed specifically to leverage the IBM NNPA (Neural Network Processor Assist) accelerator located within IBM Telum I and II processors. It provides significant performance improvements for neural network inference operations.
+
+### Llama.cpp + IBM zDNN
+
+The llama.cpp zDNN backend is designed to enable llama.cpp on IBM z17 and later systems via the IBM zDNN hardware acceleration library.
+
+## Software & Hardware Support
+
+| Hardware Level       | Status        | Verified                   |
+| -------------------- | ------------- | -------------------------- |
+| IBM z17 / LinuxONE 5 | Supported     | RHEL 9.6, IBM z17, 40 IFLs |
+| IBM z16 / LinuxONE 4 | Not Supported |                            |
+
+## Data Types Supported
+
+| Data Type | Status    |
+| --------- | --------- |
+| F32       | Supported |
+| F16       | Supported |
+| BF16      | Supported |
+
+## CMake Options
+
+The IBM zDNN backend has the following CMake options that control the behaviour of the backend.
+
+| CMake Option | Default Value | Description                         |
+| ------------ | ------------- | ----------------------------------- |
+| `GGML_ZDNN`  | `OFF`         | Compile llama.cpp with zDNN support |
+| `ZDNN_ROOT`  | `""`          | Override zDNN library lookup        |
+
+## 1. Install zDNN Library
+
+Note: Using the zDNN library provided via `apt` or `yum` may not work correctly as reported in [#15772](https://github.com/ggml-org/llama.cpp/issues/15772). It is preferred that you compile from source.
+
+```sh
+git clone --recurse-submodules https://github.com/IBM/zDNN
+cd zDNN
+
+autoreconf .
+./configure --prefix=/opt/zdnn-libs
+
+make build
+sudo make install
+```
+
+## 2. Build llama.cpp
+
+```sh
+git clone https://github.com/ggml-org/llama.cpp
+cd llama.cpp
+
+cmake -S . -G Ninja -B build \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DGGML_ZDNN=ON \
+  -DZDNN_ROOT=/opt/zdnn-libs
+cmake --build build --config Release -j$(nproc)
+```
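A minimal usage sketch after the build above, assuming a GGUF model is already available locally; the model path is a placeholder, not something added by this commit:

```sh
# Run a quick generation on the zDNN-enabled build (model path is hypothetical)
./build/bin/llama-cli -m /path/to/model.gguf -p "Hello from s390x" -n 64
```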

ggml/src/ggml-cpu/arch/x86/repack.cpp

Lines changed: 2 additions & 2 deletions
@@ -878,7 +878,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
             const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 64));
             const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 96));
 
-            // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess
+            // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
             const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
             const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
             const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
@@ -1231,7 +1231,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
             const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
             const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));
 
-            // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess
+            // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
             const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
             const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
             const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 17 additions & 3 deletions
@@ -477,10 +477,10 @@ struct ggml_threadpool {
 struct ggml_compute_state {
 #ifndef GGML_USE_OPENMP
     ggml_thread_t thrd;
-    bool cpumask[GGML_MAX_N_THREADS];
     int last_graph;
     bool pending;
 #endif
+    bool cpumask[GGML_MAX_N_THREADS];
     struct ggml_threadpool * threadpool;
     int ith;
 };
@@ -3947,7 +3947,14 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
 
     threadpool->workers = workers;
 
-#ifndef GGML_USE_OPENMP
+#ifdef GGML_USE_OPENMP
+    int32_t cpumask_iter = 0;
+
+    // Compute CPU masks for each thread
+    for (int j = 0; j < tpp->n_threads; j++) {
+        ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
+    }
+#else // GGML_USE_OPENMP
     ggml_mutex_init(&threadpool->mutex);
     ggml_cond_init(&threadpool->cond);
 
@@ -4020,7 +4027,14 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
                 atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
             }
 
-            ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]);
+            // Apply thread CPU mask and priority
+            int ith = omp_get_thread_num();
+
+            ggml_thread_apply_priority(threadpool->prio);
+            if (ggml_thread_cpumask_is_valid(threadpool->workers[ith].cpumask)) {
+                ggml_thread_apply_affinity(threadpool->workers[ith].cpumask);
+            }
+            ggml_graph_compute_thread(&threadpool->workers[ith]);
         }
     } else {
         atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
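With this change, per-thread CPU masks and priority are also honoured in OpenMP builds. As a hedged sketch of how the affected path is exercised from the command line (assuming the existing `--cpu-mask`, `--cpu-strict`, and `--prio` options in the common CLI; the model path is hypothetical):

```sh
# Pin worker threads to the first 8 logical CPUs, enforce strict placement,
# and raise thread priority; the model path is a placeholder
./build/bin/llama-cli -m /path/to/model.gguf -p "test" -n 32 \
    --cpu-mask 0xFF --cpu-strict 1 --prio 2
```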

ggml/src/ggml-cpu/ops.cpp

Lines changed: 10 additions & 2 deletions
@@ -4739,6 +4739,7 @@ void ggml_compute_forward_get_rows(
     //}
 }
 
+template<typename idx_t>
 static void ggml_compute_forward_set_rows_f32(
         const ggml_compute_params * params,
         ggml_tensor * dst) {
@@ -4777,7 +4778,7 @@ static void ggml_compute_forward_set_rows_f32(
             const int64_t i11 = i02%ne11;
             const int64_t i10 = i;
 
-            const int64_t i1 = *(int64_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+            const int64_t i1 = *(idx_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
 
             GGML_ASSERT(i1 >= 0 && i1 < ne1);
 
@@ -4794,11 +4795,18 @@ void ggml_compute_forward_set_rows(
         ggml_tensor * dst) {
 
     const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
 
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_set_rows_f32(params, dst);
+                if (src1->type == GGML_TYPE_I64) {
+                    ggml_compute_forward_set_rows_f32<int64_t>(params, dst);
+                } else if (src1->type == GGML_TYPE_I32) {
+                    ggml_compute_forward_set_rows_f32<int32_t>(params, dst);
+                } else {
+                    GGML_ABORT("src1->type = %d (%s) not supported", src1->type, ggml_type_name(src1->type));
+                }
             } break;
         default:
             {

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 1 addition & 1 deletion
@@ -3440,7 +3440,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                     op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q5_0 ||
                     op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL) &&
                 op->src[0]->type == GGML_TYPE_F32 &&
-                op->src[1]->type == GGML_TYPE_I64;
+                (op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32);
             } break;
         case GGML_OP_CPY:
             {
