From 351f3da39c85f59d581fc184f09283da7f099a3b Mon Sep 17 00:00:00 2001 From: Haiyue Wang Date: Tue, 23 Sep 2025 01:57:46 +0800 Subject: [PATCH 1/8] clang-tidy : disable warning about performance enum size (#16127) Disable 'performance-enum-size' checking: Enum 'llama_token_type' uses a larger base type ('unsigned int', size: 4 bytes) than necessary for its value set, consider using 'std::uint8_t' (1 byte) as the base type to reduce its size. --- .clang-tidy | 1 + 1 file changed, 1 insertion(+) diff --git a/.clang-tidy b/.clang-tidy index 5bc63bc6e27b6..803b8b46a32f3 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -17,6 +17,7 @@ Checks: > clang-analyzer-*, -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling, performance-*, + -performance-enum-size, portability-*, -portability-simd-intrinsics, misc-*, From 1d0125bcf1cbd7195ad0faf826a20bc7cec7d3f4 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Mon, 22 Sep 2025 12:40:10 -0600 Subject: [PATCH 2/8] feat: Add conversion support in GraniteHybrid for non-hybrid (all attn) (#16177) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a configuration of the hparams in the GraniteHybrid architecture that devolves to the Granite (or GraniteMoe) architecture (i.e. Granite 3.x). It may be used for some models in the Granite 4 family with the GraniteHybrid architecture acting as a superset arch. Rather than support it directly in the C++ graph, we simply coerce the architecture flag back to the correct "granite" or "granitemoe" architecture. Branch: gabe-l-hart/GraniteNonHybridConversion Signed-off-by: Gabe Goodhart Co-authored-by: Sigbjørn Skjæret --- convert_hf_to_gguf.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 7ddec48ad7129..9ebd8567ad23f 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -7656,6 +7656,21 @@ def __init__(self, *args, **kwargs): if i not in self._attn_layers ] + # There are some models in this family that are non-hybrid, but keep the + # same parent class by setting all layers to "attention."
If this is the + # case, the model architecture needs to be updated to a standard + # "granite" or "granitemoe" model + if not self._ssm_layers: + has_experts = self.find_hparam(["num_experts_per_tok"], optional=True) + new_arch = ( + gguf.MODEL_ARCH.GRANITE_MOE + if has_experts else + gguf.MODEL_ARCH.GRANITE + ) + self.model_arch = new_arch + self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[new_arch] + self.gguf_writer.add_architecture() + # n_group and d_inner are used during reshape_tensors for mamba2 # NOTE: Explicitly include hparam prefix prefix for d_model to # disambiguate with top-level head_dim @@ -7740,8 +7755,11 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_dimension_count(rope_dim) self.gguf_writer.add_head_count_kv(head_count_kv_vec) - ## If Bamba, use rope, otherwise don't - use_rope = "BambaForCausalLM" in self.hparams["architectures"] + ## If Bamba or non-hybrid, use rope, otherwise don't + use_rope = ( + "BambaForCausalLM" in self.hparams["architectures"] + or not self._ssm_layers + ) self.gguf_writer.add_rope_scaling_finetuned(use_rope) if not use_rope: self.gguf_writer.add_context_length(2**20) From 85e72271ba1ce78adf34fd8997803c991e617ca6 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 23 Sep 2025 05:59:03 +0200 Subject: [PATCH 3/8] ggml-cpu : fix typo in gemm comments [no ci] (#16189) --- ggml/src/ggml-cpu/arch/x86/repack.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/x86/repack.cpp b/ggml/src/ggml-cpu/arch/x86/repack.cpp index d95bb6d8aafce..fe18225c28137 100644 --- a/ggml/src/ggml-cpu/arch/x86/repack.cpp +++ b/ggml/src/ggml-cpu/arch/x86/repack.cpp @@ -878,7 +878,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 64)); const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 96)); - // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess + // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240); const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240); const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240); @@ -1231,7 +1231,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64)); const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96)); - // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess + // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240); const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240); const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, 
_mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240); From 4b9f4cb0f89a88de4bdf97727d0457b0c648804c Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Tue, 23 Sep 2025 13:59:34 +0800 Subject: [PATCH 4/8] devops: add s390x containers (#15915) * devops: add s390x dockerfile Signed-off-by: Aaron Teo * devops: add missing ninja Signed-off-by: Aaron Teo * devops: move s390x docker into cpu docker Signed-off-by: Aaron Teo * devops: rework s390x docker Signed-off-by: Aaron Teo * devops: copy more tools Signed-off-by: Aaron Teo * devops: add server build step Signed-off-by: Aaron Teo * devops: remove apt clean steps as distroless misses it Signed-off-by: Aaron Teo * devops: remove apt commands from distroless Signed-off-by: Aaron Teo * devops: fix shared libs in distroless Signed-off-by: Aaron Teo * devops: use correct libs path Signed-off-by: Aaron Teo * devops: fix shared libs Signed-off-by: Aaron Teo * devops: add collector stage Signed-off-by: Aaron Teo * devops: fix missing stage ref Signed-off-by: Aaron Teo * devops: fix permission issue Signed-off-by: Aaron Teo * devops: fix unknown model loading failures Signed-off-by: Aaron Teo * devops: attempt at fixing model loading failure Signed-off-by: Aaron Teo * devops: fix missing ggml shared object failure to load model Signed-off-by: Aaron Teo * devops: remove move shared objects Signed-off-by: Aaron Teo * devops: move libggml-cpu and blas into bin Signed-off-by: Aaron Teo * devops: finalise hardened server stage Signed-off-by: Aaron Teo * devops: add cli target Signed-off-by: Aaron Teo * devops: fix typos Signed-off-by: Aaron Teo * devops: fix missing shared libraries in base Signed-off-by: Aaron Teo * devops: update debian target Signed-off-by: Aaron Teo * devops: formalise llama.cpp loc Signed-off-by: Aaron Teo * Revert "devops: formalise llama.cpp loc" This reverts commit 0a7664af8466a15f318ff209e02ac3c4e551cc18. Signed-off-by: Aaron Teo * devops: formalise llama.cpp loc Signed-off-by: Aaron Teo (cherry picked from commit 0a7664af8466a15f318ff209e02ac3c4e551cc18) Signed-off-by: Aaron Teo * devops: attempt at fixing missing dir Signed-off-by: Aaron Teo * devops: attempt at making it cache the build Signed-off-by: Aaron Teo * devops: fix copying process Signed-off-by: Aaron Teo * devops: make build dir an argument Signed-off-by: Aaron Teo * Revert "devops: make build dir an argument" This reverts commit 438698976b8a5181c1e8179600527cfd5a50cc23. Signed-off-by: Aaron Teo * devops: add build stage for gguf-py Signed-off-by: Aaron Teo * devops: move gguf-py installation into build stage Signed-off-by: Aaron Teo * devops: break system packages? 
Signed-off-by: Aaron Teo * devops: add rust compiler installer Signed-off-by: Aaron Teo * devops: fix rustc not found Signed-off-by: Aaron Teo * devops: remove cache mount to allow rustc to persist Signed-off-by: Aaron Teo * devops: move rustc installation to another layer Signed-off-by: Aaron Teo * devops: move gguf-py installation to full stage, fix copying Signed-off-by: Aaron Teo * devops: remove rustc installation in build Signed-off-by: Aaron Teo * devops: disable full target for now Signed-off-by: Aaron Teo * devops: attempting static build Signed-off-by: Aaron Teo * devops: merge s390x dockerfile into cpu for now Signed-off-by: Aaron Teo * devops: switch to gcc image for build step Signed-off-by: Aaron Teo * devops: remove build essentials Signed-off-by: Aaron Teo * devops: install openblas into base target Signed-off-by: Aaron Teo * devops: go back to s390x dockerfile Signed-off-by: Aaron Teo * devops: remove libggml and libblas Signed-off-by: Aaron Teo * devops: add full target Signed-off-by: Aaron Teo * devops: add break system packages Signed-off-by: Aaron Teo * devops: add libjpeg Signed-off-by: Aaron Teo * devops: add missing cmake dep Signed-off-by: Aaron Teo * devops: finalise docker images for s390x Signed-off-by: Aaron Teo * devops: add custom openblas patch Signed-off-by: Aaron Teo * devops: use libopenblas-dev instead of libopenblas-openmp-dev Signed-off-by: Aaron Teo * devops: add s390x docker build Signed-off-by: Aaron Teo --------- Signed-off-by: Aaron Teo --- .devops/s390x.Dockerfile | 122 +++++++++++++++++++++++++++++++++++ .github/workflows/docker.yml | 1 + 2 files changed, 123 insertions(+) create mode 100644 .devops/s390x.Dockerfile diff --git a/.devops/s390x.Dockerfile b/.devops/s390x.Dockerfile new file mode 100644 index 0000000000000..90d2b3538a025 --- /dev/null +++ b/.devops/s390x.Dockerfile @@ -0,0 +1,122 @@ +ARG GCC_VERSION=15.2.0 +ARG UBUNTU_VERSION=24.04 + +### Build Llama.cpp stage +FROM --platform=linux/s390x gcc:${GCC_VERSION} AS build + +RUN --mount=type=cache,target=/var/cache/apt \ + --mount=type=cache,target=/var/lib/apt/lists \ + apt update -y && \ + apt upgrade -y && \ + apt install -y --no-install-recommends \ + git cmake ccache ninja-build \ + # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster. + libopenblas-dev libcurl4-openssl-dev && \ + rm -rf /var/lib/apt/lists/* + +WORKDIR /app +COPY . . + +RUN --mount=type=cache,target=/root/.ccache \ + --mount=type=cache,target=/app/build \ + cmake -S . 
-B build -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DLLAMA_BUILD_TESTS=OFF \ + -DGGML_BACKEND_DL=OFF \ + -DGGML_NATIVE=OFF \ + -DGGML_BLAS=ON \ + -DGGML_BLAS_VENDOR=OpenBLAS && \ + cmake --build build --config Release -j $(nproc) && \ + cmake --install build --prefix /opt/llama.cpp + +COPY *.py /opt/llama.cpp/bin +COPY .devops/tools.sh /opt/llama.cpp/bin + +COPY gguf-py /opt/llama.cpp/gguf-py +COPY requirements.txt /opt/llama.cpp/gguf-py +COPY requirements /opt/llama.cpp/gguf-py/requirements + + +### Collect all llama.cpp binaries, libraries and distro libraries +FROM --platform=linux/s390x scratch AS collector + +# Copy llama.cpp binaries and libraries +COPY --from=build /opt/llama.cpp/bin /llama.cpp/bin +COPY --from=build /opt/llama.cpp/lib /llama.cpp/lib +COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py + + +### Base image +FROM --platform=linux/s390x ubuntu:${UBUNTU_VERSION} AS base + +RUN --mount=type=cache,target=/var/cache/apt \ + --mount=type=cache,target=/var/lib/apt/lists \ + apt update -y && \ + apt install -y --no-install-recommends \ + # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster. + curl libgomp1 libopenblas-dev && \ + apt autoremove -y && \ + apt clean -y && \ + rm -rf /tmp/* /var/tmp/* && \ + find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \ + find /var/cache -type f -delete + +# Copy llama.cpp libraries +COPY --from=collector /llama.cpp/lib /usr/lib/s390x-linux-gnu + + +### Full +FROM --platform=linux/s390x base AS full + +ENV PATH="/root/.cargo/bin:${PATH}" +WORKDIR /app + +RUN --mount=type=cache,target=/var/cache/apt \ + --mount=type=cache,target=/var/lib/apt/lists \ + apt update -y && \ + apt install -y \ + git cmake libjpeg-dev \ + python3 python3-pip python3-dev && \ + apt autoremove -y && \ + apt clean -y && \ + rm -rf /tmp/* /var/tmp/* && \ + find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \ + find /var/cache -type f -delete + +RUN curl https://sh.rustup.rs -sSf | bash -s -- -y + +COPY --from=collector /llama.cpp/bin /app +COPY --from=collector /llama.cpp/gguf-py /app/gguf-py + +RUN pip install --no-cache-dir --break-system-packages \ + -r /app/gguf-py/requirements.txt + +ENTRYPOINT [ "/app/tools.sh" ] + + +### CLI Only +FROM --platform=linux/s390x base AS light + +WORKDIR /llama.cpp/bin + +# Copy llama.cpp binaries and libraries +COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin + +ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ] + + +### Server +FROM --platform=linux/s390x base AS server + +ENV LLAMA_ARG_HOST=0.0.0.0 + +WORKDIR /llama.cpp/bin + +# Copy llama.cpp binaries and libraries +COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin + +EXPOSE 8080 + +ENTRYPOINT [ "/llama.cpp/bin/llama-server" ] diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 2067927be56ca..542621b077225 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -44,6 +44,7 @@ jobs: - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true } - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true } - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false } + - { tag: "s390x", 
dockerfile: ".devops/s390x.Dockerfile", platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false } # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true } steps: From 0bc7cc715472fe28822e57f036f0746592ed2c04 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 23 Sep 2025 08:13:22 +0200 Subject: [PATCH 5/8] codeowners : add @danbev to model-conversion example [no ci] (#16190) This commit adds examples/model-conversion/ to the CODEOWNERS file and assigns myself (@danbev) as the code owner for this directory. --- CODEOWNERS | 1 + 1 file changed, 1 insertion(+) diff --git a/CODEOWNERS b/CODEOWNERS index 5460003c7d193..d5a631ad56a20 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -34,6 +34,7 @@ /examples/llama.vim @ggerganov /examples/lookahead/ @ggerganov /examples/lookup/ @JohannesGaessler +/examples/model-conversion/ @danbev /examples/parallel/ @ggerganov /examples/passkey/ @ggerganov /examples/retrieval/ @ggerganov From 264f1b51872c125e23fa0ac1da5e2a1170de9a08 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Tue, 23 Sep 2025 14:53:05 +0800 Subject: [PATCH 6/8] zdnn: refactor codebase + add docs (#16178) * zdnn: initial matmul refactor Signed-off-by: Aaron Teo * ggml-zdnn: rm static from funcs Signed-off-by: Aaron Teo * ggml-zdnn: update ggml-zdnn.h Signed-off-by: Aaron Teo * ggml-zdnn: change header files to hpp Signed-off-by: Aaron Teo * ggml-zdnn: switch to common.hpp Signed-off-by: Aaron Teo * ggml-zdnn: move mulmat forward around Signed-off-by: Aaron Teo * ggml-zdnn: rm inline from utils Signed-off-by: Aaron Teo * ggml-zdnn: code cleanup Signed-off-by: Aaron Teo * docs: add zDNN docs Signed-off-by: Aaron Teo --------- Signed-off-by: Aaron Teo --- README.md | 1 + docs/backend/zDNN.md | 61 +++++++++ ggml/include/ggml-zdnn.h | 3 + ggml/src/ggml-zdnn/.gitignore | 1 + ggml/src/ggml-zdnn/common.hpp | 59 +++++++++ ggml/src/ggml-zdnn/ggml-zdnn-impl.h | 98 --------------- ggml/src/ggml-zdnn/ggml-zdnn.cpp | 187 +++------------------------- ggml/src/ggml-zdnn/mmf.cpp | 80 ++++++++++++ ggml/src/ggml-zdnn/mmf.hpp | 12 ++ ggml/src/ggml-zdnn/utils.cpp | 79 ++++++++++++ ggml/src/ggml-zdnn/utils.hpp | 19 +++ 11 files changed, 334 insertions(+), 266 deletions(-) create mode 100644 docs/backend/zDNN.md create mode 100644 ggml/src/ggml-zdnn/.gitignore create mode 100644 ggml/src/ggml-zdnn/common.hpp delete mode 100644 ggml/src/ggml-zdnn/ggml-zdnn-impl.h create mode 100644 ggml/src/ggml-zdnn/mmf.cpp create mode 100644 ggml/src/ggml-zdnn/mmf.hpp create mode 100644 ggml/src/ggml-zdnn/utils.cpp create mode 100644 ggml/src/ggml-zdnn/utils.hpp diff --git a/README.md b/README.md index fcd7014a20111..17754b6627976 100644 --- a/README.md +++ b/README.md @@ -274,6 +274,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo | [Vulkan](docs/build.md#vulkan) | GPU | | [CANN](docs/build.md#cann) | Ascend NPU | | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU | +| [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE | | [WebGPU [In Progress]](docs/build.md#webgpu) | All | | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All | diff --git a/docs/backend/zDNN.md b/docs/backend/zDNN.md new file mode 100644 index 0000000000000..8d2e111772217 --- /dev/null +++ b/docs/backend/zDNN.md @@ -0,0 +1,61 @@ +# llama.cpp for 
IBM zDNN Accelerator + +## Background + +IBM zDNN (Z Deep Neural Network) is a hardware acceleration library designed specifically to leverage the IBM NNPA (Neural Network Processor Assist) accelerator located within IBM Telum I and II processors. It provides significant performance improvements for neural network inference operations. + +### Llama.cpp + IBM zDNN + +The llama.cpp zDNN backend is designed to enable llama.cpp on IBM z17 and later systems via the IBM zDNN hardware acceleration library. + +## Software & Hardware Support + +| Hardware Level | Status | Verified | +| -------------------- | ------------- | -------------------------- | +| IBM z17 / LinuxONE 5 | Supported | RHEL 9.6, IBM z17, 40 IFLs | +| IBM z16 / LinuxONE 4 | Not Supported | | + +## Data Types Supported + +| Data Type | Status | +| --------- | --------- | +| F32 | Supported | +| F16 | Supported | +| BF16 | Supported | + +## CMake Options + +The IBM zDNN backend has the following CMake options that control the behaviour of the backend. + +| CMake Option | Default Value | Description | +| ------------ | ------------- | ----------------------------------- | +| `GGML_ZDNN` | `OFF` | Compile llama.cpp with zDNN support | +| `ZDNN_ROOT` | `""` | Override zDNN library lookup | + +## 1. Install zDNN Library + +Note: Using the zDNN library provided via `apt` or `yum` may not work correctly as reported in [#15772](https://github.com/ggml-org/llama.cpp/issues/15772). It is preferred that you compile from source. + +```sh +git clone --recurse-submodules https://github.com/IBM/zDNN +cd zDNN + +autoreconf . +./configure --prefix=/opt/zdnn-libs + +make build +sudo make install +``` + +## 2. Build llama.cpp + +```sh +git clone https://github.com/ggml-org/llama.cpp +cd llama.cpp + +cmake -S . -G Ninja -B build \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_ZDNN=ON \ + -DZDNN_ROOT=/opt/zdnn-libs +cmake --build build --config Release -j$(nproc) +``` diff --git a/ggml/include/ggml-zdnn.h b/ggml/include/ggml-zdnn.h index 69fb558d873cd..fbf45b6e1c34c 100644 --- a/ggml/include/ggml-zdnn.h +++ b/ggml/include/ggml-zdnn.h @@ -7,6 +7,9 @@ extern "C" { #endif +// device buffer +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_zdnn_buffer_type(void); + GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zdnn_reg(void); #ifdef __cplusplus diff --git a/ggml/src/ggml-zdnn/.gitignore b/ggml/src/ggml-zdnn/.gitignore new file mode 100644 index 0000000000000..8322c0f8e6409 --- /dev/null +++ b/ggml/src/ggml-zdnn/.gitignore @@ -0,0 +1 @@ +zdnn.h diff --git a/ggml/src/ggml-zdnn/common.hpp b/ggml/src/ggml-zdnn/common.hpp new file mode 100644 index 0000000000000..2462ded55b7fc --- /dev/null +++ b/ggml/src/ggml-zdnn/common.hpp @@ -0,0 +1,59 @@ +#ifndef GGML_ZDNN_COMMON_HPP +#define GGML_ZDNN_COMMON_HPP + +#include "ggml.h" +#include "ggml-impl.h" + +#include "zdnn.h" + +#include +#include + +#define GGML_ZDNN_NAME "zDNN" +#define GGML_ZDNN_VERSION ZDNN_VERNUM + +#define ZDNN_CHECK(stmt) \ + do { \ + zdnn_status status = (stmt); \ + GGML_ASSERT(status == ZDNN_OK); \ + } while (0); + +struct ggml_backend_zdnn_device_context { + int zdnn_device; + int zdnn_device_ref_count; + + bool has_parmblkformat_0; + bool has_parmblkformat_1; // checks for z17 + + size_t max_size; + + char name[128]; +}; + +struct ggml_backend_zdnn_context { + int device; + ggml_cgraph * gf; +}; + +struct ggml_backend_zdnn_buffer { + void * data; + ggml_backend_zdnn_buffer * extra; // for bias, etc. 
+ size_t size; + + zdnn_tensor_desc pre_tfm_desc; + zdnn_tensor_desc tfm_desc; + zdnn_ztensor ztensor; + + char name[GGML_MAX_NAME]; +}; + +struct ggml_backend_zdnn_buffer_context { + void * all_data; + size_t all_size; + bool owned; + + int n_buffers; + std::vector> buffers; +}; + +#endif // GGML_ZDNN_COMMON_HPP diff --git a/ggml/src/ggml-zdnn/ggml-zdnn-impl.h b/ggml/src/ggml-zdnn/ggml-zdnn-impl.h deleted file mode 100644 index a4153818158e5..0000000000000 --- a/ggml/src/ggml-zdnn/ggml-zdnn-impl.h +++ /dev/null @@ -1,98 +0,0 @@ -#ifndef GGML_ZDNN_IMPL -#define GGML_ZDNN_IMPL - -#include "zdnn.h" -#include "ggml.h" -#include "ggml-zdnn.h" - -#include -#include -#include - -#define GGML_ZDNN_NAME "zDNN" -#define GGML_ZDNN_VERSION ZDNN_VERNUM - -#define vec_neg(a) (-(a)) // Vector Negate -#define vec_add(a, b) ((a) + (b)) // Vector Add -#define vec_sub(a, b) ((a) - (b)) // Vector Subtract -#define vec_mul(a, b) ((a) * (b)) // Vector Multiply -#define vec_div(a, b) ((a) / (b)) // Vector Divide -#define vec_sl(a, b) ((a) << (b)) // Vector Shift Left -#define vec_sra(a, b) ((a) >> (b)) // Vector Shift Right -#define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right Algebraic -#define vec_slo(a, b) vec_slb(a, (b) << 64) // Vector Shift Left by Octet -#define vec_sro(a, b) vec_srb(a, (b) << 64) // Vector Shift Right by Octet - -#ifndef vec_and -#define vec_and(a, b) ((a) & (b)) // Vector AND -#endif - -#ifndef vec_or -#define vec_or(a, b) ((a) | (b)) // Vector OR -#endif - -#ifndef vec_xor -#define vec_xor(a, b) ((a) ^ (b)) // Vector XOR -#endif - -typedef signed char char8x16_t __attribute__((vector_size(16))); -typedef unsigned char uchar8x16_t __attribute__((vector_size(16))); - -typedef int8_t int8x16_t __attribute__((vector_size(16))); -typedef int16_t int16x8_t __attribute__((vector_size(16))); -typedef int32_t int32x4_t __attribute__((vector_size(16))); -typedef uint8_t uint8x16_t __attribute__((vector_size(16))); -typedef uint16_t uint16x8_t __attribute__((vector_size(16))); -typedef uint32_t uint32x4_t __attribute__((vector_size(16))); - -typedef float float32x4_t __attribute__((vector_size(16))); -typedef double double64x2_t __attribute__((vector_size(16))); - -typedef signed long long long64x2_t __attribute__((vector_size(16))); -typedef unsigned long long ulong64x2_t __attribute__((vector_size(16))); - -#define ZDNN_CHECK(stmt) \ - do { \ - zdnn_status status = (stmt); \ - GGML_ASSERT(status == ZDNN_OK); \ - } while (0); - -struct ggml_backend_zdnn_device_context { - int zdnn_device; - int zdnn_device_ref_count; - - bool has_parmblkformat_0; - bool has_parmblkformat_1; - - size_t max_size; - - char name[128]; -}; - -struct ggml_backend_zdnn_context { - int device; - ggml_cgraph * gf; -}; - -struct ggml_backend_zdnn_buffer { - void * data; - ggml_backend_zdnn_buffer * extra; // for bias, etc. 
- size_t size; - - zdnn_tensor_desc pre_tfm_desc; - zdnn_tensor_desc tfm_desc; - zdnn_ztensor ztensor; - - char name[GGML_MAX_NAME]; -}; - -struct ggml_backend_zdnn_buffer_context { - void * all_data; - size_t all_size; - bool owned; - - int n_buffers; - std::vector> buffers; -}; - -#endif // GGML_ZDNN_IMPL diff --git a/ggml/src/ggml-zdnn/ggml-zdnn.cpp b/ggml/src/ggml-zdnn/ggml-zdnn.cpp index 57a8f266201b5..edbeb8eef2458 100644 --- a/ggml/src/ggml-zdnn/ggml-zdnn.cpp +++ b/ggml/src/ggml-zdnn/ggml-zdnn.cpp @@ -1,188 +1,39 @@ -#include "zdnn.h" #include "ggml-zdnn.h" -#include "ggml-zdnn-impl.h" - #include "ggml-impl.h" #include "ggml-backend-impl.h" +#include "ggml-zdnn/common.hpp" +#include "ggml-zdnn/mmf.hpp" +#include "ggml-zdnn/utils.hpp" +#include "ggml.h" + #include #include -#include +#include // raise(SIGTRAP) #include -inline zdnn_data_types ggml_zdnn_type_mapping(ggml_type type) { - switch (type) { - case GGML_TYPE_F32: - return FP32; - case GGML_TYPE_F16: - return FP16; - case GGML_TYPE_BF16: - return BFLOAT; - case GGML_TYPE_I8: - return INT8; - case GGML_TYPE_I32: - return INT32; - case GGML_TYPE_Q8_0: - return INT8; - default: - GGML_ABORT("%s: fatal: unable to determine zTensor data type", - __func__); - break; - } -} +static void ggml_zdnn_compute_forward_mul_mat( + const ggml_backend_zdnn_context * ctx, + ggml_tensor * dst) { -inline void ggml_zdnn_create_tensor(zdnn_tensor_desc & pre_tfm_desc, - zdnn_tensor_desc & tfm_desc, - zdnn_ztensor & ztensor, - const ggml_tensor * src, - const int64_t * ne, - const zdnn_data_layouts layout) { - zdnn_init_pre_transformed_desc( - layout, - ggml_zdnn_type_mapping(src->type), - &pre_tfm_desc, - ne[3], ne[2], ne[1], ne[0] - ); + const ggml_tensor * src0 = dst->src[0]; // weights + const ggml_tensor * src1 = dst->src[1]; // inputs - ZDNN_CHECK(zdnn_generate_transformed_desc(&pre_tfm_desc, &tfm_desc)); - ZDNN_CHECK(zdnn_init_ztensor_with_malloc(&pre_tfm_desc, &tfm_desc, &ztensor)); + // TODO: implement support for quantized types + // we currently only support f32, f16, and bf16 + ggml_zdnn_mul_mat_f(ctx, src0, src1, dst); } -inline void ggml_zdnn_load_tensor(zdnn_ztensor & ztensor, - void * buffer) { - ZDNN_CHECK(zdnn_transform_ztensor(&ztensor, buffer)); -} +static bool ggml_zdnn_compute_forward( + ggml_backend_zdnn_context * ctx, + ggml_tensor * dst) { -inline void ggml_zdnn_init_tensor(ggml_backend_zdnn_buffer * buffer, const ggml_tensor * tensor) { - switch (tensor->op) { + switch (dst->op) { case GGML_OP_MUL_MAT: { - zdnn_init_pre_transformed_desc( - ZDNN_2D, - ggml_zdnn_type_mapping(tensor->type), - &buffer->pre_tfm_desc, - tensor->ne[1], tensor->ne[0] - ); + ggml_zdnn_compute_forward_mul_mat(ctx, dst); } break; - default: - { - // For 4D tensors, GGML uses NCHW layout. However, because zDNN - // automatically transforms everything to NHWC, we will use it - // directly to avoid the performance penalty changing the - // layout and reshaping the tensor. - zdnn_init_pre_transformed_desc( - ZDNN_NHWC, - ggml_zdnn_type_mapping(tensor->type), - &buffer->pre_tfm_desc, - tensor->ne[3], tensor->ne[2], tensor->ne[1], tensor->ne[0] - ); - - // TODO: Consider adding a ggml check. - // TODO: If tensor = 4D, use ZDNN_NCHW by default. - // TODO: If tensor = 2D, use ZDNN_NHWC by default. 
- } break; - } - - ZDNN_CHECK(zdnn_generate_transformed_desc(&buffer->pre_tfm_desc, &buffer->tfm_desc)); - ZDNN_CHECK(zdnn_init_ztensor_with_malloc(&buffer->pre_tfm_desc, &buffer->tfm_desc, &buffer->ztensor)); -} - -static void ggml_zdnn_mul_mat_op(ggml_backend_zdnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_TENSOR_BINARY_OP_LOCALS; - - const enum ggml_type type = src0->type; - - GGML_ASSERT(ne0 == ne01); - GGML_ASSERT(ne1 == ne11); - GGML_ASSERT(ne2 == ne12); - GGML_ASSERT(ne3 == ne13); - - // we don't support permuted src0 or src1 - GGML_ASSERT(nb00 == ggml_type_size(type)); - GGML_ASSERT(nb10 == ggml_type_size(src1->type)); - - // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); - - const ggml_tensor * weights = src0; - const ggml_tensor * inputs = src1; - ggml_tensor * output = dst; - - ggml_backend_zdnn_buffer * weights_extra = (ggml_backend_zdnn_buffer *)weights->extra; - ggml_backend_zdnn_buffer * inputs_extra = (ggml_backend_zdnn_buffer *)inputs->extra; - ggml_backend_zdnn_buffer * output_extra = (ggml_backend_zdnn_buffer *)output->extra; - ggml_backend_zdnn_buffer * bias_extra = (ggml_backend_zdnn_buffer *)output_extra->extra; - - const int64_t weights_rows = ne01; - const int64_t weights_cols = ne00; - const int64_t inputs_rows = ne11; - const int64_t inputs_cols = ne10; - - assert(inputs_cols == weights_cols); - - const int64_t output_rows = ne1; - const int64_t output_cols = ne0; - - // GGML_LOG_INFO("%s: tensor '%s' tensor dimensions: [%ld, %ld, %ld, %ld] pre_tfm_desc dimensions: [%ld, %ld, %ld, %ld]\n", - // __func__, weights_extra->name, - // weights->ne[3], weights->ne[2], weights->ne[1], weights->ne[0], - // weights_extra->pre_tfm_desc.dim1, - // weights_extra->pre_tfm_desc.dim2, - // weights_extra->pre_tfm_desc.dim3, - // weights_extra->pre_tfm_desc.dim4); - - // GGML_LOG_INFO("%s: tensor '%s' tensor dimensions: [%ld, %ld, %ld, %ld] pre_tfm_desc dimensions: [%ld, %ld, %ld, %ld]\n", - // __func__, inputs_extra->name, - // inputs->ne[3], inputs->ne[2], inputs->ne[1], inputs->ne[0], - // inputs_extra->pre_tfm_desc.dim1, - // inputs_extra->pre_tfm_desc.dim2, - // inputs_extra->pre_tfm_desc.dim3, - // inputs_extra->pre_tfm_desc.dim4); - - GGML_ASSERT(weights_extra->pre_tfm_desc.dim1 == weights->ne[0] && "weights_extra->pre_tfm_desc.dim1 must match weights->ne[0]"); - GGML_ASSERT(weights_extra->pre_tfm_desc.dim2 == weights->ne[1] && "weights_extra->pre_tfm_desc.dim2 must match weights->ne[1]"); - GGML_ASSERT(inputs_extra->pre_tfm_desc.dim1 == inputs->ne[0] && "inputs_extra->pre_tfm_desc.dim1 must match inputs->ne[0]"); - GGML_ASSERT(inputs_extra->pre_tfm_desc.dim2 == inputs->ne[1] && "inputs_extra->pre_tfm_desc.dim2 must match inputs->ne[1]"); - - ZDNN_CHECK(zdnn_matmul_transpose_op(&inputs_extra->ztensor, &weights_extra->ztensor, &bias_extra->ztensor, - false, true, MATMUL_OP_ADDITION, &output_extra->ztensor)); - // TODO: Remove in the future as we are currently DLF16 -> FP32 then in the next op, FP32 -> DLF16 again. Inefficient. 
- ZDNN_CHECK(zdnn_transform_origtensor(&output_extra->ztensor, output->data)); - - GGML_UNUSED(ctx); - GGML_UNUSED(weights_rows); - GGML_UNUSED(weights_cols); - GGML_UNUSED(inputs_rows); - GGML_UNUSED(inputs_cols); - GGML_UNUSED(output_rows); - GGML_UNUSED(output_cols); -} - -static void ggml_zdnn_mul_mat_dispatch(ggml_backend_zdnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - // debug helpers - // GGML_LOG_INFO("%s: use_mul_mat_vec = %d\n", __func__, use_mul_mat_vec); - // GGML_LOG_INFO("%s: use_mul_mat_vec_q = %d\n", __func__, use_mul_mat_vec_q); - // GGML_LOG_INFO("%s: use_mul_mat_q = %d\n", __func__, use_mul_mat_q); - // GGML_LOG_INFO("%s: src0: %8d %8d %8d %8d\n", __func__, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); - // GGML_LOG_INFO("%s: %8d %8d %8d %8d\n", __func__, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]); - // GGML_LOG_INFO("%s: src1: %8d %8d %8d %8d\n", __func__, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]); - // GGML_LOG_INFO("%s: %8d %8d %8d %8d\n", __func__, src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]); - // GGML_LOG_INFO("%s: src0 is contiguous %d, transposed %d, type = %s, name = %s\n", __func__, ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); - // GGML_LOG_INFO("%s: src1 is contiguous %d, transposed %d, type = %s, name = %s\n", __func__, ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); - - ggml_zdnn_mul_mat_op(ctx, src0, src1, dst); -} - -static bool ggml_zdnn_compute_forward(ggml_backend_zdnn_context * ctx, ggml_tensor * dst) { - switch (dst->op) { - case GGML_OP_MUL_MAT: - ggml_zdnn_mul_mat_dispatch(ctx, dst->src[0], dst->src[1], dst); - break; - default: return false; } diff --git a/ggml/src/ggml-zdnn/mmf.cpp b/ggml/src/ggml-zdnn/mmf.cpp new file mode 100644 index 0000000000000..3ac9cf3c931e3 --- /dev/null +++ b/ggml/src/ggml-zdnn/mmf.cpp @@ -0,0 +1,80 @@ +#include "ggml.h" +#include "mmf.hpp" + +void ggml_zdnn_mul_mat_f( + const ggml_backend_zdnn_context * ctx, + const ggml_tensor * src0, + const ggml_tensor * src1, + ggml_tensor * dst) { + GGML_TENSOR_BINARY_OP_LOCALS; + + const enum ggml_type type = src0->type; + + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == ggml_type_size(src1->type)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + const ggml_tensor * weights = src0; + const ggml_tensor * inputs = src1; + ggml_tensor * output = dst; + + ggml_backend_zdnn_buffer * weights_extra = (ggml_backend_zdnn_buffer *)weights->extra; + ggml_backend_zdnn_buffer * inputs_extra = (ggml_backend_zdnn_buffer *)inputs->extra; + ggml_backend_zdnn_buffer * output_extra = (ggml_backend_zdnn_buffer *)output->extra; + ggml_backend_zdnn_buffer * bias_extra = (ggml_backend_zdnn_buffer *)output_extra->extra; + + const int64_t weights_rows = ne01; + const int64_t weights_cols = ne00; + const int64_t inputs_rows = ne11; + const int64_t inputs_cols = ne10; + + assert(inputs_cols == weights_cols); + + const int64_t output_rows = ne1; + const int64_t output_cols = ne0; + + // GGML_LOG_INFO("%s: tensor '%s' tensor dimensions: [%ld, %ld, %ld, %ld] pre_tfm_desc dimensions: [%ld, %ld, %ld, %ld]\n", + // __func__, weights_extra->name, + // 
weights->ne[3], weights->ne[2], weights->ne[1], weights->ne[0], + // weights_extra->pre_tfm_desc.dim1, + // weights_extra->pre_tfm_desc.dim2, + // weights_extra->pre_tfm_desc.dim3, + // weights_extra->pre_tfm_desc.dim4); + + // GGML_LOG_INFO("%s: tensor '%s' tensor dimensions: [%ld, %ld, %ld, %ld] pre_tfm_desc dimensions: [%ld, %ld, %ld, %ld]\n", + // __func__, inputs_extra->name, + // inputs->ne[3], inputs->ne[2], inputs->ne[1], inputs->ne[0], + // inputs_extra->pre_tfm_desc.dim1, + // inputs_extra->pre_tfm_desc.dim2, + // inputs_extra->pre_tfm_desc.dim3, + // inputs_extra->pre_tfm_desc.dim4); + + GGML_ASSERT(weights_extra->pre_tfm_desc.dim1 == weights->ne[0] && "weights_extra->pre_tfm_desc.dim1 must match weights->ne[0]"); + GGML_ASSERT(weights_extra->pre_tfm_desc.dim2 == weights->ne[1] && "weights_extra->pre_tfm_desc.dim2 must match weights->ne[1]"); + GGML_ASSERT(inputs_extra->pre_tfm_desc.dim1 == inputs->ne[0] && "inputs_extra->pre_tfm_desc.dim1 must match inputs->ne[0]"); + GGML_ASSERT(inputs_extra->pre_tfm_desc.dim2 == inputs->ne[1] && "inputs_extra->pre_tfm_desc.dim2 must match inputs->ne[1]"); + + ZDNN_CHECK(zdnn_matmul_transpose_op(&inputs_extra->ztensor, &weights_extra->ztensor, &bias_extra->ztensor, + false, true, MATMUL_OP_ADDITION, &output_extra->ztensor)); + // TODO: Remove in the future as we are currently DLF16 -> FP32 then in the next op, FP32 -> DLF16 again. Inefficient. + ZDNN_CHECK(zdnn_transform_origtensor(&output_extra->ztensor, output->data)); + + GGML_UNUSED(ctx); + GGML_UNUSED(weights_rows); + GGML_UNUSED(weights_cols); + GGML_UNUSED(inputs_rows); + GGML_UNUSED(inputs_cols); + GGML_UNUSED(output_rows); + GGML_UNUSED(output_cols); +} diff --git a/ggml/src/ggml-zdnn/mmf.hpp b/ggml/src/ggml-zdnn/mmf.hpp new file mode 100644 index 0000000000000..a12f1b8f8a0ee --- /dev/null +++ b/ggml/src/ggml-zdnn/mmf.hpp @@ -0,0 +1,12 @@ +#ifndef GGML_ZDNN_MMF_HPP +#define GGML_ZDNN_MMF_HPP + +#include "common.hpp" + +void ggml_zdnn_mul_mat_f( + const ggml_backend_zdnn_context * ctx, + const ggml_tensor * src0, + const ggml_tensor * src1, + ggml_tensor * dst); + +#endif // GGML_ZDNN_MMF_HPP diff --git a/ggml/src/ggml-zdnn/utils.cpp b/ggml/src/ggml-zdnn/utils.cpp new file mode 100644 index 0000000000000..2977cb0fe3bdf --- /dev/null +++ b/ggml/src/ggml-zdnn/utils.cpp @@ -0,0 +1,79 @@ +#include "ggml.h" +#include "utils.hpp" + +zdnn_data_types ggml_zdnn_type_mapping(ggml_type type) { + switch (type) { + case GGML_TYPE_F32: + return FP32; + case GGML_TYPE_F16: + return FP16; + case GGML_TYPE_BF16: + return BFLOAT; + case GGML_TYPE_Q8_0: + return INT8; + case GGML_TYPE_I8: + return INT8; + case GGML_TYPE_I32: + return INT32; + default: + GGML_ABORT("%s: fatal: unable to determine zTensor data type", + __func__); + break; + } +} + +void ggml_zdnn_create_tensor(zdnn_tensor_desc & pre_tfm_desc, + zdnn_tensor_desc & tfm_desc, + zdnn_ztensor & ztensor, + const ggml_tensor * src, + const int64_t * ne, + const zdnn_data_layouts layout) { + zdnn_init_pre_transformed_desc( + layout, + ggml_zdnn_type_mapping(src->type), + &pre_tfm_desc, + ne[3], ne[2], ne[1], ne[0] + ); + + ZDNN_CHECK(zdnn_generate_transformed_desc(&pre_tfm_desc, &tfm_desc)); + ZDNN_CHECK(zdnn_init_ztensor_with_malloc(&pre_tfm_desc, &tfm_desc, &ztensor)); +} + +void ggml_zdnn_load_tensor(zdnn_ztensor & ztensor, void * buffer) { + ZDNN_CHECK(zdnn_transform_ztensor(&ztensor, buffer)); +} + +void ggml_zdnn_init_tensor(ggml_backend_zdnn_buffer * buffer, const ggml_tensor * tensor) { + switch (tensor->op) { + case GGML_OP_MUL_MAT: + { + 
zdnn_init_pre_transformed_desc( + ZDNN_2D, + ggml_zdnn_type_mapping(tensor->type), + &buffer->pre_tfm_desc, + tensor->ne[1], tensor->ne[0] + ); + } break; + + default: + { + // For 4D tensors, GGML uses NCHW layout. However, because zDNN + // automatically transforms everything to NHWC, we will use it + // directly to avoid the performance penalty changing the + // layout and reshaping the tensor. + zdnn_init_pre_transformed_desc( + ZDNN_NHWC, + ggml_zdnn_type_mapping(tensor->type), + &buffer->pre_tfm_desc, + tensor->ne[3], tensor->ne[2], tensor->ne[1], tensor->ne[0] + ); + + // TODO: Consider adding a ggml check. + // TODO: If tensor = 4D, use ZDNN_NCHW by default. + // TODO: If tensor = 2D, use ZDNN_NHWC by default. + } break; + } + + ZDNN_CHECK(zdnn_generate_transformed_desc(&buffer->pre_tfm_desc, &buffer->tfm_desc)); + ZDNN_CHECK(zdnn_init_ztensor_with_malloc(&buffer->pre_tfm_desc, &buffer->tfm_desc, &buffer->ztensor)); +} diff --git a/ggml/src/ggml-zdnn/utils.hpp b/ggml/src/ggml-zdnn/utils.hpp new file mode 100644 index 0000000000000..c1e2028edbca7 --- /dev/null +++ b/ggml/src/ggml-zdnn/utils.hpp @@ -0,0 +1,19 @@ +#ifndef GGML_ZDNN_UTILITIES_HPP +#define GGML_ZDNN_UTILITIES_HPP + +#include "common.hpp" + +zdnn_data_types ggml_zdnn_type_mapping(ggml_type type); + +void ggml_zdnn_create_tensor(zdnn_tensor_desc & pre_tfm_desc, + zdnn_tensor_desc & tfm_desc, + zdnn_ztensor & ztensor, + const ggml_tensor * src, + const int64_t * ne, + const zdnn_data_layouts layout); + +void ggml_zdnn_load_tensor(zdnn_ztensor & ztensor, void * buffer); + +void ggml_zdnn_init_tensor(ggml_backend_zdnn_buffer * buffer, const ggml_tensor * tensor); + +#endif // GGML_ZDNN_UTILITIES_HPP From f6b4af3d04763b1e0130f5b5fce19c4bc6f83f1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Tue, 23 Sep 2025 10:25:20 +0200 Subject: [PATCH 7/8] ggml : fix uninitialized is_on_grid in quantize_row_iq3_xxs_impl (#15928) * fix uninitialized is_on_grid in quantize_row_iq3_xxs_impl * change initialization to true --- ggml/src/ggml-quants.c | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 727932123e41b..de5cbd75e868e 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -3721,6 +3721,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * GGML_RESTRICT } float best = 0; float scale = max/(2*kMaxQ-1); + for (int k = 0; k < 8; ++k) is_on_grid[k] = true; for (int is = -15; is <= 15; ++is) { float id = (2*kMaxQ-1+is*0.2f)/max; float this_scale = 1/id; From 4e29084ba4104c4ea529fd3163bb6e76f64383df Mon Sep 17 00:00:00 2001 From: Xiangyan Sun Date: Tue, 23 Sep 2025 01:58:12 -0700 Subject: [PATCH 8/8] ggml-cpu: Respect cpumask settings (#16164) --- ggml/src/ggml-cpu/ggml-cpu.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index c131290849538..dbc07301b296e 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -473,10 +473,10 @@ struct ggml_threadpool { struct ggml_compute_state { #ifndef GGML_USE_OPENMP ggml_thread_t thrd; - bool cpumask[GGML_MAX_N_THREADS]; int last_graph; bool pending; #endif + bool cpumask[GGML_MAX_N_THREADS]; struct ggml_threadpool * threadpool; int ith; }; @@ -3081,7 +3081,14 @@ static struct ggml_threadpool * ggml_threadpool_new_impl( threadpool->workers = workers; -#ifndef GGML_USE_OPENMP +#ifdef GGML_USE_OPENMP + int32_t cpumask_iter = 0; + + // Compute CPU masks for each 
thread + for (int j = 0; j < tpp->n_threads; j++) { + ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter); + } +#else // GGML_USE_OPENMP ggml_mutex_init(&threadpool->mutex); ggml_cond_init(&threadpool->cond); @@ -3154,7 +3161,14 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed); } - ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]); + // Apply thread CPU mask and priority + int ith = omp_get_thread_num(); + + ggml_thread_apply_priority(threadpool->prio); + if (ggml_thread_cpumask_is_valid(threadpool->workers[ith].cpumask)) { + ggml_thread_apply_affinity(threadpool->workers[ith].cpumask); + } + ggml_graph_compute_thread(&threadpool->workers[ith]); } } else { atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);