diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index ee76d1799e6f4..867a589ce1648 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -839,12 +839,12 @@ jobs:
           -DGGML_CUDA=ON
         cmake --build build

-  windows-2019-cmake-cuda:
-    runs-on: windows-2019
+  windows-2022-cmake-cuda:
+    runs-on: windows-2022

    strategy:
      matrix:
-        cuda: ['12.4', '11.7']
+        cuda: ['12.4']

    steps:
      - name: Clone
@@ -878,7 +878,7 @@ jobs:
        env:
          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
-          call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
+          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
          cmake -S . -B build -G "Ninja Multi-Config" ^
            -DLLAMA_BUILD_SERVER=ON ^
            -DGGML_NATIVE=OFF ^
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 65ed244657e4f..9874736cbd8de 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -131,8 +131,9 @@ jobs:
        include:
          - build: 'x64'
            os: ubuntu-22.04
-          - build: 'arm64'
-            os: ubuntu-22.04-arm
+          # GGML_BACKEND_DL and GGML_CPU_ALL_VARIANTS are not currently supported on arm
+          # - build: 'arm64'
+          #   os: ubuntu-22.04-arm

    runs-on: ${{ matrix.os }}

@@ -159,6 +160,9 @@ jobs:
        id: cmake_build
        run: |
          cmake -B build \
+            -DGGML_BACKEND_DL=ON \
+            -DGGML_NATIVE=OFF \
+            -DGGML_CPU_ALL_VARIANTS=ON \
            -DLLAMA_FATAL_WARNINGS=ON \
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(nproc)
@@ -207,6 +211,9 @@ jobs:
        id: cmake_build
        run: |
          cmake -B build \
+            -DGGML_BACKEND_DL=ON \
+            -DGGML_NATIVE=OFF \
+            -DGGML_CPU_ALL_VARIANTS=ON \
            -DGGML_VULKAN=ON \
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(nproc)
@@ -373,11 +380,11 @@ jobs:
          name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip

  windows-cuda:
-    runs-on: windows-2019
+    runs-on: windows-2022

    strategy:
      matrix:
-        cuda: ['12.4', '11.7']
+        cuda: ['12.4']

    steps:
      - name: Clone
@@ -405,7 +412,7 @@ jobs:
        id: cmake_build
        shell: cmd
        run: |
-          call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
+          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
          cmake -S . -B build -G "Ninja Multi-Config" ^
            -DGGML_BACKEND_DL=ON ^
            -DGGML_NATIVE=OFF ^
diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
index 4baf6f6c755ee..f6da488576937 100644
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -180,7 +180,7 @@ jobs:

  server-windows:
-    runs-on: windows-2019
+    runs-on: windows-2022

    steps:
      - name: Clone
diff --git a/README.md b/README.md
index 576332bc5d33d..91401fa98033c 100644
--- a/README.md
+++ b/README.md
@@ -28,6 +28,30 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ----

+## Quick start
+
+Getting started with llama.cpp is straightforward. Here are several ways to install it on your machine:
+
+- Install `llama.cpp` using [brew, nix or winget](docs/install.md)
+- Run with Docker - see our [Docker documentation](docs/docker.md)
+- Download pre-built binaries from the [releases page](https://github.com/ggml-org/llama.cpp/releases)
+- Build from source by cloning this repository - check out [our build guide](docs/build.md)
+
+Once installed, you'll need a model to work with. Head to the [Obtaining and quantizing models](#obtaining-and-quantizing-models) section to learn more.
+
+Example commands:
+
+```sh
+# Use a local model file
+llama-cli -m my_model.gguf
+
+# Or download and run a model directly from Hugging Face
+llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
+
+# Launch OpenAI-compatible API server
+llama-server -hf ggml-org/gemma-3-1b-it-GGUF
+```
+
 ## Description

 The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
@@ -230,6 +254,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo

+
 ## Supported backends

 | Backend | Target devices |
@@ -246,16 +271,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
 | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |

-## Building the project
-
-The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).
-The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server. Possible methods for obtaining the binaries:
-
-- Clone this repository and build locally, see [how to build](docs/build.md)
-- On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](docs/install.md)
-- Use a Docker image, see [documentation for Docker](docs/docker.md)
-- Download pre-built binaries from [releases](https://github.com/ggml-org/llama.cpp/releases)
-
 ## Obtaining and quantizing models

 The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](https://huggingface.co/models?library=gguf&sort=trending) compatible with `llama.cpp`:
@@ -263,7 +278,11 @@ The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](htt
 - [Trending](https://huggingface.co/models?library=gguf&sort=trending)
 - [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)

-You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`.
+You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`. For example:
+
+```sh
+llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
+```

 By default, the CLI downloads from Hugging Face; you can switch to other hosts with the environment variable `MODEL_ENDPOINT`. For example, you may opt to download model checkpoints from ModelScope or other model-sharing communities by setting it, e.g. `MODEL_ENDPOINT=https://www.modelscope.cn/`.

diff --git a/docs/build.md b/docs/build.md
index 32717a793ffad..680b0d8398741 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -1,5 +1,9 @@
 # Build llama.cpp locally

+The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).
+
+The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server.
+
 **To get the Code:**

 ```bash
diff --git a/docs/install.md b/docs/install.md
index 4971c18281cc9..7200bf9b7b91d 100644
--- a/docs/install.md
+++ b/docs/install.md
@@ -1,28 +1,42 @@
 # Install pre-built version of llama.cpp

-## Homebrew
+| Install via | Windows | Mac | Linux |
+|-------------|---------|-----|-------|
+| Winget      | ✅      |     |       |
+| Homebrew    |         | ✅  | ✅    |
+| MacPorts    |         | ✅  |       |
+| Nix         |         | ✅  | ✅    |

-On Mac and Linux, the homebrew package manager can be used via
+## Winget (Windows)
+
+```sh
+winget install llama.cpp
+```
+
+The package is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/issues/8188
+
+## Homebrew (Mac and Linux)

 ```sh
 brew install llama.cpp
 ```
+
 The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/discussions/7668

-## MacPorts
+## MacPorts (Mac)

 ```sh
 sudo port install llama.cpp
 ```
-see also: https://ports.macports.org/port/llama.cpp/details/

-## Nix
+See also: https://ports.macports.org/port/llama.cpp/details/

-On Mac and Linux, the Nix package manager can be used via
+## Nix (Mac and Linux)

 ```sh
 nix profile install nixpkgs#llama-cpp
 ```
+
 For flake enabled installs.

 Or
@@ -34,13 +48,3 @@
 nix-env --file '<nixpkgs>' --install --attr llama-cpp
 ```

 For non-flake enabled installs.

 This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164).
-
-## Flox
-
-On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via
-
-```sh
-flox install llama-cpp
-```
-
-Flox follows the nixpkgs build of llama.cpp.
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index d8de7531b0e5f..08facb6d03d5e 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -8132,8 +8132,8 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
         #define WKV_VECTOR_SIZE 4
     #endif

-    int wkv_vector_size;
     #ifdef WKV_VECTOR_SIZE
+        int wkv_vector_size;
         #if defined(__ARM_FEATURE_SVE)
             wkv_vector_size = svcntw();
         #else
@@ -8348,8 +8348,8 @@ static void ggml_compute_forward_gla_f32(
         #define GLA_VECTOR_SIZE 4
     #endif

-    int gla_vector_size;
     #ifdef GLA_VECTOR_SIZE
+        int gla_vector_size;
         #if defined(__ARM_FEATURE_SVE)
             gla_vector_size = svcntw();
         #else
diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
index 925f39e890db9..e230f6d494d77 100644
--- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -652,9 +652,12 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
         float KQ_max_scale[cols_per_thread];
#pragma unroll
         for (int col = 0; col < cols_per_thread; ++col) {
-            KQ_max_scale[col] = expf(KQ_max[col] - KQ_max_new[col]);
+            const float KQ_max_diff = KQ_max[col] - KQ_max_new[col];
+            KQ_max_scale[col] = expf(KQ_max_diff);
             KQ_max[col] = KQ_max_new[col];

+            *((uint32_t *) &KQ_max_scale[col]) *= KQ_max_diff >= SOFTMAX_FTZ_THRESHOLD;
+
             // Scale previous KQ_rowsum to account for a potential increase in KQ_max:
             KQ_rowsum[col] = KQ_max_scale[col]*KQ_rowsum[col] + KQ_rowsum_add[col];
         }
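The bit-level multiply in the hunk above is easy to misread, so here is a standalone host-side sketch of the same flush-to-zero idea (illustrative only; the threshold value is an assumption, the kernel takes it from `SOFTMAX_FTZ_THRESHOLD` in the CUDA headers). Multiplying the float's raw bits by the 0/1 comparison result either clears the scale to `+0.0f` or leaves it bit-for-bit unchanged, without a branch:

```cpp
#include <cmath>
#include <cstdint>
#include <cstring>

// Assumed value for illustration; the real constant lives in the CUDA flash-attention headers.
constexpr float FTZ_THRESHOLD = -20.0f;

float softmax_scale_ftz(float max_diff) {
    float scale = std::exp(max_diff);
    uint32_t bits;
    std::memcpy(&bits, &scale, sizeof(bits)); // well-defined type pun (the kernel uses a pointer cast)
    bits *= max_diff >= FTZ_THRESHOLD;        // *1 keeps the bits, *0 yields 0x00000000 == +0.0f
    std::memcpy(&scale, &bits, sizeof(bits));
    return scale;                             // tiny exp() results are flushed to exactly zero
}
```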
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 41d20aa5d8847..a4026f88fe906 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -396,6 +396,7 @@ struct vk_device_struct {
     vk_pipeline pipeline_count_equal_i32;
     vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16;
     vk_pipeline pipeline_timestep_embedding_f32;
+    vk_pipeline pipeline_conv_transpose_1d_f32;
     vk_pipeline pipeline_pool2d_f32;
     vk_pipeline pipeline_rwkv_wkv6_f32;
     vk_pipeline pipeline_rwkv_wkv7_f32;
@@ -444,7 +445,7 @@ struct vk_device_struct {
     // for GGML_VK_PERF_LOGGER
     std::unique_ptr<vk_perf_logger> perf_logger;
     vk::QueryPool query_pool;
-    uint32_t num_queries;
+    int32_t num_queries;

     ~vk_device_struct() {
         VK_LOG_DEBUG("destroy device " << name);
@@ -706,6 +707,21 @@ struct vk_op_timestep_embedding_push_constants {
     uint32_t max_period;
 };

+struct vk_op_conv_transpose_1d_push_constants {
+    uint32_t Cout;
+    uint32_t Cin;
+    uint32_t K;
+    uint32_t L;
+    uint32_t KL;
+
+    uint32_t nb01;
+    uint32_t nb02;
+    uint32_t nb11;
+    uint32_t nb1;
+
+    int32_t s0;
+};
+
 struct vk_op_pool2d_push_constants {
     uint32_t IW; uint32_t IH;
     uint32_t OW; uint32_t OH;
@@ -2726,6 +2742,8 @@ static void ggml_vk_load_shaders(vk_device& device) {

     ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);

+    ggml_vk_create_pipeline(device, device->pipeline_conv_transpose_1d_f32, "conv_transpose_1d_f32", conv_transpose_1d_f32_len, conv_transpose_1d_f32_data, "main", 3, sizeof(vk_op_conv_transpose_1d_push_constants), {1, 1, 1}, {}, 1);
+
     ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1);

     ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv6_f32, "rwkv_wkv6_f32", rwkv_wkv6_f32_len, rwkv_wkv6_f32_data, "main", 7, sizeof(vk_op_rwkv_wkv6_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
@@ -6392,6 +6410,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
             return ctx->device->pipeline_timestep_embedding_f32;
         }
         return nullptr;
+    case GGML_OP_CONV_TRANSPOSE_1D:
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_conv_transpose_1d_f32;
+        }
+        return nullptr;
     case GGML_OP_POOL_2D:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
             return ctx->device->pipeline_pool2d_f32;
         }
         return nullptr;
@@ -6726,6 +6749,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
             uint32_t half_ceil = (dim + 1) / 2;
             elements = { half_ceil, (uint32_t)src0->ne[0], 1 };
         } break;
+    case GGML_OP_CONV_TRANSPOSE_1D:
+        {
+            elements = {uint32_t(src0->ne[1]), 1, 1}; // parallelize in {Cout, 1, 1}
+        } break;
     case GGML_OP_POOL_2D:
         {
             const uint32_t N = dst->ne[3];
@@ -7529,6 +7556,37 @@ static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context
     }, dryrun);
 }

+static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+    // src0: (K, Cout, Cin, 1) -- kernel
+    // src1: (L, Cin, 1, 1) -- input
+    // dst: (*, Cout, 1, 1)
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT(nb00 == sizeof(float));
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    const int32_t s0 = dst->op_params[0];
+
+    vk_op_conv_transpose_1d_push_constants p{};
+    p.Cout = static_cast<uint32_t>(ne01);
+    p.Cin = static_cast<uint32_t>(ne02);
+    p.K = static_cast<uint32_t>(ne00);
+    p.L = static_cast<uint32_t>(ne10);
+    p.KL = static_cast<uint32_t>(ne0);
+    p.nb01 = static_cast<uint32_t>(nb01 / nb00);
+    p.nb02 = static_cast<uint32_t>(nb02 / nb00);
+    p.nb11 = static_cast<uint32_t>(nb11 / nb10);
+    p.nb1 = static_cast<uint32_t>(nb1 / nb0);
+    p.s0 = static_cast<int32_t>(s0);
+
+    ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p), dryrun);
+}
+
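For orientation (a sketch, not code from this patch): the `ne0` value forwarded as `p.KL` is the transposed-convolution output length, and the `nb` push constants divide byte strides by the element size because the shader indexes float elements. For the `p0 = 0`, `d0 = 1` case this path handles, the output length reduces to a simple function of the input length, kernel width, and stride:

```cpp
#include <cstdint>

// Output length of conv_transpose_1d with padding 0 and dilation 1:
// each of the L input positions scatters a K-wide kernel window, stepping by s0.
int64_t conv_transpose_1d_out_len(int64_t L, int64_t K, int64_t s0) {
    return (L - 1) * s0 + K; // e.g. L = 13, K = 3, s0 = 2 -> 27
}
```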
 static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
     uint32_t op = static_cast<uint32_t>(dst->op_params[0]);
     const int32_t k1 = dst->op_params[1];
@@ -8600,6 +8658,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_COUNT_EQUAL:
     case GGML_OP_IM2COL:
     case GGML_OP_TIMESTEP_EMBEDDING:
+    case GGML_OP_CONV_TRANSPOSE_1D:
     case GGML_OP_POOL_2D:
     case GGML_OP_CONV_2D_DW:
     case GGML_OP_RWKV_WKV6:
@@ -8664,6 +8723,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_COUNT_EQUAL:
     case GGML_OP_IM2COL:
     case GGML_OP_TIMESTEP_EMBEDDING:
+    case GGML_OP_CONV_TRANSPOSE_1D:
     case GGML_OP_POOL_2D:
     case GGML_OP_CONV_2D_DW:
     case GGML_OP_LEAKY_RELU:
@@ -8835,6 +8895,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_TIMESTEP_EMBEDDING:
         ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node, dryrun);

+        break;
+    case GGML_OP_CONV_TRANSPOSE_1D:
+        ggml_vk_conv_transpose_1d(ctx, compute_ctx, src0, src1, node, dryrun);
+
         break;
     case GGML_OP_POOL_2D:
         ggml_vk_pool_2d(ctx, compute_ctx, src0, node, dryrun);
@@ -8963,6 +9027,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
     case GGML_OP_COUNT_EQUAL:
     case GGML_OP_IM2COL:
     case GGML_OP_TIMESTEP_EMBEDDING:
+    case GGML_OP_CONV_TRANSPOSE_1D:
     case GGML_OP_POOL_2D:
     case GGML_OP_CONV_2D_DW:
     case GGML_OP_RWKV_WKV6:
@@ -9513,8 +9578,8 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
         if (ctx->device->query_pool) {
             ctx->device->device.destroyQueryPool(ctx->device->query_pool);
         }
-        VkQueryPoolCreateInfo query_create_info = { VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO };
-        query_create_info.queryType = VK_QUERY_TYPE_TIMESTAMP;
+        vk::QueryPoolCreateInfo query_create_info;
+        query_create_info.queryType = vk::QueryType::eTimestamp;
         query_create_info.queryCount = cgraph->n_nodes + 100;
         ctx->device->query_pool = ctx->device->device.createQueryPool(query_create_info);
         ctx->device->num_queries = query_create_info.queryCount;
@@ -9600,7 +9665,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
         // Get the results and pass them to the logger
         std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
-        ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait);
+        VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results");
         for (int i = 0; i < cgraph->n_nodes; i++) {
             if (!ggml_vk_is_empty(cgraph->nodes[i])) {
                 ctx->device->perf_logger->log_timing(cgraph->nodes[i], uint64_t((timestamps[i+1] - timestamps[i]) * ctx->device->properties.limits.timestampPeriod));
@@ -10024,6 +10089,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
     case GGML_OP_LEAKY_RELU:
     case GGML_OP_OPT_STEP_ADAMW:
         return true;
+    case GGML_OP_CONV_TRANSPOSE_1D:
+        return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
     default:
         return false;
     }

@@ -10515,6 +10582,11 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
         const int32_t dim = tensor->op_params[0];
         const int32_t max_period = tensor->op_params[1];
         tensor_clone = ggml_timestep_embedding(ggml_ctx, src_clone[0], dim, max_period);
+    } else if (tensor->op == GGML_OP_CONV_TRANSPOSE_1D){
+        const int32_t s0 = tensor->op_params[0];
+        const int32_t p0 = tensor->op_params[1];
+        const int32_t d0 = tensor->op_params[2];
+        tensor_clone = ggml_conv_transpose_1d(ggml_ctx, src_clone[0], src_clone[1], s0, p0, d0);
     } else if (tensor->op == GGML_OP_POOL_2D) {
         enum ggml_op_pool op = static_cast<enum ggml_op_pool>(tensor->op_params[0]);
         const int32_t k0 = tensor->op_params[1];
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp b/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp
new file mode 100644
index 0000000000000..b17b4e83eec4b
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp
@@ -0,0 +1,98 @@
+#version 450
+
+#include "types.comp"
+
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; // src0 - kernel: [K, Cout, Cin]
+layout (binding = 1) readonly buffer B {B_TYPE data_b[];}; // src1 - input: [L, Cin]
+layout (binding = 2) writeonly buffer D {D_TYPE data_d[];}; // dst - result [KL, Cout]
+
+layout(local_size_x = 128 , local_size_y = 1, local_size_z = 1) in;
+
+layout (push_constant) uniform parameter {
+    uint32_t Cout;
+    uint32_t Cin;
+    uint32_t K;
+    uint32_t L;
+    uint32_t KL;
+
+    uint32_t nb01;
+    uint32_t nb02;
+    uint32_t nb11;
+    uint32_t nb1;
+
+    int32_t s0;
+} p;
+
+
+uint32_t Cout_idx = gl_WorkGroupID.x;
+const uint32_t bs = gl_WorkGroupSize.x;
+uint32_t tid = gl_LocalInvocationID.x;
+// Code is more straightforward if we assume it is bs*s0+K instead of (bs-1)*s0+K.
+uint32_t tmp_len = bs*p.s0+p.K;
+shared D_TYPE tmp[4096];
+
+uint splitWork(uint workSize){
+    return (bs + workSize -1) / bs;
+}
+
+void main(){
+    for(uint32_t i = 0; i < splitWork(tmp_len); i++){
+        uint32_t idx = i*bs+tid;
+        if(idx < tmp_len){
+            tmp[idx] = 0.0;
+        }
+    }
+
+    uint32_t L_blocks = splitWork(p.L);
+    for(uint32_t L_block_id = 0; L_block_id < L_blocks; L_block_id++){
+        if(L_block_id > 0){
+            barrier();
+            // Shift values in tmp to the current processing window
+            for(int i = 0; i < splitWork(tmp_len); i++){
+                uint32_t idx = i*bs+tid;
+                if(idx >= bs*p.s0 && idx < tmp_len){
+                    tmp[idx-bs*p.s0] = tmp[idx];
+                    tmp[idx] = 0.0;
+                }else if(idx >= p.K && idx < bs*p.s0){
+                    tmp[idx] = 0.0;
+                }
+            }
+        }
+        barrier();
+
+        // Save contributions of the block to tmp
+        uint32_t L_idx = L_block_id*bs + tid;
+        for(uint32_t K_idx = 0; K_idx < p.K; K_idx++){
+            D_TYPE dp = 0.0;
+            for(uint32_t Cin_idx = 0; Cin_idx < p.Cin; Cin_idx++){
+                A_TYPE elemKrn = data_a[K_idx + Cout_idx * p.nb01 + Cin_idx * p.nb02];
+                if(L_idx < p.L){
+                    B_TYPE elemInp = data_b[L_idx + Cin_idx*p.nb11];
+                    dp = fma(elemKrn, elemInp, dp);
+                }
+            }
+            tmp[tid*p.s0 + K_idx] += dp;
+            barrier();
+        }
+
+        // Save the computed values except the last block that can have different size
+        uint32_t KLb_idx = L_block_id*bs*p.s0;
+        if(L_block_id < L_blocks-1){
+            for(uint32_t s0_idx = 0; s0_idx < p.s0; s0_idx++){
+                uint32_t sh_idx = p.s0*tid+s0_idx;
+                uint32_t KL_idx = KLb_idx+sh_idx;
+                if(KL_idx < p.KL){
+                    data_d[KL_idx + Cout_idx*p.nb1] = tmp[sh_idx];
+                }
+            }
+        }
+    }
+
+    for(uint32_t i = 0; i < splitWork(tmp_len); i++){
+        uint32_t idx = i*bs+tid;
+        uint32_t KL_idx = (L_blocks-1)*bs*p.s0+idx;
+        if(KL_idx < p.KL){
+            data_d[KL_idx + Cout_idx*p.nb1] = tmp[idx];
+        }
+    }
+}
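To make the shader's sliding-window bookkeeping easier to check, here is a naive single-threaded reference of the same computation (a sketch under the shader's assumptions: `p0 = 0`, `d0 = 1`, contiguous tensors; layouts match the buffer comments above):

```cpp
#include <cstddef>
#include <vector>

// dst[cout][l*s0 + k] += sum over cin of kernel[cin][cout][k] * input[cin][l]
std::vector<float> conv_transpose_1d_ref(
        const std::vector<float> & kernel, // src0, flattened [Cin][Cout][K]
        const std::vector<float> & input,  // src1, flattened [Cin][L]
        int Cout, int Cin, int K, int L, int s0) {
    const int KL = (L - 1) * s0 + K; // matches p.KL for p0 = 0, d0 = 1
    std::vector<float> dst(static_cast<size_t>(Cout) * KL, 0.0f);
    for (int cin = 0; cin < Cin; ++cin) {
        for (int cout = 0; cout < Cout; ++cout) {
            for (int l = 0; l < L; ++l) {
                for (int k = 0; k < K; ++k) {
                    dst[static_cast<size_t>(cout) * KL + l * s0 + k] +=
                        kernel[(static_cast<size_t>(cin) * Cout + cout) * K + k] *
                        input[static_cast<size_t>(cin) * L + l];
                }
            }
        }
    }
    return dst;
}
```

The shader computes the same per-`Cout` sums, but accumulates them in a `tmp` window that slides by `bs*s0` outputs for each block of `bs` input positions.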
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
index 9361e2ac83b0f..c63345ec8b4b6 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -622,6 +622,8 @@ void process_shaders() {

     string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));

+    string_to_spv("conv_transpose_1d_f32", "conv_transpose_1d.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
+
     string_to_spv("pool2d_f32", "pool2d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));

     string_to_spv("rwkv_wkv6_f32", "wkv6.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 4ab5743879400..7c1a642c19464 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -429,22 +429,54 @@ const llama_kv_cache * llama_context::get_kv_self() const {
     return kv_self;
 }

-bool llama_context::kv_self_update() {
+void llama_context::kv_self_defrag_sched() {
+    if (!memory) {
+        return;
+    }
+
+    memory_force_optimize = true;
+}
+
+bool llama_context::kv_self_update(bool optimize) {
     if (!memory) {
         return false;
     }

     llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());

-    if (!kv_self->update(*this)) {
-        // no updates have been performed
-        return false;
+    {
+        // TODO: remove in the future
+        optimize |= memory_force_optimize;
+        memory_force_optimize = false;
+
+        const auto kv_state = kv_self->init_update(this, optimize);
+        switch (kv_state->get_status()) {
+            case LLAMA_MEMORY_STATUS_SUCCESS:
+                {
+                    // noop
+                } break;
+            case LLAMA_MEMORY_STATUS_NO_UPDATE:
+                {
+                    // no updates need to be performed
+                    return false;
+                }
+            case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
+            case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
+                {
+                    LLAMA_LOG_ERROR("%s: failed to prepare memory update\n", __func__);
+                    return false;
+                }
+        }
+
+        if (!kv_state->apply()) {
+            LLAMA_LOG_ERROR("%s: failed to apply memory update\n", __func__);
+        }
     }

     // if the KV cache did any computation, we have to reserve a new worst-case graph
     const auto kv_state = kv_self->init_full();
     if (!kv_state) {
-        throw std::runtime_error("failed to initialize KV cache");
+        throw std::runtime_error("failed to initialize memory state");
     }

     const uint32_t n_seqs = cparams.n_seq_max;
@@ -452,7 +484,7 @@ bool llama_context::kv_self_update() {

     auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, kv_state.get());
     if (!gf) {
-        LLAMA_LOG_ERROR("%s: failed to reserve graph after the KV cache update\n", __func__);
+        LLAMA_LOG_ERROR("%s: failed to reserve graph after the memory update\n", __func__);
     }

     return true;
@@ -940,13 +972,13 @@ int llama_context::decode(llama_batch & inp_batch) {
         n_outputs_all = 1;
     }

+    bool did_optimize = false;
+
     // handle any pending defrags/shifts
-    kv_self_update();
+    kv_self_update(false);

     llama_memory_state_ptr kv_state;

-    bool did_defrag = false;
-
     while (true) {
         kv_state = kv_self->init_batch(batch, cparams.n_ubatch, embd_pooled, /* logits_all */ n_outputs_all == n_tokens_all);
         if (!kv_state) {
@@ -957,25 +989,32 @@ int llama_context::decode(llama_batch & inp_batch) {
             case LLAMA_MEMORY_STATUS_SUCCESS:
                 {
                 } break;
+            case LLAMA_MEMORY_STATUS_NO_UPDATE:
+                {
+                    LLAMA_LOG_ERROR("%s: unexpected memory state status: %d\n", __func__, kv_state->get_status());
+
+                    return -2;
+                }
             case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
                 {
-                    if (!did_defrag) {
-                        did_defrag = true;
+                    if (!did_optimize) {
+                        did_optimize = true;

-                        kv_self->defrag_sched(-1.0f);
-                        if (kv_self_update()) {
-                            LLAMA_LOG_DEBUG("%s: failed to init batch of size %d, retrying after defrag\n", __func__, batch.n_tokens);
+                        if (kv_self_update(true)) {
+                            LLAMA_LOG_DEBUG("%s: retrying batch size %d after cache optimization\n", __func__, batch.n_tokens);

                             continue;
                         }
                     }

-                    LLAMA_LOG_WARN("%s: failed to find KV cache slot for batch of size %d\n", __func__, batch.n_tokens);
+                    LLAMA_LOG_WARN("%s: failed to find a memory slot for batch of size %d\n", __func__, batch.n_tokens);

                     return 1;
                 }
             case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
                 {
+                    LLAMA_LOG_ERROR("%s: compute failed while preparing batch of size %d\n", __func__, batch.n_tokens);
+
                     return -2;
                 }
         }
@@ -1189,11 +1228,6 @@ int llama_context::decode(llama_batch & inp_batch) {
     // wait for the computation to finish (automatically done when obtaining the model output)
     //synchronize();

-    // decide if we need to defrag the kv cache
-    if (cparams.defrag_thold > 0.0f) {
-        kv_self->defrag_sched(cparams.defrag_thold);
-    }
-
     // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
     // overlap with device computation.
     ggml_backend_sched_reset(sched.get());
@@ -2283,7 +2317,7 @@ llama_kv_cache * llama_get_kv_self(llama_context * ctx) {

 // deprecated
 void llama_kv_self_update(llama_context * ctx) {
-    ctx->kv_self_update();
+    ctx->kv_self_update(false);
 }

 enum llama_pooling_type llama_pooling_type(const llama_context * ctx) {
@@ -2538,13 +2572,8 @@ llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {

 // deprecated
 void llama_kv_self_defrag(llama_context * ctx) {
-    auto * kv = ctx->get_kv_self();
-    if (!kv) {
-        return;
-    }
-
     // force defrag
-    kv->defrag_sched(-1.0f);
+    ctx->kv_self_defrag_sched();
 }

 bool llama_kv_self_can_shift(const llama_context * ctx) {
diff --git a/src/llama-context.h b/src/llama-context.h
index 3b880286bfd5d..c1c7efb31fe32 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -52,7 +52,8 @@ struct llama_context {

     // return true if the KV cache was updated
     // TODO: remove
-    bool kv_self_update();
+    bool kv_self_update(bool optimize);
+    void kv_self_defrag_sched();

     enum llama_pooling_type pooling_type() const;

@@ -231,6 +232,9 @@ struct llama_context {

     std::unique_ptr<llama_memory_i> memory;

+    // TODO: temporary, until the llama_kv_self_defrag() API is removed
+    bool memory_force_optimize = false;

     // decode output (2-dimensional array: [n_outputs][n_vocab])
     size_t  logits_size = 0; // capacity (of floats) for logits
     float * logits      = nullptr;
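The net effect of the deprecated entry points above is now deferral rather than immediate work; a minimal caller-side sketch (hypothetical `ctx` and `batch` variables, behavior as shown in the code above):

```cpp
// llama_kv_self_defrag() no longer touches the cells directly - it only sets
// memory_force_optimize via kv_self_defrag_sched().
llama_kv_self_defrag(ctx);

// The next decode calls kv_self_update(false); the forced flag is OR-ed in,
// init_update() prepares a shift/defrag state, and apply() runs it before
// the batch is placed in the cache.
llama_decode(ctx, batch);
```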
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 727e119e334f6..c4bdd66039277 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -769,9 +769,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);

         if (weight_before_ffn) {
-            // TODO: this is a workaround as we don't yet have a repeat op that takes custom dim (ggml_repeat_4d)
-            ggml_tensor * repeated = ggml_new_tensor_3d(ctx0, cur->type, n_embd, n_expert_used, n_tokens);
-            repeated = ggml_repeat(ctx0, cur, repeated); // [n_embd, n_expert_used, n_tokens]
+            // repeat cur to [n_embd, n_expert_used, n_tokens]
+            ggml_tensor * repeated = ggml_repeat_4d(ctx0, cur, n_embd, n_expert_used, n_tokens, 1);
             cur = ggml_mul(ctx0, repeated, weights);
             cb(cur, "ffn_moe_weighted", il);
         }
diff --git a/src/llama-kv-cache-recurrent.cpp b/src/llama-kv-cache-recurrent.cpp
index 641eab2f316ce..77bd57065ac3d 100644
--- a/src/llama-kv-cache-recurrent.cpp
+++ b/src/llama-kv-cache-recurrent.cpp
@@ -1,6 +1,7 @@
 #include "llama-kv-cache-recurrent.h"

 #include "llama-impl.h"
+#include "llama-io.h"
 #include "llama-batch.h"
 #include "llama-model.h"

@@ -386,6 +387,13 @@ llama_memory_state_ptr llama_kv_cache_recurrent::init_full() {
     return std::make_unique<llama_kv_cache_recurrent_state>(LLAMA_MEMORY_STATUS_SUCCESS, this);
 }

+llama_memory_state_ptr llama_kv_cache_recurrent::init_update(llama_context * lctx, bool optimize) {
+    GGML_UNUSED(lctx);
+    GGML_UNUSED(optimize);
+
+    return std::make_unique<llama_kv_cache_recurrent_state>(LLAMA_MEMORY_STATUS_NO_UPDATE);
+}
+
 bool llama_kv_cache_recurrent::prepare(const std::vector<llama_ubatch> & ubatches) {
     // simply remember the full state because it is very small for this type of cache
     // TODO: optimize
@@ -419,17 +427,6 @@ bool llama_kv_cache_recurrent::prepare(const std::vector<llama_ubatch> & ubatche
     return success;
 }

-bool llama_kv_cache_recurrent::update(llama_context & lctx) {
-    GGML_UNUSED(lctx);
-    // noop
-    return false;
-}
-
-void llama_kv_cache_recurrent::defrag_sched(float thold) {
-    GGML_UNUSED(thold);
-    // noop
-}
-
 bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
     const uint32_t n_tokens = ubatch.n_tokens;
     const uint32_t n_seqs   = ubatch.n_seqs;
diff --git a/src/llama-kv-cache-recurrent.h b/src/llama-kv-cache-recurrent.h
index a178ae85c146a..b32f258fbffbf 100644
--- a/src/llama-kv-cache-recurrent.h
+++ b/src/llama-kv-cache-recurrent.h
@@ -52,9 +52,7 @@ class llama_kv_cache_recurrent : public llama_kv_cache {

     llama_memory_state_ptr init_full() override;

-    bool update(llama_context & lctx) override;
-
-    void defrag_sched(float thold) override;
+    llama_memory_state_ptr init_update(llama_context * lctx, bool optimize) override;

     bool prepare(const std::vector<llama_ubatch> & ubatches);

diff --git a/src/llama-kv-cache-unified-iswa.cpp b/src/llama-kv-cache-unified-iswa.cpp
index 0eb0456343546..3aa606c849843 100644
--- a/src/llama-kv-cache-unified-iswa.cpp
+++ b/src/llama-kv-cache-unified-iswa.cpp
@@ -123,26 +123,16 @@ llama_memory_state_ptr llama_kv_cache_unified_iswa::init_batch(const llama_batch

     assert(heads_base.size() == heads_swa.size());

-    return std::make_unique<llama_kv_cache_unified_iswa_state>(LLAMA_MEMORY_STATUS_SUCCESS,
+    return std::make_unique<llama_kv_cache_unified_iswa_state>(
             this, std::move(sbatch), std::move(heads_base), std::move(heads_swa), std::move(ubatches));
 }

 llama_memory_state_ptr llama_kv_cache_unified_iswa::init_full() {
-    return std::make_unique<llama_kv_cache_unified_iswa_state>(LLAMA_MEMORY_STATUS_SUCCESS, this);
+    return std::make_unique<llama_kv_cache_unified_iswa_state>(this);
 }

-bool llama_kv_cache_unified_iswa::update(llama_context & lctx) {
-    bool res = false;
-
-    res = res | kv_base->update(lctx);
-    res = res | kv_swa ->update(lctx);
-
-    return res;
-}
-
-void llama_kv_cache_unified_iswa::defrag_sched(float thold) {
-    kv_base->defrag_sched(thold);
-    kv_swa ->defrag_sched(thold);
+llama_memory_state_ptr llama_kv_cache_unified_iswa::init_update(llama_context * lctx, bool optimize) {
+    return std::make_unique<llama_kv_cache_unified_iswa_state>(this, lctx, optimize);
 }

 bool llama_kv_cache_unified_iswa::get_can_shift() const {
@@ -174,26 +164,38 @@ llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_swa() const {

 llama_kv_cache_unified_iswa_state::llama_kv_cache_unified_iswa_state(llama_memory_status status) : status(status) {}

 llama_kv_cache_unified_iswa_state::llama_kv_cache_unified_iswa_state(
-        llama_memory_status status,
-        llama_kv_cache_unified_iswa * kv) : status(status) {
-    state_base.reset(new llama_kv_cache_unified_state(status, kv->get_base()));
-    state_swa .reset(new llama_kv_cache_unified_state(status, kv->get_swa ()));
+        llama_kv_cache_unified_iswa * kv) : status(LLAMA_MEMORY_STATUS_SUCCESS) {
+    state_base = kv->get_base()->init_full();
+    state_swa  = kv->get_swa ()->init_full();
+
+    status = llama_memory_status_combine(state_base->get_status(), state_swa->get_status());
+}
+
+llama_kv_cache_unified_iswa_state::llama_kv_cache_unified_iswa_state(
+        llama_kv_cache_unified_iswa * kv,
+        llama_context * lctx,
+        bool optimize) : status(LLAMA_MEMORY_STATUS_SUCCESS) {
+    state_base = kv->get_base()->init_update(lctx, optimize);
+    state_swa  = kv->get_swa ()->init_update(lctx, optimize);
+
+    status = llama_memory_status_combine(state_base->get_status(), state_swa->get_status());
 }

 llama_kv_cache_unified_iswa_state::llama_kv_cache_unified_iswa_state(
-        llama_memory_status status,
         llama_kv_cache_unified_iswa * kv,
         llama_sbatch sbatch,
         std::vector<uint32_t> heads_base,
         std::vector<uint32_t> heads_swa,
         std::vector<llama_ubatch> ubatches)
-    : status(status),
-    sbatch(std::move(sbatch)),
-    ubatches(std::move(ubatches)) {
-    // note: here we copy the ubatches. not sure if this is ideal
-    state_base.reset(new llama_kv_cache_unified_state(status, kv->get_base(), {}, std::move(heads_base), this->ubatches));
-    state_swa .reset(new llama_kv_cache_unified_state(status, kv->get_swa (), {}, std::move(heads_swa), this->ubatches));
-    }
+    : status(LLAMA_MEMORY_STATUS_SUCCESS),
+    sbatch(std::move(sbatch)),
+    ubatches(std::move(ubatches)) {
+    // note: here we copy the ubatches. not sure if this is ideal
+    state_base.reset(new llama_kv_cache_unified_state(kv->get_base(), {}, std::move(heads_base), this->ubatches));
+    state_swa .reset(new llama_kv_cache_unified_state(kv->get_swa (), {}, std::move(heads_swa), this->ubatches));
+
+    status = llama_memory_status_combine(state_base->get_status(), state_swa->get_status());
+}

 llama_kv_cache_unified_iswa_state:: ~llama_kv_cache_unified_iswa_state() = default;

@@ -233,17 +235,18 @@ llama_memory_status llama_kv_cache_unified_iswa_state::get_status() const {

 const llama_ubatch & llama_kv_cache_unified_iswa_state::get_ubatch() const {
     assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
     return ubatches[i_next];
 }

 const llama_kv_cache_unified_state * llama_kv_cache_unified_iswa_state::get_base() const {
     assert(status == LLAMA_MEMORY_STATUS_SUCCESS);

-    return state_base.get();
+    return static_cast<const llama_kv_cache_unified_state *>(state_base.get());
 }

 const llama_kv_cache_unified_state * llama_kv_cache_unified_iswa_state::get_swa() const {
     assert(status == LLAMA_MEMORY_STATUS_SUCCESS);

-    return state_swa.get();
+    return static_cast<const llama_kv_cache_unified_state *>(state_swa.get());
 }
diff --git a/src/llama-kv-cache-unified-iswa.h b/src/llama-kv-cache-unified-iswa.h
index 8b067da038af6..cba5bbe95197e 100644
--- a/src/llama-kv-cache-unified-iswa.h
+++ b/src/llama-kv-cache-unified-iswa.h
@@ -54,9 +54,7 @@ class llama_kv_cache_unified_iswa : public llama_kv_cache {

     llama_memory_state_ptr init_full() override;

-    bool update(llama_context & lctx) override;
-
-    void defrag_sched(float thold) override;
+    llama_memory_state_ptr init_update(llama_context * lctx, bool optimize) override;

     bool get_can_shift() const override;

@@ -86,12 +84,16 @@ class llama_kv_cache_unified_iswa_state : public llama_memory_state_i {

     // used to create a full-cache state
     llama_kv_cache_unified_iswa_state(
-            llama_memory_status status,
             llama_kv_cache_unified_iswa * kv);

+    // used to create an update state
+    llama_kv_cache_unified_iswa_state(
+            llama_kv_cache_unified_iswa * kv,
+            llama_context * lctx,
+            bool optimize);
+
     // used to create a state from a batch
     llama_kv_cache_unified_iswa_state(
-            llama_memory_status status,
             llama_kv_cache_unified_iswa * kv,
             llama_sbatch sbatch,
             std::vector<uint32_t> heads_base,
@@ -120,7 +122,7 @@ class llama_kv_cache_unified_iswa_state : public llama_memory_state_i {
     const llama_kv_cache_unified_state * get_swa() const;

 private:
-    const llama_memory_status status;
+    llama_memory_status status;

     //llama_kv_cache_unified_iswa * kv;

@@ -131,6 +133,6 @@ class llama_kv_cache_unified_iswa_state : public llama_memory_state_i {

     std::vector<llama_ubatch> ubatches;

-    std::unique_ptr<llama_kv_cache_unified_state> state_base;
-    std::unique_ptr<llama_kv_cache_unified_state> state_swa;
+    llama_memory_state_ptr state_base;
+    llama_memory_state_ptr state_swa;
 };
diff --git a/src/llama-kv-cache-unified.cpp b/src/llama-kv-cache-unified.cpp
index a817154769a32..5354f808cd959 100644
--- a/src/llama-kv-cache-unified.cpp
+++ b/src/llama-kv-cache-unified.cpp
@@ -1,6 +1,7 @@
 #include "llama-kv-cache-unified.h"

 #include "llama-impl.h"
+#include "llama-io.h"
 #include "llama-model.h"
 #include "llama-context.h"

@@ -149,12 +150,27 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
         p1 = std::numeric_limits<llama_pos>::max();
     }

-    for (uint32_t i = 0; i < cells.size(); ++i) {
-        if (!cells.pos_in(i, p0, p1)) {
-            continue;
+    if (seq_id >= 0) {
+        for (uint32_t i = 0; i < cells.size(); ++i) {
+            if (!cells.pos_in(i, p0, p1)) {
+                continue;
+            }
+
+            if (cells.seq_has(i, seq_id) && cells.seq_rm(i, seq_id)) {
+                if (new_head == cells.size()) {
+                    new_head = i;
+                }
+            }
         }
+    } else {
+        // match any sequence
+        for (uint32_t i = 0; i < cells.size(); ++i) {
+            if (!cells.pos_in(i, p0, p1)) {
+                continue;
+            }
+
+            cells.rm(i);

-        if (cells.seq_has(i, seq_id) && cells.seq_rm(i, seq_id)) {
             if (new_head == cells.size()) {
                 new_head = i;
             }
@@ -305,16 +321,49 @@ llama_memory_state_ptr llama_kv_cache_unified::init_batch(
         return std::make_unique<llama_kv_cache_unified_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
     }

-    return std::make_unique<llama_kv_cache_unified_state>(LLAMA_MEMORY_STATUS_SUCCESS,
+    return std::make_unique<llama_kv_cache_unified_state>(
             this, std::move(sbatch), std::move(heads), std::move(ubatches));
 }

 llama_memory_state_ptr llama_kv_cache_unified::init_full() {
-    return std::make_unique<llama_kv_cache_unified_state>(LLAMA_MEMORY_STATUS_SUCCESS, this);
+    return std::make_unique<llama_kv_cache_unified_state>(this);
 }

-std::vector<uint32_t> llama_kv_cache_unified::prepare(const std::vector<llama_ubatch> & ubatches) {
-    std::vector<uint32_t> res;
+llama_memory_state_ptr llama_kv_cache_unified::init_update(llama_context * lctx, bool optimize) {
+    bool do_shift = get_has_shift();
+
+    defrag_info dinfo;
+
+    // see if we need to defrag
+    {
+        bool do_defrag = optimize;
+
+        const auto thold = lctx->get_cparams().defrag_thold;
+
+        if (!do_defrag && thold > 0.0f) {
+            const auto n_kv = cells.used_max_p1();
+
+            // - do not defrag small contexts (i.e. < 2048 tokens)
+            // - count the padding towards the number of used tokens
+            const float fragmentation = n_kv >= 2048 ? std::max(0.0f, 1.0f - (float(cells.get_used() + n_pad)/n_kv)) : 0.0f;
+
+            if (fragmentation > thold) {
+                LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation);
+
+                do_defrag = true;
+            }
+        }
+
+        if (do_defrag) {
+            dinfo = defrag_prepare(lctx->graph_max_nodes());
+        }
+    }
+
+    return std::make_unique<llama_kv_cache_unified_state>(this, lctx, do_shift, std::move(dinfo));
+}
+
+llama_kv_cache_unified::ubatch_heads llama_kv_cache_unified::prepare(const std::vector<llama_ubatch> & ubatches) {
+    llama_kv_cache_unified::ubatch_heads res;

     struct state {
         uint32_t head_old; // old position of the head, before placing the ubatch
@@ -359,12 +408,12 @@ std::vector<uint32_t> llama_kv_cache_unified::prepare(const std::vector<llama_ub
     return res;
 }

-bool llama_kv_cache_unified::update(llama_context & lctx) {
+bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const defrag_info & dinfo) {
     bool updated = false;

-    auto * sched = lctx.get_sched();
+    auto * sched = lctx->get_sched();

-    if (cells.get_has_shift()) {
+    if (do_shift) {
         if (!get_can_shift()) {
             GGML_ABORT("The current KV cache / model configuration does not support K-shift");
         }
@@ -375,9 +424,9 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
         if (hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
             ggml_backend_sched_reset(sched);

-            auto * gf = lctx.graph_init();
+            auto * gf = lctx->graph_init();

-            auto res = build_graph_shift(lctx.get_cparams(), lctx.get_ctx_compute(), gf);
+            auto res = build_graph_shift(lctx->get_cparams(), lctx->get_ctx_compute(), gf);
             if (!res) {
                 LLAMA_LOG_ERROR("%s: failed to build graph for K-shift\n", __func__);
                 return updated;
@@ -390,7 +439,7 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {

             res->set_inputs(nullptr);

-            if (lctx.graph_compute(gf, false) != GGML_STATUS_SUCCESS) {
+            if (lctx->graph_compute(gf, false) != GGML_STATUS_SUCCESS) {
                 LLAMA_LOG_ERROR("%s: failed to compute K-shift\n", __func__);
                 return updated;
             }
@@ -401,54 +450,53 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
         cells.reset_shift();
     }

-    if (do_defrag) {
+    if (!dinfo.empty()) {
         LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);

-        if (defrag_prepare(lctx.graph_max_nodes())) {
-            ggml_backend_sched_reset(sched);
-
-            auto * gf = lctx.graph_init();
+        // apply moves:
+        {
+            const auto n_kv = dinfo.ids.size();

-            auto res = build_graph_defrag(lctx.get_cparams(), lctx.get_ctx_compute(), gf);
-            if (!res) {
-                LLAMA_LOG_ERROR("%s: failed to build graph for defrag\n", __func__);
-                return updated;
-            }
-
-            if (!ggml_backend_sched_alloc_graph(sched, gf)) {
-                LLAMA_LOG_ERROR("%s: failed to allocate compute graph for defrag\n", __func__);
-                return updated;
-            }
+            for (uint32_t i = 0; i < n_kv; ++i) {
+                assert(dinfo.ids[i] <= n_kv);

-            res->set_inputs(nullptr);
+                if (dinfo.ids[i] == n_kv) {
+                    continue;
+                }

-            if (lctx.graph_compute(gf, false) != GGML_STATUS_SUCCESS) {
-                LLAMA_LOG_ERROR("%s: failed to compute defrag\n", __func__);
-                return updated;
+                cells.mv(i, dinfo.ids[i]);
             }

-            updated = true;
+            // reset the head so we can find the first free slot during the next ubatch
+            head = 0;
         }

-        do_defrag = false;
-    }
+        ggml_backend_sched_reset(sched);

-    return updated;
-}
+        auto * gf = lctx->graph_init();

-void llama_kv_cache_unified::defrag_sched(float thold) {
-    const auto n_kv = cells.used_max_p1();
+        auto res = build_graph_defrag(lctx->get_cparams(), lctx->get_ctx_compute(), gf, dinfo);
+        if (!res) {
+            LLAMA_LOG_ERROR("%s: failed to build graph for defrag\n", __func__);
+            return updated;
+        }

-    // - do not defrag small contexts (i.e. < 2048 tokens)
-    // - count the padding towards the number of used tokens
-    const float fragmentation = n_kv >= 2048 ? std::max(0.0f, 1.0f - (float(cells.get_used() + n_pad)/n_kv)) : 0.0f;
+        if (!ggml_backend_sched_alloc_graph(sched, gf)) {
+            LLAMA_LOG_ERROR("%s: failed to allocate compute graph for defrag\n", __func__);
+            return updated;
+        }

-    // queue defragmentation for next llama_kv_cache_update
-    if (fragmentation > thold) {
-        LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation);
+        res->set_inputs(nullptr);

-        do_defrag = true;
+        if (lctx->graph_compute(gf, false) != GGML_STATUS_SUCCESS) {
+            LLAMA_LOG_ERROR("%s: failed to compute defrag\n", __func__);
+            return updated;
+        }
+
+        updated = true;
     }
+
+    return updated;
 }

 int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
@@ -597,6 +645,10 @@ uint32_t llama_kv_cache_unified::get_size() const {
     return cells.size();
 }

+bool llama_kv_cache_unified::get_has_shift() const {
+    return cells.get_has_shift();
+}
+
 uint32_t llama_kv_cache_unified::get_n_kv() const {
     return std::min(cells.size(), std::max(n_pad, GGML_PAD(cells.used_max_p1(), n_pad)));
 }
@@ -926,12 +978,13 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
 }

 llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
-        const llama_cparams & cparams,
-               ggml_context * ctx,
-                ggml_cgraph * gf) const {
+        const llama_cparams & cparams,
+               ggml_context * ctx,
+                ggml_cgraph * gf,
+          const defrag_info & dinfo) const {
     auto res = std::make_unique<llm_graph_result>();

-    const auto & ids = defrag_info.ids;
+    const auto & ids = dinfo.ids;

#if 0
     // CPU defrag
@@ -1072,7 +1125,7 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
     return res;
 }

-bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
+llama_kv_cache_unified::defrag_info llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) const {
     const uint32_t n_layer = layers.size();

     const uint32_t n_kv   = cells.used_max_p1();
@@ -1093,14 +1146,9 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
     const uint32_t max_moves = (n_max_nodes - 2*n_layer)/(6*n_layer);

     // determine which KV cells to move where
-    //
-    //   cell i moves to ids[i]
-    //
-    //   if ids[i] == i || ids[i] == n_kv, then cell i is not moved
-    //
-    auto & ids = defrag_info.ids;
+    defrag_info res;
+    auto & ids = res.ids;

-    ids.clear();
     ids.resize(n_kv, n_kv);

     for (uint32_t i0 = 0; i0 < n_used; ++i0) {
@@ -1164,11 +1212,6 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
             // this cell goes to (i0 + nf)
             ids[i1] = i0 + nf;

-            // move the cell meta data
-            cells.mv(i1, i0 + nf);
-
-            head = n_used;
-
             if (!cont) {
                 n_moves++;
                 cont = true;
@@ -1191,14 +1234,14 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
     }

     if (n_moves == 0) {
-        return false;
+        return {};
     }

     LLAMA_LOG_DEBUG("%s: (tmp log) KV defrag cell moves: %u\n", __func__, n_moves);

     LLAMA_LOG_DEBUG("%s: expected gf nodes: %u\n", __func__, 6*n_moves*n_layer);

-    return true;
+    return res;
 }

 bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const {
@@ -1621,24 +1664,27 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell

 llama_kv_cache_unified_state::llama_kv_cache_unified_state(llama_memory_status status) : status(status) {}

 llama_kv_cache_unified_state::llama_kv_cache_unified_state(
-        llama_memory_status status,
-        llama_kv_cache_unified * kv) : status(status), kv(kv) {
-    n_kv = kv->get_size();
-    head = 0;
-    }
+        llama_kv_cache_unified * kv) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv) {
+    n_kv = kv->get_size();
+    head = 0;
+}
 llama_kv_cache_unified_state::llama_kv_cache_unified_state(
-        llama_memory_status status,
-        llama_kv_cache_unified * kv,
-        llama_sbatch sbatch,
-        std::vector<uint32_t> heads,
-        std::vector<llama_ubatch> ubatches)
-    : status(status),
-    kv(kv),
-    sbatch(std::move(sbatch)),
-    heads(std::move(heads)),
-    ubatches(std::move(ubatches)) {
+        llama_kv_cache_unified * kv,
+        llama_context * lctx,
+        bool do_shift,
+        defrag_info dinfo) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), lctx(lctx), do_shift(do_shift), dinfo(std::move(dinfo)) {
+    if (!do_shift && dinfo.empty()) {
+        status = LLAMA_MEMORY_STATUS_NO_UPDATE;
     }
+}
+
+llama_kv_cache_unified_state::llama_kv_cache_unified_state(
+        llama_kv_cache_unified * kv,
+        llama_sbatch sbatch,
+        llama_kv_cache_unified::ubatch_heads heads,
+        std::vector<llama_ubatch> ubatches) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), sbatch(std::move(sbatch)), heads(std::move(heads)), ubatches(std::move(ubatches)) {
+}

 llama_kv_cache_unified_state::~llama_kv_cache_unified_state() = default;

@@ -1655,6 +1701,13 @@ bool llama_kv_cache_unified_state::next() {
 bool llama_kv_cache_unified_state::apply() {
     assert(status == LLAMA_MEMORY_STATUS_SUCCESS);

+    // no ubatches -> this is a KV cache update
+    if (ubatches.empty()) {
+        kv->update(lctx, do_shift, dinfo);
+
+        return true;
+    }
+
     kv->apply_ubatch(heads[i_next], ubatches[i_next]);

     n_kv = kv->get_n_kv();
diff --git a/src/llama-kv-cache-unified.h b/src/llama-kv-cache-unified.h
index 1f1d44b97c2ac..6ff388a88b1d5 100644
--- a/src/llama-kv-cache-unified.h
+++ b/src/llama-kv-cache-unified.h
@@ -24,6 +24,19 @@ class llama_kv_cache_unified : public llama_kv_cache {
     // this callback is used to filter out layers that should not be included in the cache
     using layer_filter_cb = std::function<bool(int32_t il)>;

+    using ubatch_heads = std::vector<uint32_t>;
+
+    struct defrag_info {
+        bool empty() const {
+            return ids.empty();
+        }
+
+        // contains information about which cell moves where:
+        //  - cell i moves to ids[i]
+        //  - if ids[i] == i || ids[i] == ids.size(), then cell i is not moved
+        std::vector<uint32_t> ids;
+    };
+
     llama_kv_cache_unified(
             const llama_model &  model,
               layer_filter_cb && filter,
@@ -66,9 +79,7 @@ class llama_kv_cache_unified : public llama_kv_cache {

     llama_memory_state_ptr init_full() override;

-    bool update(llama_context & lctx) override;
-
-    void defrag_sched(float thold) override;
+    llama_memory_state_ptr init_update(llama_context * lctx, bool optimize) override;

     bool get_can_shift() const override;

@@ -83,6 +94,8 @@ class llama_kv_cache_unified : public llama_kv_cache {

     uint32_t get_size() const;

+    bool get_has_shift() const;
+
     //
     // graph_build API
     //
@@ -103,7 +116,9 @@ class llama_kv_cache_unified : public llama_kv_cache {

     // find places for the provided ubatches in the cache, returns the head locations
     // return empty vector on failure
-    std::vector<uint32_t> prepare(const std::vector<llama_ubatch> & ubatches);
+    ubatch_heads prepare(const std::vector<llama_ubatch> & ubatches);
+
+    bool update(llama_context * lctx, bool do_shift, const defrag_info & dinfo);

     // return the cell position where we can insert the ubatch
     // return -1 on failure to find a contiguous slot of kv cells
@@ -133,8 +148,7 @@ class llama_kv_cache_unified : public llama_kv_cache {
         ggml_tensor * v;
     };

-    bool do_defrag = false;
-    bool v_trans   = true;  // the value tensor is transposed
+    bool v_trans = true; // the value tensor is transposed

     // the current index from where we start searching for a free slot in the ring buffer of KV cells (see find_slot())
     // note: this is not part of the KV state and it's only used to speed-up the find_slot() method
@@ -160,13 +174,8 @@ class llama_kv_cache_unified : public llama_kv_cache {
     // model layer id -> KV cache layer id
     std::unordered_map<int32_t, int32_t> map_layer_ids;

-    // defrag
-    struct {
-        std::vector<uint32_t> ids;
-    } defrag_info;
-
-    // return true if cells have been moved
-    bool defrag_prepare(int32_t n_max_nodes);
+    // return non-empty vector if cells have been moved
+    defrag_info defrag_prepare(int32_t n_max_nodes) const;

     size_t total_size() const;

@@ -192,7 +201,8 @@ class llama_kv_cache_unified : public llama_kv_cache {
     llm_graph_result_ptr build_graph_defrag(
             const llama_cparams & cparams,
                    ggml_context * ctx,
-                    ggml_cgraph * gf) const;
+                    ggml_cgraph * gf,
+              const defrag_info & dinfo) const;

     void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
     void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
@@ -203,20 +213,29 @@ class llama_kv_cache_unified : public llama_kv_cache {

 class llama_kv_cache_unified_state : public llama_memory_state_i {
 public:
+    // some shorthands
+    using ubatch_heads = llama_kv_cache_unified::ubatch_heads;
+    using defrag_info  = llama_kv_cache_unified::defrag_info;
+
     // used for errors
     llama_kv_cache_unified_state(llama_memory_status status);

     // used to create a full-cache state
     llama_kv_cache_unified_state(
-            llama_memory_status status,
             llama_kv_cache_unified * kv);

-    // used to create a state from a batch
+    // used to create an update state
+    llama_kv_cache_unified_state(
+            llama_kv_cache_unified * kv,
+            llama_context * lctx,
+            bool do_shift,
+            defrag_info dinfo);
+
+    // used to create a decode state from a batch
     llama_kv_cache_unified_state(
-            llama_memory_status status,
             llama_kv_cache_unified * kv,
             llama_sbatch sbatch,
-            std::vector<uint32_t> heads,
+            ubatch_heads heads,
             std::vector<llama_ubatch> ubatches);

     virtual ~llama_kv_cache_unified_state();

@@ -253,16 +272,30 @@ class llama_kv_cache_unified_state : public llama_memory_state_i {
     void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const;

 private:
-    const llama_memory_status status;
+    llama_memory_status status;

     llama_kv_cache_unified * kv;
+    llama_context * lctx;
+
+    //
+    // update state
+    //
+
+    bool do_shift = false;
+
+    defrag_info dinfo;
+
+    //
+    // batch processing state
+    //

     llama_sbatch sbatch;

     // the index of the next ubatch to process
     size_t i_next = 0;

-    std::vector<uint32_t> heads;
+    ubatch_heads heads;
+
     std::vector<llama_ubatch> ubatches;

     //
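The `defrag_info` move table replaces the member struct removed above; a minimal sketch of how such a table is applied (hypothetical `cell` element type; the real code calls `cells.mv(i, ids[i])` as shown in llama-kv-cache-unified.cpp):

```cpp
#include <cstdint>
#include <utility>
#include <vector>

// Apply a defrag move table: cell i moves to ids[i];
// ids[i] == i or ids[i] == ids.size() means "leave cell i in place".
template <typename cell>
void apply_moves(std::vector<cell> & cells, const std::vector<uint32_t> & ids) {
    for (uint32_t i = 0; i < ids.size(); ++i) {
        if (ids[i] == i || ids[i] == ids.size()) {
            continue;
        }
        cells[ids[i]] = std::move(cells[i]); // compact used cells toward the front
    }
}
```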
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
index 2d04705f27857..17a5e5cb84903 100644
--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@@ -1,12 +1,16 @@
#pragma once

 #include "llama.h"
-#include "llama-io.h"
 #include "llama-memory.h"

+class llama_io_write_i;
+class llama_io_read_i;
+
 struct llama_kv_cache : public llama_memory_i {
     virtual ~llama_kv_cache() = default;

+    // TODO: move the init_ interfaces to llama_memory_i
+
     // split the input batch into a set of ubatches and verify that they can fit into the cache
     // return a state object containing the ubatches and KV cache state required to process them
     // check the llama_memory_state_i::get_status() for the result
@@ -19,16 +23,9 @@ struct llama_kv_cache : public llama_memory_i {
     // simulate full cache, used for allocating worst-case compute buffers
     virtual llama_memory_state_ptr init_full() = 0;

-    // process any pending defrag/shift/etc. operations
-    // optionally call once before processing a new batch
-    // return true if any operations were performed
-    virtual bool update(llama_context & lctx) = 0;
-
-    // schedule a defrag if the fragmentation threshold is exceeded. otherwise, do nothing
-    // TODO: change to
-    //   llama_memory_state_ptr init_defrag(float thold) = 0;
-    //
-    virtual void defrag_sched(float thold) = 0;
+    // prepare for any pending memory updates, such as shifts, defrags, etc.
+    // status == LLAMA_MEMORY_STATUS_NO_UPDATE if there is nothing to update
+    virtual llama_memory_state_ptr init_update(llama_context * lctx, bool optimize) = 0;

     // getters
     virtual bool get_can_shift() const = 0;
diff --git a/src/llama-memory.cpp b/src/llama-memory.cpp
index 10173253edfe4..f1107672c6476 100644
--- a/src/llama-memory.cpp
+++ b/src/llama-memory.cpp
@@ -1 +1,42 @@
 #include "llama-memory.h"
+
+llama_memory_status llama_memory_status_combine(llama_memory_status s0, llama_memory_status s1) {
+    bool has_update = false;
+
+    switch (s0) {
+        case LLAMA_MEMORY_STATUS_SUCCESS:
+            {
+                has_update = true;
+                break;
+            }
+        case LLAMA_MEMORY_STATUS_NO_UPDATE:
+            {
+                break;
+            }
+        case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
+        case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
+            {
+                return s0;
+            }
+    }
+
+    switch (s1) {
+        case LLAMA_MEMORY_STATUS_SUCCESS:
+            {
+                has_update = true;
+                break;
+            }
+        case LLAMA_MEMORY_STATUS_NO_UPDATE:
+            {
+                break;
+            }
+        case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
+        case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
+            {
+                return s1;
+            }
+    }
+
+    // if either status has an update, then the combined status has an update
+    return has_update ? LLAMA_MEMORY_STATUS_SUCCESS : LLAMA_MEMORY_STATUS_NO_UPDATE;
+}
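A compact way to read the combination rules implemented above (a sketch; the assertions only restate what the function guarantees):

```cpp
#include <cassert>

void check_combine_rules() {
    // failures dominate, s0 checked before s1:
    assert(llama_memory_status_combine(LLAMA_MEMORY_STATUS_FAILED_PREPARE,
                                       LLAMA_MEMORY_STATUS_SUCCESS) == LLAMA_MEMORY_STATUS_FAILED_PREPARE);
    // one successful side is enough to report pending work:
    assert(llama_memory_status_combine(LLAMA_MEMORY_STATUS_SUCCESS,
                                       LLAMA_MEMORY_STATUS_NO_UPDATE) == LLAMA_MEMORY_STATUS_SUCCESS);
    // nothing to do on either side:
    assert(llama_memory_status_combine(LLAMA_MEMORY_STATUS_NO_UPDATE,
                                       LLAMA_MEMORY_STATUS_NO_UPDATE) == LLAMA_MEMORY_STATUS_NO_UPDATE);
}
```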
diff --git a/src/llama-memory.h b/src/llama-memory.h
index b3799d66e8c17..ab0d399c4eb60 100644
--- a/src/llama-memory.h
+++ b/src/llama-memory.h
@@ -36,12 +36,19 @@ class llama_memory_i {
     virtual bool get_can_edit() const = 0;
 };

+using llama_memory_ptr = std::unique_ptr<llama_memory_i>;
+
 enum llama_memory_status {
     LLAMA_MEMORY_STATUS_SUCCESS = 0,
+    LLAMA_MEMORY_STATUS_NO_UPDATE,
     LLAMA_MEMORY_STATUS_FAILED_PREPARE,
     LLAMA_MEMORY_STATUS_FAILED_COMPUTE,
 };

+// helper function for combining the status of two memory states
+// useful for implementing hybrid memory types (e.g. iSWA)
+llama_memory_status llama_memory_status_combine(llama_memory_status s0, llama_memory_status s1);
+
 // the interface for managing the memory state during batch processing
 // this interface is implemented per memory type. see:
 //  - llama_kv_cache_unified_state
@@ -69,7 +76,7 @@ class llama_memory_state_i {
     // get the current ubatch
     virtual const llama_ubatch & get_ubatch() const = 0;

-    // get the status of the memory state
+    // get the status of the memory state - used for error handling and checking if any updates would be applied
     virtual llama_memory_status get_status() const = 0;
 };

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 543db93402190..509a4b35f57cb 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -2706,8 +2706,8 @@ struct test_conv_transpose_1d : public test_case {
         return VARS_TO_STR5(ne_input, ne_kernel, s0, p0, d0);
     }

-    test_conv_transpose_1d(std::array<int64_t, 4> ne_input  = {197, 32, 1, 1}, // [input_width, input_height, input_channels, 1]
-                           std::array<int64_t, 4> ne_kernel = {16, 32, 32, 1}, // [kernel_width, kernel_height, input_channels, 1]
+    test_conv_transpose_1d(std::array<int64_t, 4> ne_input  = {197, 32, 1, 1}, // [input_width, input_channels, 1 /* assert in cpu kernel*/, 1 (should be batch)]
+                           std::array<int64_t, 4> ne_kernel = {16, 32, 32, 1}, // [kernel_width, output_channels, input_channels, 1 (should be batch)]
                            int s0 = 1, int p0 = 0, int d0 = 1)
         : ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), p0(p0), d0(d0) {}

@@ -4029,6 +4029,18 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_conv_2d_dw({32, 8, 64, 1}, {3, 3, 1, 64}, 2, 1, 1, false));
     test_cases.emplace_back(new test_conv_2d_dw({32, 8, 64, 1}, {3, 3, 1, 64}, 2, 1, 1, true));

+    for(uint32_t Cout : {1, 9}){
+        for(uint32_t Cin : {1, 7}){
+            for(uint32_t K : {1, 3, 1337}){
+                for(uint32_t L : {1, 2, 13}){
+                    for(uint32_t s0: {1, 2, 3}){
+                        test_cases.emplace_back(new test_conv_transpose_1d({L,Cin,1,1}, {K,Cout,Cin,1}, s0, 0, 1));
+                    }
+                }
+            }
+        }
+    }
+
     test_cases.emplace_back(new test_conv_transpose_1d());
     test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 3, 0, 1));
     test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 2, 0, 1));