diff --git a/.devops/cuda.Dockerfile b/.devops/cuda.Dockerfile index 974dd78a8b08a..0bc4e7ee13f66 100644 --- a/.devops/cuda.Dockerfile +++ b/.devops/cuda.Dockerfile @@ -1,6 +1,6 @@ ARG UBUNTU_VERSION=22.04 # This needs to generally match the container host's environment. -ARG CUDA_VERSION=12.6.0 +ARG CUDA_VERSION=12.4.0 # Target the CUDA build image ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} diff --git a/.devops/llama-cpp-cuda.srpm.spec b/.devops/llama-cpp-cuda.srpm.spec index 7425d3a9d7a40..3bbf4a4def2a5 100644 --- a/.devops/llama-cpp-cuda.srpm.spec +++ b/.devops/llama-cpp-cuda.srpm.spec @@ -17,10 +17,10 @@ Version: %( date "+%%Y%%m%%d" ) Release: 1%{?dist} Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL) License: MIT -Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz +Source0: https://github.com/ggml-org/llama.cpp/archive/refs/heads/master.tar.gz BuildRequires: coreutils make gcc-c++ git cuda-toolkit Requires: cuda-toolkit -URL: https://github.com/ggerganov/llama.cpp +URL: https://github.com/ggml-org/llama.cpp %define debug_package %{nil} %define source_date_epoch_from_changelog 0 diff --git a/.devops/llama-cpp.srpm.spec b/.devops/llama-cpp.srpm.spec index 4d5560089816c..45902dcf896e0 100644 --- a/.devops/llama-cpp.srpm.spec +++ b/.devops/llama-cpp.srpm.spec @@ -18,10 +18,10 @@ Version: %( date "+%%Y%%m%%d" ) Release: 1%{?dist} Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL) License: MIT -Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz +Source0: https://github.com/ggml-org/llama.cpp/archive/refs/heads/master.tar.gz BuildRequires: coreutils make gcc-c++ git libstdc++-devel Requires: libstdc++ -URL: https://github.com/ggerganov/llama.cpp +URL: https://github.com/ggml-org/llama.cpp %define debug_package %{nil} %define source_date_epoch_from_changelog 0 diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix index 043c4364b956a..6e8050a499635 100644 --- a/.devops/nix/package.nix +++ b/.devops/nix/package.nix @@ -133,12 +133,12 @@ effectiveStdenv.mkDerivation (finalAttrs: { --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";" ''; - # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015, + # With PR#6015 https://github.com/ggml-org/llama.cpp/pull/6015, # `default.metallib` may be compiled with Metal compiler from XCode # and we need to escape sandbox on MacOS to access Metal compiler. 
# `xcrun` is used find the path of the Metal compiler, which is varible # and not on $PATH - # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion + # see https://github.com/ggml-org/llama.cpp/pull/6118 for discussion __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders; nativeBuildInputs = @@ -220,7 +220,7 @@ effectiveStdenv.mkDerivation (finalAttrs: { broken = (useMetalKit && !effectiveStdenv.isDarwin); description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}"; - homepage = "https://github.com/ggerganov/llama.cpp/"; + homepage = "https://github.com/ggml-org/llama.cpp/"; license = lib.licenses.mit; # Accommodates `nix run` and `lib.getExe` diff --git a/.devops/rocm.Dockerfile b/.devops/rocm.Dockerfile index a8088ea00da5b..48e7e6aaa5b77 100644 --- a/.devops/rocm.Dockerfile +++ b/.devops/rocm.Dockerfile @@ -11,7 +11,7 @@ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-co FROM ${BASE_ROCM_DEV_CONTAINER} AS build # Unless otherwise specified, we make a fat build. -# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 +# List from https://github.com/ggml-org/llama.cpp/pull/1087#issuecomment-1682807878 # This is mostly tied to rocBLAS supported archs. # gfx803, gfx900, gfx1032, gfx1101, gfx1102,not officialy supported # gfx906 is deprecated diff --git a/.github/ISSUE_TEMPLATE/020-enhancement.yml b/.github/ISSUE_TEMPLATE/020-enhancement.yml index 02dd4f575a686..cee1446f5a097 100644 --- a/.github/ISSUE_TEMPLATE/020-enhancement.yml +++ b/.github/ISSUE_TEMPLATE/020-enhancement.yml @@ -6,7 +6,7 @@ body: - type: markdown attributes: value: | - [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas) + [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggml-org/llama.cpp/discussions/categories/ideas) - type: checkboxes id: prerequisites @@ -16,11 +16,11 @@ body: options: - label: I am running the latest code. Mention the version if possible as well. required: true - - label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md). + - label: I carefully followed the [README.md](https://github.com/ggml-org/llama.cpp/blob/master/README.md). required: true - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed). required: true - - label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share. + - label: I reviewed the [Discussions](https://github.com/ggml-org/llama.cpp/discussions), and have a new and useful enhancement to share. 
required: true - type: textarea diff --git a/.github/ISSUE_TEMPLATE/030-research.yml b/.github/ISSUE_TEMPLATE/030-research.yml index 18975dbbfd0fe..e774550d5908c 100644 --- a/.github/ISSUE_TEMPLATE/030-research.yml +++ b/.github/ISSUE_TEMPLATE/030-research.yml @@ -6,7 +6,7 @@ body: - type: markdown attributes: value: | - Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22) + Don't forget to check for any [duplicate research issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22) - type: checkboxes id: research-stage diff --git a/.github/ISSUE_TEMPLATE/040-refactor.yml b/.github/ISSUE_TEMPLATE/040-refactor.yml index b6e6ab36defd6..2fe94e26c6988 100644 --- a/.github/ISSUE_TEMPLATE/040-refactor.yml +++ b/.github/ISSUE_TEMPLATE/040-refactor.yml @@ -6,8 +6,8 @@ body: - type: markdown attributes: value: | - Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered. - Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too. + Don't forget to [check for existing refactor issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered. + Also you may want to check [Pull request refactor label as well](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too. - type: textarea id: background-description diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index eb8c4b472df4c..0d246533c9515 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,11 +1,11 @@ blank_issues_enabled: true contact_links: - name: Got an idea? - url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas + url: https://github.com/ggml-org/llama.cpp/discussions/categories/ideas about: Pop it there. It may then become an enhancement ticket. - name: Got a question? - url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a + url: https://github.com/ggml-org/llama.cpp/discussions/categories/q-a about: Ask a question there! - name: Want to contribute? 
- url: https://github.com/ggerganov/llama.cpp/wiki/contribute + url: https://github.com/ggml-org/llama.cpp/wiki/contribute about: Head to the contribution guide page of the wiki for areas you can help with diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index d9f5bdc235a00..d0bdd73c4439c 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1 +1 @@ -*Make sure to read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR* +*Make sure to read the [contributing guidelines](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR* diff --git a/.github/workflows/bench.yml.disabled b/.github/workflows/bench.yml.disabled index 1c8787ef78f7e..0370c8943fa0e 100644 --- a/.github/workflows/bench.yml.disabled +++ b/.github/workflows/bench.yml.disabled @@ -1,5 +1,5 @@ # TODO: there have been some issues with the workflow, so disabling for now -# https://github.com/ggerganov/llama.cpp/issues/7893 +# https://github.com/ggml-org/llama.cpp/issues/7893 # # Benchmark name: Benchmark @@ -57,17 +57,7 @@ jobs: if: | inputs.gpu-series == 'Standard_NC4as_T4_v3' - || ( - github.event_name == 'schedule' - && github.ref_name == 'master' - && github.repository_owner == 'ggerganov' - ) || github.event_name == 'pull_request_target' - || ( - github.event_name == 'push' - && github.event.ref == 'refs/heads/master' - && github.repository_owner == 'ggerganov' - ) steps: - name: Clone id: checkout diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 62f4ed8742778..b96e1f50acc9e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -129,7 +129,7 @@ jobs: run: | sysctl -a # Metal is disabled due to intermittent failures with Github runners not having a GPU: - # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 + # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 cmake -B build \ -DCMAKE_BUILD_RPATH="@loader_path" \ -DLLAMA_FATAL_WARNINGS=ON \ @@ -173,7 +173,15 @@ jobs: name: llama-bin-macos-x64.zip ubuntu-cpu-cmake: - runs-on: ubuntu-22.04 + strategy: + matrix: + include: + - build: 'x64' + os: ubuntu-22.04 + - build: 'arm64' + os: ubuntu-22.04-arm + + runs-on: ${{ matrix.os }} steps: - name: Clone @@ -239,14 +247,14 @@ jobs: run: | cp LICENSE ./build/bin/ cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp - zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/* + zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/* - name: Upload artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} uses: actions/upload-artifact@v4 with: - path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip - name: llama-bin-ubuntu-x64.zip + path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip + name: llama-bin-ubuntu-${{ matrix.build }}.zip ubuntu-latest-cmake-sanitizer: runs-on: ubuntu-latest @@ -374,6 +382,8 @@ jobs: - name: Clone id: checkout uses: actions/checkout@v4 + with: + fetch-depth: 0 - name: ccache uses: hendrikmuhs/ccache-action@v1.2.16 @@ -401,7 +411,7 @@ jobs: run: | cd build # This is using llvmpipe and runs slower than other backends - ctest -L main --verbose --timeout 1800 + ctest -L main --verbose --timeout 2700 - name: Determine tag name id: tag @@ 
-1373,8 +1383,10 @@ jobs: needs: - ubuntu-cpu-cmake + - ubuntu-22-cmake-vulkan - windows-latest-cmake - windows-2019-cmake-cuda + - windows-latest-cmake-sycl - windows-latest-cmake-hip-release - macOS-latest-cmake-arm64 - macOS-latest-cmake-x64 diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 6955a7dc8234d..c81d21fcd7e67 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -51,6 +51,8 @@ jobs: - name: Set up QEMU uses: docker/setup-qemu-action@v3 + with: + image: tonistiigi/binfmt:qemu-v7.0.0-28 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 368dbdbe5dccc..0b0f300aa402a 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -11,7 +11,7 @@ jobs: steps: - uses: actions/checkout@v4 with: - repository: "ggerganov/llama.cpp" + repository: "ggml-org/llama.cpp" - uses: actions/labeler@v5 with: configuration-path: '.github/labeler.yml' diff --git a/.gitignore b/.gitignore index 694f36e042fb5..56b5ac2c18cfe 100644 --- a/.gitignore +++ b/.gitignore @@ -98,6 +98,7 @@ examples/server/*.css.hpp examples/server/*.html.hpp examples/server/*.js.hpp examples/server/*.mjs.hpp +examples/server/*.gz.hpp !build_64.sh !examples/*.bat !examples/*/*.kts diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8d411982b4379..8fab0de6f14db 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,10 +1,12 @@ # Pull requests (for contributors) +- llama.cpp uses the ggml tensor library for model evaluation. If you are unfamiliar with ggml, consider taking a look at the [examples in the ggml repository](https://github.com/ggml-org/ggml/tree/master/examples/). [simple](https://github.com/ggml-org/ggml/tree/master/examples/simple) shows the bare minimum for using ggml. [gpt-2](https://github.com/ggml-org/ggml/tree/master/examples/gpt-2) has minimal implementations for language model inference using GPT-2. [mnist](https://github.com/ggml-org/ggml/tree/master/examples/mnist) demonstrates how to train and evaluate a simple image classifier - Test your changes: - Execute [the full CI locally on your machine](ci/README.md) before publishing - Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`) - If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends) - If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops` +- Create separate PRs for each feature or fix. Avoid combining unrelated changes in a single PR - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly - If your PR becomes stale, don't hesitate to ping the maintainers in the comments @@ -12,7 +14,7 @@ - Squash-merge PRs - Use the following format for the squashed commit title: ` : (#)`. For example: `utils : fix typo in utils.py (#1234)` -- Optionally pick a `` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules +- Optionally pick a `` from here: https://github.com/ggml-org/llama.cpp/wiki/Modules - Consider adding yourself to [CODEOWNERS](CODEOWNERS) # Coding guidelines @@ -40,14 +42,14 @@ - Try to follow the existing patterns in the code (indentation, spaces, etc.). 
In case of doubt use `clang-format` to format the added code - For anything not covered in the current guidelines, refer to the [C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines) - Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices -- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$ +- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggml-org/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$ ![matmul](media/matmul.png) # Naming guidelines - Use `snake_case` for function, variable and type names -- Naming usually optimizes for longest common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963) +- Naming usually optimizes for longest common prefix (see https://github.com/ggml-org/ggml/pull/302#discussion_r1243240963) ```cpp // not OK @@ -122,4 +124,4 @@ The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects: -https://github.com/ggerganov/llama.cpp/projects +https://github.com/ggml-org/llama.cpp/projects diff --git a/Makefile b/Makefile index dc3de3cb14e44..5339d490b4e68 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ ifndef LLAMA_MAKEFILE -$(error The Makefile build is deprecated. Use the CMake build instead. For more details, see https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md) +$(error The Makefile build is deprecated. Use the CMake build instead. For more details, see https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md) endif # Define the default target now so that it is always the first target @@ -463,7 +463,7 @@ endif ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))' # The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves. 
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412 - # https://github.com/ggerganov/llama.cpp/issues/2922 + # https://github.com/ggml-org/llama.cpp/issues/2922 MK_CFLAGS += -Xassembler -muse-unaligned-vector-move MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move @@ -680,6 +680,10 @@ ifdef GGML_CUDA_CCBIN MK_NVCCFLAGS += -ccbin $(GGML_CUDA_CCBIN) endif # GGML_CUDA_CCBIN +ifdef GGML_CUDA_NO_FA + MK_NVCCFLAGS += -DGGML_CUDA_NO_FA +endif # GGML_CUDA_NO_FA + ifdef GGML_CUDA_FA_ALL_QUANTS MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS endif # GGML_CUDA_FA_ALL_QUANTS @@ -800,6 +804,10 @@ ifdef GGML_CUDA_NO_PEER_COPY HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY endif # GGML_CUDA_NO_PEER_COPY +ifdef GGML_CUDA_NO_FA + HIPFLAGS += -DGGML_CUDA_NO_FA +endif # GGML_CUDA_NO_FA + OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) OBJ_GGML_EXT += $(OBJ_CUDA_TMPL) @@ -847,7 +855,7 @@ ifdef GGML_MUSA CXX := $(MUSA_PATH)/bin/clang++ MCC := $(CCACHE) $(MUSA_PATH)/bin/mcc - MUSAFLAGS = -x musa -mtgpu + MUSAFLAGS = -fsigned-char -x musa -mtgpu MUSAFLAGS += $(foreach arch,$(subst ;, ,$(MUSA_ARCHITECTURES)),--cuda-gpu-arch=mp_$(arch)) ifdef GGML_CUDA_FORCE_MMQ @@ -876,6 +884,10 @@ ifdef GGML_CUDA_NO_PEER_COPY MUSAFLAGS += -DGGML_CUDA_NO_PEER_COPY endif # GGML_CUDA_NO_PEER_COPY +ifdef GGML_CUDA_NO_FA + MUSAFLAGS += -DGGML_CUDA_NO_FA +endif # GGML_CUDA_NO_FA + ifdef GGML_CUDA_FA_ALL_QUANTS MUSAFLAGS += -DGGML_CUDA_FA_ALL_QUANTS endif # GGML_CUDA_FA_ALL_QUANTS @@ -1078,8 +1090,8 @@ endif ifdef REMOVE_WARNING $(info !!! REMOVAL WARNING !!!) $(info The following LLAMA_ options have been removed and are no longer supported) -$(info - LLAMA_DISABLE_LOGS (https://github.com/ggerganov/llama.cpp/pull/9418)) -$(info - LLAMA_SERVER_VERBOSE (https://github.com/ggerganov/llama.cpp/pull/9418)) +$(info - LLAMA_DISABLE_LOGS (https://github.com/ggml-org/llama.cpp/pull/9418)) +$(info - LLAMA_SERVER_VERBOSE (https://github.com/ggml-org/llama.cpp/pull/9418)) $(info ) endif @@ -1364,7 +1376,7 @@ llama-server: \ examples/server/index.html.hpp \ examples/server/loading.html.hpp \ common/chat.cpp \ - common/chat.hpp \ + common/chat.h \ common/chat-template.hpp \ common/json.hpp \ common/minja.hpp \ diff --git a/README.md b/README.md index 7629647d7e420..f70c8ae1ed851 100644 --- a/README.md +++ b/README.md @@ -3,26 +3,33 @@ ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png) [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) -[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml) +[![Server](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml) -[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml) +[Roadmap](https://github.com/users/ggml-org/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml) Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++ +> [!IMPORTANT] +> New `llama.cpp` package location: 
[ggml-org/llama.cpp](https://github.com/ggml-org/llama.cpp/pkgs/container/llama.cpp) +> +> Update your container URLs to: `ghcr.io/ggml-org/llama.cpp` +> +> More info: https://github.com/ggml-org/llama.cpp/discussions/11801 + ## Recent API changes -- [Changelog for `libllama` API](https://github.com/ggerganov/llama.cpp/issues/9289) -- [Changelog for `llama-server` REST API](https://github.com/ggerganov/llama.cpp/issues/9291) +- [Changelog for `libllama` API](https://github.com/ggml-org/llama.cpp/issues/9289) +- [Changelog for `llama-server` REST API](https://github.com/ggml-org/llama.cpp/issues/9291) ## Hot topics -- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggerganov/llama.cpp/pull/11427 +- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggml-org/llama.cpp/pull/11427 - **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode -- Universal tool call support in `llama-server`: https://github.com/ggerganov/llama.cpp/pull/9639 +- Universal tool call support in `llama-server`: https://github.com/ggml-org/llama.cpp/pull/9639 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim -- Introducing GGUF-my-LoRA https://github.com/ggerganov/llama.cpp/discussions/10123 -- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669 -- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor) +- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123 +- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669 +- Hugging Face GGUF editor: [discussion](https://github.com/ggml-org/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor) ---- @@ -39,7 +46,7 @@ range of hardware - locally and in the cloud. - Vulkan and SYCL backend support - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity -The `llama.cpp` project is the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library. +The `llama.cpp` project is the main playground for developing new features for the [ggml](https://github.com/ggml-org/ggml) library.
Models @@ -59,23 +66,23 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo - [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon) - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2) - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne) -- [X] [BERT](https://github.com/ggerganov/llama.cpp/pull/5423) +- [X] [BERT](https://github.com/ggml-org/llama.cpp/pull/5423) - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/) - [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft) - [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila) -- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187) +- [X] [Starcoder models](https://github.com/ggml-org/llama.cpp/pull/3187) - [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim) -- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417) -- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553) +- [X] [MPT](https://github.com/ggml-org/llama.cpp/pull/3417) +- [X] [Bloom](https://github.com/ggml-org/llama.cpp/pull/3553) - [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi) - [X] [StableLM models](https://huggingface.co/stabilityai) - [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek) - [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen) -- [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557) +- [x] [PLaMo-13B](https://github.com/ggml-org/llama.cpp/pull/3557) - [x] [Phi models](https://huggingface.co/models?search=microsoft/phi) -- [x] [PhiMoE](https://github.com/ggerganov/llama.cpp/pull/11003) +- [x] [PhiMoE](https://github.com/ggml-org/llama.cpp/pull/11003) - [x] [GPT-2](https://huggingface.co/gpt2) -- [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118) +- [x] [Orion 14B](https://github.com/ggml-org/llama.cpp/pull/5118) - [x] [InternLM2](https://huggingface.co/models?search=internlm2) - [x] [CodeShell](https://github.com/WisdomShell/codeshell) - [x] [Gemma](https://ai.google.dev/gemma) @@ -146,7 +153,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig) - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart) - Flutter: [xuegao-tzx/Fllama](https://github.com/xuegao-tzx/Fllama) -- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326) +- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggml-org/llama.cpp/pull/6326) - Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp) - Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift) - Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama) @@ -245,7 +252,7 @@ The project also includes many example programs and tools using the `llama` libr - Clone this repository and build locally, see [how to build](docs/build.md) - On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](docs/install.md) - Use a Docker image, see [documentation for Docker](docs/docker.md) -- Download pre-built 
binaries from [releases](https://github.com/ggerganov/llama.cpp/releases) +- Download pre-built binaries from [releases](https://github.com/ggml-org/llama.cpp/releases) ## Obtaining and quantizing models @@ -258,14 +265,14 @@ You can either manually download the GGUF file or directly use any `llama.cpp`-c After downloading a model, use the CLI tools to run it locally - see below. -`llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo. +`llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggml-org/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo. The Hugging Face platform provides a variety of online tools for converting, quantizing and hosting models with `llama.cpp`: - Use the [GGUF-my-repo space](https://huggingface.co/spaces/ggml-org/gguf-my-repo) to convert to GGUF format and quantize model weights to smaller sizes -- Use the [GGUF-my-LoRA space](https://huggingface.co/spaces/ggml-org/gguf-my-lora) to convert LoRA adapters to GGUF format (more info: https://github.com/ggerganov/llama.cpp/discussions/10123) -- Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggerganov/llama.cpp/discussions/9268) -- Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggerganov/llama.cpp/discussions/9669) +- Use the [GGUF-my-LoRA space](https://huggingface.co/spaces/ggml-org/gguf-my-lora) to convert LoRA adapters to GGUF format (more info: https://github.com/ggml-org/llama.cpp/discussions/10123) +- Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggml-org/llama.cpp/discussions/9268) +- Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggml-org/llama.cpp/discussions/9669) To learn more about model quantization, [read this documentation](examples/quantize/README.md) @@ -488,9 +495,9 @@ To learn more about model quantization, [read this documentation](examples/quant - Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch - Collaborators will be invited based on contributions - Any help with managing issues, PRs and projects is very appreciated! 
-- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions +- See [good first issues](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions - Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information -- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205) +- Make sure to read this: [Inference at the edge](https://github.com/ggml-org/llama.cpp/discussions/205) - A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532) ## Other documentation @@ -505,7 +512,7 @@ To learn more about model quantization, [read this documentation](examples/quant - [Running on Docker](docs/docker.md) - [Build on Android](docs/android.md) - [Performance troubleshooting](docs/development/token_generation_performance_tips.md) -- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks) +- [GGML tips & tricks](https://github.com/ggml-org/llama.cpp/wiki/GGML-Tips-&-Tricks) #### Seminal papers and background on the models diff --git a/SECURITY.md b/SECURITY.md index f4322c6ee4d18..6a1bb6c32cd8e 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -62,6 +62,6 @@ Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp- However, If you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released. -Please disclose it as a private [security advisory](https://github.com/ggerganov/llama.cpp/security/advisories/new). +Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new). A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure. diff --git a/ci/README.md b/ci/README.md index 4064705190697..8245c9df65db8 100644 --- a/ci/README.md +++ b/ci/README.md @@ -1,11 +1,11 @@ # CI -In addition to [Github Actions](https://github.com/ggerganov/llama.cpp/actions) `llama.cpp` uses a custom CI framework: +In addition to [Github Actions](https://github.com/ggml-org/llama.cpp/actions) `llama.cpp` uses a custom CI framework: https://github.com/ggml-org/ci It monitors the `master` branch for new commits and runs the -[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us +[ci/run.sh](https://github.com/ggml-org/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled to cover various hardware architectures, including GPU and Apple Silicon instances. 
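For context on the `ci/README.md` changes above: the custom CI framework runs the same `ci/run.sh` script that contributors can also execute locally before publishing a PR. The snippet below is a minimal sketch of such a local run; the two positional arguments (a results directory and a scratch/mount directory) and the `GG_BUILD_CUDA` toggle are assumptions taken from the usage documented in that README and are not part of this diff.

```bash
# sketch: run the llama.cpp CI suite locally (arguments and env toggle assumed from ci/README.md)
mkdir -p tmp

# CPU-only run: writes results to ./tmp/results and scratch data to ./tmp/mnt
bash ./ci/run.sh ./tmp/results ./tmp/mnt

# optionally enable the CUDA backend for the same run (assumed toggle)
GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```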
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index e61015d2ad7f9..17146fffc1168 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -57,8 +57,7 @@ add_library(${TARGET} STATIC arg.h base64.hpp chat.cpp - chat.hpp - chat-template.hpp + chat.h common.cpp common.h console.cpp @@ -68,7 +67,8 @@ add_library(${TARGET} STATIC llguidance.cpp log.cpp log.h - minja.hpp + minja/chat-template.hpp + minja/minja.hpp ngram-cache.cpp ngram-cache.h sampling.cpp @@ -96,6 +96,22 @@ if (LLAMA_LLGUIDANCE) include(ExternalProject) set(LLGUIDANCE_SRC ${CMAKE_BINARY_DIR}/llguidance/source) set(LLGUIDANCE_PATH ${LLGUIDANCE_SRC}/target/release) + + # Set the correct library file extension based on platform + if (WIN32) + set(LLGUIDANCE_LIB_NAME "llguidance.lib") + # Add Windows-specific libraries + set(LLGUIDANCE_PLATFORM_LIBS + ws2_32 # Windows Sockets API + userenv # For GetUserProfileDirectoryW + ntdll # For NT functions + bcrypt # For BCryptGenRandom + ) + else() + set(LLGUIDANCE_LIB_NAME "libllguidance.a") + set(LLGUIDANCE_PLATFORM_LIBS "") + endif() + ExternalProject_Add(llguidance_ext GIT_REPOSITORY https://github.com/guidance-ai/llguidance # v0.6.12: @@ -106,17 +122,18 @@ if (LLAMA_LLGUIDANCE) CONFIGURE_COMMAND "" BUILD_COMMAND cargo build --release INSTALL_COMMAND "" - BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/libllguidance.a ${LLGUIDANCE_PATH}/llguidance.h + BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME} ${LLGUIDANCE_PATH}/llguidance.h UPDATE_COMMAND "" ) target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_LLGUIDANCE) add_library(llguidance STATIC IMPORTED) - set_target_properties(llguidance PROPERTIES IMPORTED_LOCATION ${LLGUIDANCE_PATH}/libllguidance.a) + set_target_properties(llguidance PROPERTIES IMPORTED_LOCATION ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME}) add_dependencies(llguidance llguidance_ext) target_include_directories(${TARGET} PRIVATE ${LLGUIDANCE_PATH}) - set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance) + # Add platform libraries to the main target + set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS}) endif () target_include_directories(${TARGET} PUBLIC .) 
diff --git a/common/arg.cpp b/common/arg.cpp index 144e93fa9b867..3c169b5b5f48e 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2,6 +2,7 @@ #include "log.h" #include "sampling.h" +#include "chat.h" #include #include @@ -409,6 +410,10 @@ static void common_params_print_completion(common_params_context & ctx_arg) { printf(" COMPREPLY=( $(compgen -f -X '!*.gbnf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n"); printf(" return 0\n"); printf(" ;;\n"); + printf(" --chat-template-file)\n"); + printf(" COMPREPLY=( $(compgen -f -X '!*.jinja' -- \"$cur\") $(compgen -d -- \"$cur\") )\n"); + printf(" return 0\n"); + printf(" ;;\n"); printf(" *)\n"); printf(" COMPREPLY=( $(compgen -W \"${opts}\" -- \"$cur\") )\n"); printf(" return 0\n"); @@ -1565,7 +1570,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "- isolate: only spawn threads on CPUs on the node that execution started on\n" "- numactl: use the CPU map provided by numactl\n" "if run without this previously, it is recommended to drop the system page cache before using this\n" - "see https://github.com/ggerganov/llama.cpp/issues/1437", + "see https://github.com/ggml-org/llama.cpp/issues/1437", [](common_params & params, const std::string & value) { /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } @@ -2243,7 +2248,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_env("LLAMA_LOG_VERBOSITY")); add_opt(common_arg( {"--log-prefix"}, - "Enable prefx in log messages", + "Enable prefix in log messages", [](common_params &) { common_log_set_prefix(common_log_main(), true); } @@ -2497,5 +2502,53 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"--fim-qwen-1.5b-default"}, + string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"), + [](common_params & params) { + params.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF"; + params.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf"; + params.port = 8012; + params.n_gpu_layers = 99; + params.flash_attn = true; + params.n_ubatch = 1024; + params.n_batch = 1024; + params.n_ctx = 0; + params.n_cache_reuse = 256; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + + add_opt(common_arg( + {"--fim-qwen-3b-default"}, + string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"), + [](common_params & params) { + params.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF"; + params.hf_file = "qwen2.5-coder-3b-q8_0.gguf"; + params.port = 8012; + params.n_gpu_layers = 99; + params.flash_attn = true; + params.n_ubatch = 1024; + params.n_batch = 1024; + params.n_ctx = 0; + params.n_cache_reuse = 256; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + + add_opt(common_arg( + {"--fim-qwen-7b-default"}, + string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"), + [](common_params & params) { + params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF"; + params.hf_file = "qwen2.5-coder-7b-q8_0.gguf"; + params.port = 8012; + params.n_gpu_layers = 99; + params.flash_attn = true; + params.n_ubatch = 1024; + params.n_batch = 1024; + params.n_ctx = 0; + params.n_cache_reuse = 256; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + return ctx_arg; } diff --git a/common/chat.cpp b/common/chat.cpp index 
5b8e280aae341..9ebe4c5784cbc 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1,8 +1,433 @@ -#include "chat.hpp" -#include "chat-template.hpp" +#include "chat.h" #include "json-schema-to-grammar.h" #include "log.h" -#include "minja.hpp" +#include "minja/chat-template.hpp" +#include "minja/minja.hpp" + +#include + +typedef minja::chat_template common_chat_template; + +struct common_chat_templates { + bool has_explicit_template; // Model had builtin template or template override was specified. + std::unique_ptr template_default; // always set (defaults to chatml) + std::unique_ptr template_tool_use; +}; + +struct templates_params { + json messages; + json tools; + common_chat_tool_choice tool_choice; + json json_schema; + bool parallel_tool_calls; + bool stream; + std::string grammar; + bool add_generation_prompt = true; + bool extract_reasoning = true; +}; + +common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) { + if (tool_choice == "auto") { + return COMMON_CHAT_TOOL_CHOICE_AUTO; + } + if (tool_choice == "none") { + return COMMON_CHAT_TOOL_CHOICE_NONE; + } + if (tool_choice == "required") { + return COMMON_CHAT_TOOL_CHOICE_REQUIRED; + } + throw std::runtime_error("Invalid tool_choice: " + tool_choice); +} + +template <> +std::vector common_chat_msgs_parse_oaicompat(const json & messages) { + std::vector msgs; + + try { + + if (!messages.is_array()) { + throw std::runtime_error("Expected 'messages' to be an array, got " + messages.dump()); + } + + for (const auto & message : messages) { + if (!message.is_object()) { + throw std::runtime_error("Expected 'message' to be an object, got " + message.dump()); + } + + common_chat_msg msg; + if (!message.contains("role")) { + throw std::runtime_error("Missing 'role' in message: " + message.dump()); + } + msg.role = message.at("role"); + + if (message.contains("content")) { + const auto & content = message.at("content"); + if (content.is_string()) { + msg.content = content; + } else if (content.is_array()) { + for (const auto & part : content) { + if (!part.contains("type")) { + throw std::runtime_error("Missing content part type: " + part.dump()); + } + const auto & type = part.at("type"); + if (type != "text") { + throw std::runtime_error("Unsupported content part type: " + type.dump()); + } + common_chat_msg_content_part msg_part; + msg_part.type = type; + msg_part.text = part.at("text"); + msg.content_parts.push_back(msg_part); + } + } else if (!content.is_null()) { + throw std::runtime_error("Invalid 'content' type: expected string or array, got " + content.dump() + " (ref: https://github.com/ggml-org/llama.cpp/issues/8367)"); + } + } else { + throw std::runtime_error("Expected 'content' (ref: https://github.com/ggml-org/llama.cpp/issues/8367)"); + } + if (message.contains("reasoning_content")) { + msg.reasoning_content = message.at("reasoning_content"); + } + if (message.contains("name")) { + msg.tool_name = message.at("name"); + } + if (message.contains("tool_call_id")) { + msg.tool_call_id = message.at("tool_call_id"); + } + if (message.contains("tool_calls")) { + for (const auto & tool_call : message.at("tool_calls")) { + common_chat_tool_call tc; + if (!tool_call.contains("type")) { + throw std::runtime_error("Missing tool call type: " + tool_call.dump()); + } + const auto & type = tool_call.at("type"); + if (type != "function") { + throw std::runtime_error("Unsupported tool call type: " + tool_call.dump()); + } + if (!tool_call.contains("function")) { + throw std::runtime_error("Missing 
tool call function: " + tool_call.dump()); + } + const auto & fc = tool_call.at("function"); + if (!fc.contains("name")) { + throw std::runtime_error("Missing tool call name: " + tool_call.dump()); + } + tc.name = fc.at("name"); + tc.arguments = fc.at("arguments"); + if (tool_call.contains("id")) { + tc.id = tool_call.at("id"); + } + msg.tool_calls.push_back(tc); + } + } + + msgs.push_back(msg); + } + } catch (const std::exception & e) { + throw std::runtime_error("Failed to parse messages: " + std::string(e.what()) + "; messages = " + messages.dump(2)); + } + + return msgs; +} + +template <> +json common_chat_msgs_to_json_oaicompat(const std::vector & msgs, bool concat_typed_text) { + json messages = json::array(); + for (const auto & msg : msgs) { + if (!msg.content.empty() && !msg.content_parts.empty()) { + throw std::runtime_error("Cannot specify both content and content_parts"); + } + json jmsg { + {"role", msg.role}, + }; + if (!msg.content.empty()) { + jmsg["content"] = msg.content; + } else if (!msg.content_parts.empty()) { + if (concat_typed_text) { + std::string text; + for (const auto & part : msg.content_parts) { + if (part.type != "text") { + LOG_WRN("Ignoring content part type: %s\n", part.type.c_str()); + continue; + } + if (!text.empty()) { + text += '\n'; + } + text += part.text; + } + jmsg["content"] = text; + } else { + auto & parts = jmsg["content"] = json::array(); + for (const auto & part : msg.content_parts) { + parts.push_back({ + {"type", part.type}, + {"text", part.text}, + }); + } + } + } else { + jmsg["content"] = json(); // null + } + if (!msg.reasoning_content.empty()) { + jmsg["reasoning_content"] = msg.reasoning_content; + } + if (!msg.tool_name.empty()) { + jmsg["name"] = msg.tool_name; + } + if (!msg.tool_call_id.empty()) { + jmsg["tool_call_id"] = msg.tool_call_id; + } + if (!msg.tool_calls.empty()) { + auto & tool_calls = jmsg["tool_calls"] = json::array(); + for (const auto & tool_call : msg.tool_calls) { + json tc { + {"type", "function"}, + {"function", { + {"name", tool_call.name}, + {"arguments", tool_call.arguments}, + }}, + }; + if (!tool_call.id.empty()) { + tc["id"] = tool_call.id; + } + tool_calls.push_back(tc); + } + } + messages.push_back(jmsg); + } + return messages; +} + +template <> +std::vector common_chat_msgs_parse_oaicompat(const std::string & messages) { + return common_chat_msgs_parse_oaicompat(json::parse(messages)); +} + +template <> +std::vector common_chat_tools_parse_oaicompat(const json & tools) { + std::vector result; + + try { + if (!tools.is_null()) { + if (!tools.is_array()) { + throw std::runtime_error("Expected 'tools' to be an array, got " + tools.dump()); + } + for (const auto & tool : tools) { + if (!tool.contains("type")) { + throw std::runtime_error("Missing tool type: " + tool.dump()); + } + const auto & type = tool.at("type"); + if (!type.is_string() || type != "function") { + throw std::runtime_error("Unsupported tool type: " + tool.dump()); + } + if (!tool.contains("function")) { + throw std::runtime_error("Missing tool function: " + tool.dump()); + } + + const auto & function = tool.at("function"); + result.push_back({ + /* .name = */ function.at("name"), + /* .description = */ function.at("description"), + /* .parameters = */ function.at("parameters").dump(), + }); + } + } + } catch (const std::exception & e) { + throw std::runtime_error("Failed to parse tools: " + std::string(e.what()) + "; tools = " + tools.dump(2)); + } + + return result; +} + +template <> +std::vector common_chat_tools_parse_oaicompat(const 
std::string & tools) { + return common_chat_tools_parse_oaicompat(json::parse(tools)); +} + +template <> +json common_chat_tools_to_json_oaicompat(const std::vector & tools) { + if (tools.empty()) { + return json(); + } + + auto result = json::array(); + for (const auto & tool : tools) { + result.push_back({ + {"type", "function"}, + {"function", { + {"name", tool.name}, + {"description", tool.description}, + {"parameters", json::parse(tool.parameters)}, + }}, + }); + } + return result; +} + +bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) { + if (use_jinja) { + try { + common_chat_msg msg; + msg.role = "user"; + msg.content = "test"; + + auto tmpls = common_chat_templates_init(/* model= */ nullptr, tmpl); + + common_chat_templates_inputs inputs; + inputs.messages = {msg}; + + common_chat_templates_apply(tmpls.get(), inputs); + return true; + } catch (const std::exception & e) { + LOG_ERR("%s: failed to apply template: %s\n", __func__, e.what()); + return false; + } + } + llama_chat_message chat[] = {{"user", "test"}}; + const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0); + return res >= 0; +} + +std::string common_chat_format_single( + const struct common_chat_templates * tmpls, + const std::vector & past_msg, + const common_chat_msg & new_msg, + bool add_ass, + bool use_jinja) { + + common_chat_templates_inputs inputs; + inputs.use_jinja = use_jinja; + + std::string fmt_past_msg; + if (!past_msg.empty()) { + inputs.messages = past_msg; + inputs.add_generation_prompt = false; + fmt_past_msg = common_chat_templates_apply(tmpls, inputs).prompt; + } + std::ostringstream ss; + // if the past_msg ends with a newline, we must preserve it in the formatted version + if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') { + ss << "\n"; + }; + // format chat with new_msg + inputs.messages.push_back(new_msg); + inputs.add_generation_prompt = add_ass; + auto fmt_new_msg = common_chat_templates_apply(tmpls, inputs).prompt; + // get the diff part + ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size()); + return ss.str(); +} + +std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja) { + common_chat_templates_inputs inputs; + inputs.use_jinja = use_jinja; + auto add_simple_msg = [&](auto role, auto content) { + common_chat_msg msg; + msg.role = role; + msg.content = content; + inputs.messages.push_back(msg); + }; + add_simple_msg("system", "You are a helpful assistant"); + add_simple_msg("user", "Hello"); + add_simple_msg("assistant", "Hi there"); + add_simple_msg("user", "How are you?"); + return common_chat_templates_apply(tmpls, inputs).prompt; +} + +#define CHATML_TEMPLATE_SRC \ + "{%- for message in messages -%}\n" \ + " {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' -}}\n" \ + "{%- endfor -%}\n" \ + "{%- if add_generation_prompt -%}\n" \ + " {{- '<|im_start|>assistant\n' -}}\n" \ + "{%- endif -%}" + +void common_chat_templates_free(struct common_chat_templates * tmpls) { + delete tmpls; +} + +bool common_chat_templates_was_explicit(const struct common_chat_templates * tmpls) { + return tmpls->has_explicit_template; +} + +const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant) { + if (variant != nullptr) { + if (strcmp(variant, "tool_use") == 0) { + if (tmpls->template_tool_use) { + return tmpls->template_tool_use->source().c_str(); + } + return nullptr; + } else { + 
LOG_DBG("%s: unknown template variant: %s\n", __func__, variant); + } + } + return tmpls->template_default->source().c_str(); +} + +common_chat_templates_ptr common_chat_templates_init( + const struct llama_model * model, + const std::string & chat_template_override, + const std::string & bos_token_override, + const std::string & eos_token_override) +{ + std::string default_template_src; + std::string template_tool_use_src; + + bool has_explicit_template = !chat_template_override.empty(); + if (chat_template_override.empty()) { + GGML_ASSERT(model != nullptr); + const auto * str = llama_model_chat_template(model, /* name */ nullptr); + if (str) { + default_template_src = str; + has_explicit_template = true; + } + str = llama_model_chat_template(model, /* name */ "tool_use"); + if (str) { + template_tool_use_src = str; + has_explicit_template = true; + } + } else { + default_template_src = chat_template_override; + } + if (default_template_src.empty() || default_template_src == "chatml") { + if (!template_tool_use_src.empty()) { + default_template_src = template_tool_use_src; + } else { + default_template_src = CHATML_TEMPLATE_SRC; + } + } + std::string token_bos = bos_token_override; + std::string token_eos = eos_token_override; + if (model) { + const auto * vocab = llama_model_get_vocab(model); + const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) { + if (token == LLAMA_TOKEN_NULL) { + if (default_template_src.find(jinja_variable_name) != std::string::npos + || template_tool_use_src.find(jinja_variable_name) != std::string::npos) { + LOG_WRN("common_chat_templates_init: warning: vocab does not have a %s token, jinja template won't work as intended.\n", name); + } + return std::string(); + } + return common_token_to_piece(vocab, token, true); + }; + token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token"); + token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token"); + } + common_chat_templates_ptr tmpls(new common_chat_templates()); + tmpls->has_explicit_template = has_explicit_template; + try { + tmpls->template_default = std::make_unique(default_template_src, token_bos, token_eos); + } catch (const std::exception & e) { + LOG_ERR("%s: failed to parse chat template (defaulting to chatml): %s \n", __func__, e.what()); + tmpls->template_default = std::make_unique(CHATML_TEMPLATE_SRC, token_bos, token_eos); + } + if (!template_tool_use_src.empty()) { + try { + tmpls->template_tool_use = std::make_unique(template_tool_use_src, token_bos, token_eos); + } catch (const std::exception & e) { + LOG_ERR("%s: failed to parse tool use chat template (ignoring it): %s\n", __func__, e.what()); + } + } + return tmpls; +} std::string common_chat_format_name(common_chat_format format) { switch (format) { @@ -38,22 +463,22 @@ static bool parse_json(std::string::const_iterator & it, const std::string::cons json_error_locator() : position(0), found_error(false) {} - bool parse_error(std::size_t position, const std::string &, const json::exception &) override { + bool parse_error(std::size_t position, const std::string &, const json::exception &) override { // NOLINT this->position = position - 1; this->found_error = true; return false; } - bool null() override { return true; } - bool boolean(bool) override { return true; } - bool number_integer(number_integer_t) override { return true; } - bool number_unsigned(number_unsigned_t) override { return true; } - bool number_float(number_float_t, const string_t &) override { return true; } - bool 
string(string_t &) override { return true; } - bool binary(binary_t &) override { return true; } - bool start_object(std::size_t) override { return true; } - bool key(string_t &) override { return true; } + bool null() override { return true; } // NOLINT + bool boolean(bool) override { return true; } // NOLINT + bool number_integer(number_integer_t) override { return true; } // NOLINT + bool number_unsigned(number_unsigned_t) override { return true; } // NOLINT + bool number_float(number_float_t, const string_t &) override { return true; } // NOLINT + bool string(string_t &) override { return true; } // NOLINT + bool binary(binary_t &) override { return true; } // NOLINT + bool start_object(std::size_t) override { return true; } // NOLINT + bool key(string_t &) override { return true; } // NOLINT bool end_object() override { return true; } - bool start_array(std::size_t) override { return true; } + bool start_array(std::size_t) override { return true; } // NOLINT bool end_array() override { return true; } }; json_error_locator err_loc; @@ -187,13 +612,20 @@ static std::string apply( // tmpl_inputs.now = std::chrono::system_clock::now(); minja::chat_template_options tmpl_opts; - tmpl_opts.use_bos_token = false; - tmpl_opts.use_eos_token = false; - - return tmpl.apply(tmpl_inputs, tmpl_opts); + // To avoid double BOS / EOS tokens, we're manually removing beginning / trailing tokens + // instead of using `chat_template_options.use_bos_token = false`, since these tokens + // may be needed inside the template / between messages too. + auto result = tmpl.apply(tmpl_inputs, tmpl_opts); + if (string_starts_with(result, tmpl.bos_token())) { + result = result.substr(tmpl.bos_token().size()); + } + if (string_ends_with(result, tmpl.eos_token())) { + result = result.substr(0, result.size() - tmpl.eos_token().size()); + } + return result; } -static common_chat_params common_chat_params_init_generic(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) { +static common_chat_params common_chat_params_init_generic(const common_chat_template & tmpl, const struct templates_params & inputs) { common_chat_params data; auto tool_call_schemas = json::array(); @@ -247,7 +679,7 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp {"required", json::array({"tool_call"})}, }; const auto schema = - inputs.tool_choice != "required" + inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED ? 
json { {"anyOf", json::array({ tool_call, @@ -303,9 +735,9 @@ static common_chat_msg common_chat_parse_generic(const std::string & input) { return result; } -static common_chat_params common_chat_params_init_mistral_nemo(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) { +static common_chat_params common_chat_params_init_mistral_nemo(const common_chat_template & tmpl, const struct templates_params & inputs) { common_chat_params data; - data.grammar_lazy = inputs.tool_choice != "required"; + data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; data.grammar = build_grammar([&](const common_grammar_builder & builder) { auto schemas = json::array(); foreach_function(inputs.tools, [&](const json & tool) { @@ -348,9 +780,9 @@ static common_chat_msg common_chat_parse_mistral_nemo(const std::string & input) return parse_prefixed_json_tool_call_array(input, "[TOOL_CALLS]"); } -static common_chat_params common_chat_params_init_command_r7b(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) { +static common_chat_params common_chat_params_init_command_r7b(const common_chat_template & tmpl, const struct templates_params & inputs) { common_chat_params data; - data.grammar_lazy = inputs.tool_choice != "required"; + data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; data.grammar = build_grammar([&](const common_grammar_builder & builder) { auto schemas = json::array(); foreach_function(inputs.tools, [&](const json & tool) { @@ -455,10 +887,10 @@ static void expect_tool_parameters(const std::string & name, const json & parame const auto & parameters_required = parameters.at("required"); for (const auto & prop : expected_properties) { if (!parameters_properties.contains(prop)) { - throw std::runtime_error("Parameters of tool " + name + " is missing property: " + prop); + throw std::runtime_error("Parameters of tool " + name + " is missing property: " + prop); // NOLINT } if (std::find(parameters_required.begin(), parameters_required.end(), json(prop)) == parameters_required.end()) { - throw std::runtime_error("Parameters of tool " + name + " must have property marked as required: " + prop); + throw std::runtime_error("Parameters of tool " + name + " must have property marked as required: " + prop); // NOLINT } } if (parameters_properties.size() != expected_properties.size()) { @@ -466,18 +898,16 @@ static void expect_tool_parameters(const std::string & name, const json & parame } } -static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const common_chat_template & tmpl, const struct common_chat_inputs & inputs, bool allow_python_tag_builtin_tools) { +static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const common_chat_template & tmpl, const struct templates_params & inputs, bool allow_python_tag_builtin_tools) { auto builtin_tools = json::array(); common_chat_params data; - data.grammar_lazy = inputs.tool_choice != "required"; + data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; data.grammar = build_grammar([&](const common_grammar_builder & builder) { std::vector tool_rules; auto handle_builtin_tool = [&](const std::string & name, const json & parameters) { - if (name == "wolfram_alpha") { + if (name == "wolfram_alpha" || name == "web_search" || name == "brave_search") { // https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py - expect_tool_parameters(name, parameters, 
{"query"}); - } else if (name == "web_search" || name == "brave_search") { // https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py expect_tool_parameters(name, parameters, {"query"}); } else if (name == "python" || name == "code_interpreter") { @@ -489,7 +919,7 @@ static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const com std::vector kvs; for (const auto & [key, value] : parameters.at("properties").items()) { - kvs.push_back("\"" + key + "=\" " + builder.add_schema(name + "-args-" + key, value)); + kvs.push_back("\"" + key + "=\" " + builder.add_schema(name + "-args-" + key, value)); // NOLINT } tool_rules.push_back( @@ -560,34 +990,33 @@ static common_chat_msg common_chat_parse_llama_3_1(const std::string & input, bo auto arg_value_str = raw_args.substr(it_eq + 1); auto arg_value = json::parse(arg_value_str); - return { - /* .role = */ "assistant", - /* .content = */ match.prefix().str(), - /* .tool_calls = */ { - { - /* .name = */ match[1], - /* .arguments = */ (json { - {arg_name, arg_value}, - }).dump(), - /* .id = */ "", - }, - }, - }; + common_chat_msg msg; + msg.role = "assistant"; + msg.content = match.prefix().str(); + msg.tool_calls.push_back({ + /* .name = */ name, + /* .arguments = */ (json { + {arg_name, arg_value}, + }).dump(), + /* .id = */ "", + }); + return msg; } } return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex); } -static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) { +static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct templates_params & inputs) { common_chat_params data; if (inputs.tools.is_array() && !inputs.tools.empty()) { - data.grammar_lazy = inputs.tool_choice != "required" && inputs.json_schema.is_null(); + data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED && inputs.json_schema.is_null(); data.grammar = build_grammar([&](const common_grammar_builder & builder) { std::vector tool_rules; foreach_function(inputs.tools, [&](const json & tool) { const auto & function = tool.at("function"); std::string name = function.at("name"); auto parameters = function.at("parameters"); + builder.resolve_refs(parameters); auto args_rule = builder.add_schema(name + "-args", parameters); tool_rules.push_back(builder.add_rule(name + "-call", "\"<|tool▁call▁begin|>function<|tool▁sep|>" + name + "\\n" @@ -666,15 +1095,15 @@ static common_chat_msg common_chat_parse_deepseek_r1(const std::string & input, return msg; } -static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) { - fprintf(stderr, "%s\n", __func__); +static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) { + LOG_DBG("%s\n", __func__); common_chat_params data; data.prompt = apply(tmpl, inputs.messages, /* tools= */ nullptr, inputs.add_generation_prompt, { {"datetime", "Jan 29 2025 13:00:00 GMT"}, {"functions", json(inputs.tools.empty() ? 
"" : inputs.tools.dump(2))}, }); if (inputs.tools.is_array() && !inputs.tools.empty()) { - data.grammar_lazy = inputs.tool_choice != "required"; + data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; data.grammar = build_grammar([&](const common_grammar_builder & builder) { auto schemas = json::array(); foreach_function(inputs.tools, [&](const json & tool) { @@ -712,14 +1141,14 @@ static common_chat_msg common_chat_parse_firefunction_v2(const std::string & inp return parse_prefixed_json_tool_call_array(input, " functools[", /* rstrip_prefix= */ 1); } -static common_chat_params common_chat_params_init_functionary_v3_2(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) { +static common_chat_params common_chat_params_init_functionary_v3_2(const common_chat_template & tmpl, const struct templates_params & inputs) { // >>>all\nlet's call functions>>>fn1\n{"arg1": 1...}\n>>>fn2\n{"arg1": 1...}... // Using ">>>f1\n", ">>>f2\n"... as trigger words for the grammar common_chat_params data; data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2; if (inputs.tools.is_array() && !inputs.tools.empty()) { - data.grammar_lazy = inputs.tool_choice != "required"; + data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; data.grammar = build_grammar([&](const common_grammar_builder & builder) { std::vector first_tool_rules; std::vector subsequent_tool_rules; @@ -727,6 +1156,7 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_ const auto & function = tool.at("function"); std::string name = function.at("name"); auto parameters = function.at("parameters"); + builder.resolve_refs(parameters); auto args_rule = builder.add_schema(name + "-args", parameters); first_tool_rules.push_back(builder.add_rule(name + "-call", "\"" + name + "\\n\" " + args_rule)); subsequent_tool_rules.push_back(builder.add_rule(name + "-call2", "\">>>" + name + "\\n\" " + args_rule)); @@ -795,14 +1225,14 @@ static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & in } } -static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) { +static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(const common_chat_template & tmpl, const struct templates_params & inputs) { // https://github.com/MeetKai/functionary/blob/main/tests/prompt_test_v3-llama3.1.txt common_chat_params data; json tools = inputs.tools.is_null() ? 
inputs.tools : json::array(); std::string python_code_argument_name; auto has_raw_python = false; - data.grammar_lazy = inputs.tool_choice != "required"; + data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; data.grammar = build_grammar([&](const common_grammar_builder & builder) { std::vector tool_rules; foreach_function(inputs.tools, [&](const json & tool) { @@ -814,7 +1244,7 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con throw std::runtime_error("Missing type in python tool"); } has_raw_python = true; - auto type = parameters.at("type"); + const auto & type = parameters.at("type"); if (type == "object") { auto properties = parameters.at("properties"); for (auto it = properties.begin(); it != properties.end(); ++it) { @@ -854,17 +1284,15 @@ static common_chat_msg common_chat_parse_functionary_v3_1_llama_3_1(const std::s std::smatch match; if (std::regex_search(input, match, python_tag_regex)) { auto code = match[1].str(); - return { - /* .role = */ "assistant", - /* .content = */ match.prefix().str(), - /* .tool_calls = */ { - { - /* .name = */ "python", - /* .arguments = */ (json {{"code", code}}).dump(), - /* .id = */ "", - }, - } - }; + common_chat_msg msg; + msg.role = "assistant"; + msg.content = match.prefix().str(); + msg.tool_calls.push_back({ + /* .name = */ "python", + /* .arguments = */ (json {{"code", code}}).dump(), + /* .id = */ "", + }); + return msg; } static std::regex function_regex(R"()"); static std::regex close_regex(R"()"); @@ -872,10 +1300,10 @@ static common_chat_msg common_chat_parse_functionary_v3_1_llama_3_1(const std::s return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex); } -static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) { +static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) { common_chat_params data; // (content)?({"name": "foo", "arguments": {"a": 1}})* - data.grammar_lazy = inputs.tool_choice != "required"; + data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; data.grammar = build_grammar([&](const common_grammar_builder & builder) { std::vector tool_rules; foreach_function(inputs.tools, [&](const json & tool) { @@ -908,20 +1336,18 @@ static common_chat_msg common_chat_parse_hermes_2_pro(const std::string & input) std::regex middle_pattern(R"([\n\s]*[\n\s]*)"); std::regex end_pattern(R"([\n\s]*[\n\s]*$)"); + common_chat_msg msg; + msg.role = "assistant"; + auto end = input.end(); std::sregex_iterator rend; std::sregex_iterator rit(input.begin(), end, start_pattern); if (rit == rend) { - return { - /* .role = */ "assistant", - /* .content = */ input, - /* .tool_calls = */ {}, - }; + msg.content = input; + return msg; } - common_chat_msg result; - result.role = "assistant"; - result.content = rit->prefix(); + msg.content = rit->prefix(); auto it = rit->suffix().first; while (it != end) { @@ -930,7 +1356,7 @@ static common_chat_msg common_chat_parse_hermes_2_pro(const std::string & input) throw std::runtime_error("Failed to parse json tool call"); } const auto & arguments = call.at("arguments"); - result.tool_calls.push_back({ + msg.tool_calls.push_back({ call.at("name"), arguments.dump(), // arguments.is_string() ? 
arguments.get() : arguments.dump(), @@ -947,17 +1373,17 @@ static common_chat_msg common_chat_parse_hermes_2_pro(const std::string & input) break; } } - return result; + return msg; } catch (const std::exception & e) { - return { - /* .role = */ "assistant", - /* .content = */ input, - /* .tool_calls = */ {}, - }; + LOG_ERR("Failed to parse hermes 2 pro input: %s\n", e.what()); + common_chat_msg msg; + msg.role = "assistant"; + msg.content = input; + return msg; } } -static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) { +static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) { common_chat_params data; data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY; @@ -968,17 +1394,40 @@ static common_chat_params common_chat_params_init_without_tools(const common_cha } data.grammar = json_schema_to_grammar(inputs.json_schema); } else { - data.grammar = inputs.grammar.empty(); + data.grammar = inputs.grammar; } return data; } -common_chat_params common_chat_params_init(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) { +static common_chat_params common_chat_templates_apply_jinja( + const struct common_chat_templates * tmpls, + const struct common_chat_templates_inputs & inputs) +{ + templates_params params; + params.tools = common_chat_tools_to_json_oaicompat(inputs.tools); + const auto & tmpl = params.tools.is_array() && tmpls->template_tool_use + ? *tmpls->template_tool_use + : *tmpls->template_default; const auto & src = tmpl.source(); const auto & caps = tmpl.original_caps(); + params.messages = common_chat_msgs_to_json_oaicompat(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content); + params.add_generation_prompt = inputs.add_generation_prompt; + params.extract_reasoning = inputs.extract_reasoning; + params.tool_choice = inputs.tool_choice; + params.grammar = inputs.grammar; + if (!inputs.json_schema.empty()) { + params.json_schema = json::parse(inputs.json_schema); + } - if (inputs.tools.is_array()) { - if (inputs.tool_choice != "none" && !inputs.grammar.empty()) { + if (inputs.parallel_tool_calls && !tmpl.original_caps().supports_parallel_tool_calls) { + LOG_DBG("Disabling parallel_tool_calls because the template does not support it\n"); + params.parallel_tool_calls = false; + } else { + params.parallel_tool_calls = inputs.parallel_tool_calls; + } + + if (params.tools.is_array()) { + if (params.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && !params.grammar.empty()) { throw std::runtime_error("Cannot specify grammar with tools"); } if (caps.supports_tool_calls && !caps.supports_tools) { @@ -987,68 +1436,135 @@ common_chat_params common_chat_params_init(const common_chat_template & tmpl, co } // DeepSeek R1: use handler in all cases except json schema (thinking / tools). - if (src.find("<|tool▁calls▁begin|>") != std::string::npos && inputs.json_schema.is_null()) { - return common_chat_params_init_deepseek_r1(tmpl, inputs); + if (src.find("<|tool▁calls▁begin|>") != std::string::npos && params.json_schema.is_null()) { + return common_chat_params_init_deepseek_r1(tmpl, params); } // Command R7B: : use handler in all cases except json schema (thinking / tools). 
- if (src.find("<|END_THINKING|><|START_ACTION|>") != std::string::npos && inputs.json_schema.is_null()) { - return common_chat_params_init_command_r7b(tmpl, inputs); + if (src.find("<|END_THINKING|><|START_ACTION|>") != std::string::npos && params.json_schema.is_null()) { + return common_chat_params_init_command_r7b(tmpl, params); } // Use generic handler when mixing tools + JSON schema. // TODO: support that mix in handlers below. - if ((!inputs.tools.is_array() && inputs.json_schema.is_object())) { - return common_chat_params_init_generic(tmpl, inputs); + if ((params.tools.is_array() && params.json_schema.is_object())) { + return common_chat_params_init_generic(tmpl, params); } // Functionary prepends "all\n" to plain content outputs, so we use its handler in all cases. if (src.find(">>>all") != std::string::npos) { - return common_chat_params_init_functionary_v3_2(tmpl, inputs); + return common_chat_params_init_functionary_v3_2(tmpl, params); } // Firefunction v2 requires datetime and functions in the context even w/o tools, so we also use its handler in all cases. if (src.find(" functools[") != std::string::npos) { - return common_chat_params_init_firefunction_v2(tmpl, inputs); + return common_chat_params_init_firefunction_v2(tmpl, params); } // Plain handler (no tools) - if (inputs.tools.is_null() || inputs.tool_choice == "none") { - return common_chat_params_init_without_tools(tmpl, inputs); + if (params.tools.is_null() || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) { + return common_chat_params_init_without_tools(tmpl, params); } // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools) if (src.find("") != std::string::npos) { - return common_chat_params_init_hermes_2_pro(tmpl, inputs); + return common_chat_params_init_hermes_2_pro(tmpl, params); } // Functionary v3.1 (w/ tools) if (src.find("<|start_header_id|>") != std::string::npos && src.find("ipython<|end_header_id|>") != std::string::npos) { auto allow_python_tag_builtin_tools = src.find("<|python_tag|>") != std::string::npos; - return common_chat_params_init_llama_3_1_tool_calls(tmpl, inputs, allow_python_tag_builtin_tools); + return common_chat_params_init_llama_3_1_tool_calls(tmpl, params, allow_python_tag_builtin_tools); } // Mistral Nemo (w/ tools) if (src.find("[TOOL_CALLS]") != std::string::npos) { - return common_chat_params_init_mistral_nemo(tmpl, inputs); + return common_chat_params_init_mistral_nemo(tmpl, params); } // Generic fallback - return common_chat_params_init_generic(tmpl, inputs); + return common_chat_params_init_generic(tmpl, params); +} + +// Legacy template route (adhoc C++ implementation of known templates), forward to llama_chat_apply_template. 
+static common_chat_params common_chat_templates_apply_legacy( + const struct common_chat_templates * tmpls, + const struct common_chat_templates_inputs & inputs) +{ + int alloc_size = 0; + std::vector<llama_chat_message> chat; + std::vector<std::string> contents; + for (const auto & msg : inputs.messages) { + auto content = msg.content; + for (const auto & part : msg.content_parts) { + if (part.type != "text") { + LOG_WRN("Ignoring non-text content part: %s\n", part.type.c_str()); + continue; + } + if (!content.empty()) { + content += "\n"; + } + content += part.text; + } + contents.emplace_back(std::move(content)); + } + for (size_t i = 0; i < contents.size(); ++i) { + const auto & msg = inputs.messages[i]; + const auto & content = contents[i]; + chat.push_back({msg.role.c_str(), content.c_str()}); + alloc_size += (msg.role.size() + content.size()) * 1.25; + } + + std::vector<char> buf(alloc_size); + + // run the first time to get the total output length + const auto & src = tmpls->template_default->source(); + int32_t res = llama_chat_apply_template(src.c_str(), chat.data(), chat.size(), inputs.add_generation_prompt, buf.data(), buf.size()); + + // error: chat template is not supported + if (res < 0) { + // if the custom "tmpl" is not supported, we throw an error + // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template() + throw std::runtime_error("this custom template is not supported"); + } + + // if it turns out that our buffer is too small, we resize it + if ((size_t) res > buf.size()) { + buf.resize(res); + res = llama_chat_apply_template(src.c_str(), chat.data(), chat.size(), inputs.add_generation_prompt, buf.data(), buf.size()); + } + + common_chat_params params; + params.prompt = std::string(buf.data(), res); + if (!inputs.json_schema.empty()) { + params.grammar = json_schema_to_grammar(json::parse(inputs.json_schema)); + } else { + params.grammar = inputs.grammar; + } + return params; +} + +common_chat_params common_chat_templates_apply( + const struct common_chat_templates * tmpls, + const struct common_chat_templates_inputs & inputs) +{ + GGML_ASSERT(tmpls != nullptr); + return inputs.use_jinja + ? common_chat_templates_apply_jinja(tmpls, inputs) + : common_chat_templates_apply_legacy(tmpls, inputs); } static common_chat_msg common_chat_parse_content_only(const std::string & input) { - return { - /* .role = */ "assistant", - /* .content = */ input, - /* .tool_calls = */ {}, - }; + common_chat_msg msg; + msg.role = "assistant"; + msg.content = input; + return msg; } common_chat_msg common_chat_parse(const std::string & input, common_chat_format format) { diff --git a/common/chat.h b/common/chat.h new file mode 100644 index 0000000000000..e77bef82b9edd --- /dev/null +++ b/common/chat.h @@ -0,0 +1,134 @@ +// Chat support (incl. tool call grammar constraining & output parsing) w/ generic & custom template handlers.
+ +#pragma once + +#include "common.h" +#include <string> +#include <vector> + +struct common_chat_templates; + +struct common_chat_tool_call { + std::string name; + std::string arguments; + std::string id; +}; + +struct common_chat_msg_content_part { + std::string type; + std::string text; +}; + +struct common_chat_msg { + std::string role; + std::string content; + std::vector<common_chat_msg_content_part> content_parts = {}; + std::vector<common_chat_tool_call> tool_calls = {}; + std::string reasoning_content; + std::string tool_name; + std::string tool_call_id; +}; + +struct common_chat_tool { + std::string name; + std::string description; + std::string parameters; +}; + +enum common_chat_tool_choice { + COMMON_CHAT_TOOL_CHOICE_AUTO, + COMMON_CHAT_TOOL_CHOICE_REQUIRED, + COMMON_CHAT_TOOL_CHOICE_NONE, +}; + +enum common_chat_format { + COMMON_CHAT_FORMAT_CONTENT_ONLY, + COMMON_CHAT_FORMAT_GENERIC, + COMMON_CHAT_FORMAT_MISTRAL_NEMO, + COMMON_CHAT_FORMAT_LLAMA_3_X, + COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS, + COMMON_CHAT_FORMAT_DEEPSEEK_R1, + COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING, + COMMON_CHAT_FORMAT_FIREFUNCTION_V2, + COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2, + COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1, + COMMON_CHAT_FORMAT_HERMES_2_PRO, + COMMON_CHAT_FORMAT_COMMAND_R7B, + COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING, + + COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats +}; + +struct common_chat_templates_inputs { + std::vector<common_chat_msg> messages; + std::string grammar; + std::string json_schema; + bool add_generation_prompt = true; + bool use_jinja = true; + // Parameters below only supported when use_jinja is true + std::vector<common_chat_tool> tools; + common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO; + bool parallel_tool_calls = false; + bool extract_reasoning = true; +}; + +struct common_chat_params { + common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY; + std::string prompt; + std::string grammar; + bool grammar_lazy = false; + std::vector<common_grammar_trigger> grammar_triggers; + std::vector<std::string> preserved_tokens; + std::vector<std::string> additional_stops; +}; + +// Check if the template supplied via "--chat-template" is supported or not.
Returns true if it's valid +bool common_chat_verify_template(const std::string & tmpl, bool use_jinja); + +void common_chat_templates_free(struct common_chat_templates * tmpls); + +struct common_chat_templates_deleter { void operator()(common_chat_templates * tmpls) { common_chat_templates_free(tmpls); } }; + +typedef std::unique_ptr<common_chat_templates, common_chat_templates_deleter> common_chat_templates_ptr; + +common_chat_templates_ptr common_chat_templates_init( + const struct llama_model * model, + const std::string & chat_template_override, + const std::string & bos_token_override = "", + const std::string & eos_token_override = ""); + +bool common_chat_templates_was_explicit(const struct common_chat_templates * tmpls); +const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant = nullptr); + + +struct common_chat_params common_chat_templates_apply( + const struct common_chat_templates * tmpls, + const struct common_chat_templates_inputs & inputs); + +// Format single message, while taking into account the position of that message in chat history +std::string common_chat_format_single( + const struct common_chat_templates * tmpls, + const std::vector<common_chat_msg> & past_msg, + const common_chat_msg & new_msg, + bool add_ass, + bool use_jinja); + +// Returns an example of formatted chat +std::string common_chat_format_example( + const struct common_chat_templates * tmpls, + bool use_jinja); + +std::string common_chat_format_name(common_chat_format format); +common_chat_msg common_chat_parse( const std::string & input, common_chat_format format); + +common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice); + +// Parses a JSON array of messages in OpenAI's chat completion API format. +// T can be std::string containing JSON or nlohmann::ordered_json +template <class T> std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const T & messages); +template <class T> T common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false); + +// Parses a JSON array of tools in OpenAI's chat completion tool call API format. +// T can be std::string containing JSON or nlohmann::ordered_json +template <class T> std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const T & tools); +template <class T> T common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools); diff --git a/common/chat.hpp b/common/chat.hpp deleted file mode 100644 index ba1632f669cf7..0000000000000 --- a/common/chat.hpp +++ /dev/null @@ -1,55 +0,0 @@ -// Chat support (incl. tool call grammar constraining & output parsing) w/ generic & custom template handlers.
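The declarations above are the whole public surface that replaces the old `common_chat_params_init` / `common_chat_apply_template` pair. As an editorial illustration, not part of the patch, a caller would drive it roughly as follows; the completion string, logging, and error handling are placeholders.

```cpp
// Minimal sketch against the common/chat.h declarations above; assumes a loaded
// llama_model and treats `completion` as whatever text the model generated.
#include "chat.h"
#include <cstdio>

static void chat_roundtrip(const llama_model * model, const std::string & completion) {
    common_chat_templates_ptr tmpls = common_chat_templates_init(model, /* chat_template_override= */ "");

    common_chat_templates_inputs inputs;
    inputs.use_jinja             = true;  // false takes the legacy llama_chat_apply_template route
    inputs.add_generation_prompt = true;

    common_chat_msg user;
    user.role    = "user";
    user.content = "What's the weather in Paris?";
    inputs.messages.push_back(user);

    // Render the prompt; params also carries the grammar, lazy triggers and stop strings.
    common_chat_params params = common_chat_templates_apply(tmpls.get(), inputs);
    printf("prompt:\n%s\n", params.prompt.c_str());

    // Parse the raw completion back into a structured message using the format
    // selected by the handler that rendered the prompt.
    common_chat_msg out = common_chat_parse(completion, params.format);
    for (const auto & tc : out.tool_calls) {
        printf("tool call: %s(%s)\n", tc.name.c_str(), tc.arguments.c_str());
    }
}
```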
- -#pragma once - -#include "common.h" -#include -#include -#include -#include - -using json = nlohmann::ordered_json; - -struct common_chat_inputs { - json messages; - json tools; - json tool_choice; - json json_schema; - bool parallel_tool_calls; - bool stream; - std::string grammar; - bool add_generation_prompt = true; - bool extract_reasoning = true; -}; - -enum common_chat_format { - COMMON_CHAT_FORMAT_CONTENT_ONLY, - COMMON_CHAT_FORMAT_GENERIC, - COMMON_CHAT_FORMAT_MISTRAL_NEMO, - COMMON_CHAT_FORMAT_LLAMA_3_X, - COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS, - COMMON_CHAT_FORMAT_DEEPSEEK_R1, - COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING, - COMMON_CHAT_FORMAT_FIREFUNCTION_V2, - COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2, - COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1, - COMMON_CHAT_FORMAT_HERMES_2_PRO, - COMMON_CHAT_FORMAT_COMMAND_R7B, - COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING, - - COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats -}; - -struct common_chat_params { - common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY; - json prompt; - std::string grammar; - bool grammar_lazy = false; - std::vector grammar_triggers; - std::vector preserved_tokens; - std::vector additional_stops; -}; - -struct common_chat_params common_chat_params_init(const common_chat_template & tmpl, const struct common_chat_inputs & params); -std::string common_chat_format_name(common_chat_format format); -common_chat_msg common_chat_parse( const std::string & input, common_chat_format format); diff --git a/common/common.cpp b/common/common.cpp index 8661e164ada6b..d2b0d50e3ee39 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -12,8 +12,6 @@ #include "json.hpp" #include "json-schema-to-grammar.h" #include "llama.h" -#include "chat.hpp" -#include "chat-template.hpp" #include #include @@ -1768,174 +1766,6 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto return text; } -// -// Chat template utils -// - -bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) { - if (use_jinja) { - try { - auto chat_template = common_chat_template(tmpl, "", ""); - common_chat_inputs inputs; - inputs.messages = json::array({{ - {"role", "user"}, - {"content", "test"}, - }}); - common_chat_params_init(chat_template, inputs); - return true; - } catch (const std::exception & e) { - LOG_ERR("%s: failed to apply template: %s\n", __func__, e.what()); - return false; - } - } - llama_chat_message chat[] = {{"user", "test"}}; - const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0); - return res >= 0; -} - -std::string common_chat_apply_template( - const common_chat_template & tmpl, - const std::vector & msgs, - bool add_ass, - bool use_jinja) { - if (use_jinja) { - auto messages = json::array(); - for (const auto & msg : msgs) { - messages.push_back({{"role", msg.role}, {"content", msg.content}}); - } - common_chat_inputs inputs; - inputs.messages = messages; - inputs.add_generation_prompt = add_ass; - return common_chat_params_init(tmpl, inputs).prompt; - } - - int alloc_size = 0; - std::vector chat; - for (const auto & msg : msgs) { - chat.push_back({msg.role.c_str(), msg.content.c_str()}); - alloc_size += (msg.role.size() + msg.content.size()) * 1.25; - } - - std::vector buf(alloc_size); - - // run the first time to get the total output length - int32_t res = llama_chat_apply_template(tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size()); - - // error: chat template is not supported - if (res < 
0) { - // if the custom "tmpl" is not supported, we throw an error - // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template() - throw std::runtime_error("this custom template is not supported"); - } - - // if it turns out that our buffer is too small, we resize it - if ((size_t) res > buf.size()) { - buf.resize(res); - res = llama_chat_apply_template(tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size()); - } - - std::string formatted_chat(buf.data(), res); - return formatted_chat; -} - -std::string common_chat_format_single( - const common_chat_template & tmpl, - const std::vector & past_msg, - const common_chat_msg & new_msg, - bool add_ass, - bool use_jinja) { - std::ostringstream ss; - auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(tmpl, past_msg, false, use_jinja); - std::vector chat_new(past_msg); - // if the past_msg ends with a newline, we must preserve it in the formatted version - if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') { - ss << "\n"; - }; - // format chat with new_msg - chat_new.push_back(new_msg); - auto fmt_new_msg = common_chat_apply_template(tmpl, chat_new, add_ass, use_jinja); - // get the diff part - ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size()); - return ss.str(); -} - -std::string common_chat_format_example(const common_chat_template & tmpl, bool use_jinja) { - std::vector msgs = { - {"system", "You are a helpful assistant", {}}, - {"user", "Hello", {}}, - {"assistant", "Hi there", {}}, - {"user", "How are you?", {}}, - }; - return common_chat_apply_template(tmpl, msgs, true, use_jinja); -} - -#define CHATML_TEMPLATE_SRC \ - "{%- for message in messages -%}\n" \ - " {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' -}}\n" \ - "{%- endfor -%}\n" \ - "{%- if add_generation_prompt -%}\n" \ - " {{- '<|im_start|>assistant\n' -}}\n" \ - "{%- endif -%}" - -common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override) -{ - std::string default_template_src; - std::string template_tool_use_src; - - bool has_explicit_template = !chat_template_override.empty(); - if (chat_template_override.empty()) { - auto str = llama_model_chat_template(model, /* name */ nullptr); - if (str) { - default_template_src = str; - has_explicit_template = true; - } - str = llama_model_chat_template(model, /* name */ "tool_use"); - if (str) { - template_tool_use_src = str; - has_explicit_template = true; - } - } else { - default_template_src = chat_template_override; - } - if (default_template_src.empty() || default_template_src == "chatml") { - if (!template_tool_use_src.empty()) { - default_template_src = template_tool_use_src; - } else { - default_template_src = CHATML_TEMPLATE_SRC; - } - } - auto vocab = llama_model_get_vocab(model); - const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) { - if (token == LLAMA_TOKEN_NULL) { - if (default_template_src.find(jinja_variable_name) != std::string::npos - || template_tool_use_src.find(jinja_variable_name) != std::string::npos) { - LOG_WRN("%s: warning: vocab does not have a %s token, jinja template won't work as intended.\n", __func__, name); - } - return std::string(); - } else { - return common_token_to_piece(vocab, token, true); - } - }; - auto token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token"); - auto 
token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token"); - try { - return { - has_explicit_template, - std::make_unique(default_template_src, token_bos, token_eos), - template_tool_use_src.empty() - ? nullptr - : std::make_unique(template_tool_use_src, token_bos, token_eos), - }; - } catch (const std::exception & e) { - LOG_ERR("%s: failed to parse chat template: %s\n", __func__, e.what()); - return { - has_explicit_template, - std::make_unique(CHATML_TEMPLATE_SRC, token_bos, token_eos), - nullptr, - }; - } -} - // // KV cache utils // diff --git a/common/common.h b/common/common.h index 98b9a4464787a..efe8e7f796521 100644 --- a/common/common.h +++ b/common/common.h @@ -178,10 +178,10 @@ struct common_params_speculative { int32_t n_ctx = 0; // draft context size int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding - int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding + int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default) float p_split = 0.1f; // speculative decoding split probability - float p_min = 0.9f; // minimum speculative decoding probability (greedy) + float p_min = 0.75f; // minimum speculative decoding probability (greedy) struct cpu_params cpuparams; struct cpu_params cpuparams_batch; @@ -616,62 +616,6 @@ std::string common_detokenize( const std::vector & tokens, bool special = true); -// -// Chat template utils -// - -struct common_tool_call { - std::string name; - std::string arguments; - std::string id; -}; - -// same with llama_chat_message, but uses std::string -struct common_chat_msg { - std::string role; - std::string content; - std::vector tool_calls; - std::string reasoning_content = ""; -}; - -// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid -bool common_chat_verify_template(const std::string & tmpl, bool use_jinja); - -namespace minja { - class chat_template; -} - -typedef minja::chat_template common_chat_template; - -struct common_chat_templates { - bool has_explicit_template; // Model had builtin template or template overridde was specified. 
- std::unique_ptr template_default; // always set (defaults to chatml) - std::unique_ptr template_tool_use; -}; - -// CPP wrapper for llama_chat_apply_template -// If the built-in template is not supported, we default to chatml -// If the custom "tmpl" is not supported, we throw an error -std::string common_chat_apply_template( - const common_chat_template & tmpl, - const std::vector & chat, - bool add_ass, - bool use_jinja); - -// Format single message, while taking into account the position of that message in chat history -std::string common_chat_format_single( - const common_chat_template & tmpl, - const std::vector & past_msg, - const common_chat_msg & new_msg, - bool add_ass, - bool use_jinja); - -// Returns an example of formatted chat -std::string common_chat_format_example( - const common_chat_template & tmpl, bool use_jinja); - -common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override); - // // KV cache utils // diff --git a/common/chat-template.hpp b/common/minja/chat-template.hpp similarity index 100% rename from common/chat-template.hpp rename to common/minja/chat-template.hpp diff --git a/common/minja.hpp b/common/minja/minja.hpp similarity index 100% rename from common/minja.hpp rename to common/minja/minja.hpp diff --git a/common/speculative.cpp b/common/speculative.cpp index 318e96ea35468..b1fff27a55f91 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -252,11 +252,6 @@ llama_tokens common_speculative_gen_draft( // add drafted token for each sequence const llama_token id = cur_p->data[0].id; - // only collect very high-confidence draft tokens - if (cur_p->data[0].p < params.p_min) { - break; - } - common_sampler_accept(smpl, id, true); result.push_back(id); @@ -265,6 +260,11 @@ llama_tokens common_speculative_gen_draft( break; } + // only collect very high-confidence draft tokens + if (cur_p->data[0].p < params.p_min) { + break; + } + common_batch_add(batch, id, n_past + i + 1, { 0 }, true); // evaluate the drafted tokens on the draft model diff --git a/common/speculative.h b/common/speculative.h index 2baf99fc78ce1..2b51a70ca1f72 100644 --- a/common/speculative.h +++ b/common/speculative.h @@ -9,7 +9,7 @@ struct common_speculative_params { int n_draft = 16; // max drafted tokens int n_reuse = 256; - float p_min = 0.9f; // min probability required to accept a token in the draft + float p_min = 0.75f; // min probability required to accept a token in the draft }; struct common_speculative * common_speculative_init(struct llama_context * ctx_dft); diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 018a2a588ae9d..8b7c75d85a6f5 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -558,7 +558,7 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: # NOTE: this function is generated by convert_hf_to_gguf_update.py # do not modify it manually! 
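Stepping back to the `common/speculative.*` hunks above: the defaults drop `p_min` from 0.9 to 0.75, and the draft loop now keeps the current token before checking its confidence, so a low-probability token ends the draft but is still included in it. A hedged sketch of that acceptance rule follows; the types and names are illustrative only, not the actual `common_speculative` internals.

```cpp
// Editorial sketch (not part of the patch) of the post-change drafting rule.
#include <vector>

struct top_token { int id; float p; };   // hypothetical: best candidate per draft step

static std::vector<int> collect_draft(const std::vector<top_token> & steps, float p_min, size_t n_draft) {
    std::vector<int> result;
    for (const auto & t : steps) {
        result.push_back(t.id);              // keep the current token first ...
        if (result.size() >= n_draft) {
            break;                           // ... stop at the draft budget ...
        }
        if (t.p < p_min) {
            break;                           // ... or when confidence drops below p_min
        }
    }
    return result;
}
```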
- # ref: https://github.com/ggerganov/llama.cpp/pull/6920 + # ref: https://github.com/ggml-org/llama.cpp/pull/6920 # Marker: Start get_vocab_base_pre def get_vocab_base_pre(self, tokenizer) -> str: # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that @@ -708,7 +708,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet") logger.warning("** - the pre-tokenization config has changed upstream") logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.") - logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") + logger.warning("** ref: https://github.com/ggml-org/llama.cpp/pull/6920") logger.warning("**") logger.warning(f"** chkhsh: {chkhsh}") logger.warning("**************************************************************************************") @@ -2835,7 +2835,7 @@ def set_vocab(self): if chat_eos_token_id is not None: # For the chat model, we replace the eos with '<|im_end|>'. # TODO: this is a hack, should be fixed - # https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048 + # https://github.com/ggml-org/llama.cpp/pull/6745#issuecomment-2067687048 special_vocab.special_token_ids["eos"] = chat_eos_token_id logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}" " in chat mode so that the conversation can end normally.") diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index cea34413f441f..fa4989a80c544 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -8,7 +8,7 @@ # provide the necessary information to llama.cpp via the GGUF header in order to implement # the same pre-tokenizer. # -# ref: https://github.com/ggerganov/llama.cpp/pull/6920 +# ref: https://github.com/ggml-org/llama.cpp/pull/6920 # # Instructions: # @@ -246,7 +246,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet") logger.warning("** - the pre-tokenization config has changed upstream") logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.") - logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") + logger.warning("** ref: https://github.com/ggml-org/llama.cpp/pull/6920") logger.warning("**") logger.warning(f"** chkhsh: {{chkhsh}}") logger.warning("**************************************************************************************") diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 6dea14a2329e8..bdc991533b4e0 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -395,7 +395,7 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor") if ".embed_tokens.weight" in name or ".lm_head.weight" in name: logger.error("Embeddings is present in the adapter. 
This can be due to new tokens added during fine tuning") - logger.error("Please refer to https://github.com/ggerganov/llama.cpp/pull/9948") + logger.error("Please refer to https://github.com/ggml-org/llama.cpp/pull/9948") sys.exit(1) if base_name in tensor_map: @@ -419,7 +419,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # some archs may have the same tensor for lm_head and output (tie word embeddings) # in this case, adapters targeting lm_head will fail when using llama-export-lora # therefore, we ignore them for now - # see: https://github.com/ggerganov/llama.cpp/issues/9065 + # see: https://github.com/ggml-org/llama.cpp/issues/9065 if name == "lm_head.weight" and len(dest) == 0: raise ValueError("lm_head is present in adapter, but is ignored in base model") for dest_name, dest_data in dest: diff --git a/docs/android.md b/docs/android.md index 47530c6c1d478..d2a835653fe5d 100644 --- a/docs/android.md +++ b/docs/android.md @@ -12,7 +12,7 @@ $ apt update && apt upgrade -y $ apt install git cmake ``` -Then, follow the [build instructions](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md), specifically for CMake. +Then, follow the [build instructions](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md), specifically for CMake. Once the binaries are built, download your model of choice (e.g., from Hugging Face). It's recommended to place it in the `~/` directory for best performance: diff --git a/docs/backend/OPENCL.md b/docs/backend/OPENCL.md index a604058cbeb97..2a946dc8df0ff 100644 --- a/docs/backend/OPENCL.md +++ b/docs/backend/OPENCL.md @@ -122,7 +122,7 @@ cp libOpenCL.so ~/android-sdk/ndk/26.3.11579264/toolchains/llvm/prebuilt/linux-x ```sh cd ~/dev/llm -git clone https://github.com/ggerganov/llama.cpp && \ +git clone https://github.com/ggml-org/llama.cpp && \ cd llama.cpp && \ mkdir build-android && cd build-android @@ -182,7 +182,7 @@ cmake --build . --target install mkdir -p ~/dev/llm cd ~/dev/llm -git clone https://github.com/ggerganov/llama.cpp && cd llama.cpp +git clone https://github.com/ggml-org/llama.cpp && cd llama.cpp mkdir build && cd build cmake .. -G Ninja ` diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 89ddbd669afa0..5da439e94e092 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -36,12 +36,22 @@ The following release is verified with good quality: |Commit ID|Tag|Release|Verified Platform| Update date| |-|-|-|-|-| -|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1
MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19| -|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1
MTL Arc GPU/Windows 11/oneAPI 2024.1|| +|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1
MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19| +|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1
MTL Arc GPU/Windows 11/oneAPI 2024.1|| ## News +- 2025.2 + - Optimize MUL_MAT Q4_0 on Intel GPU for all dGPUs and built-in GPUs since MTL. Increase the performance of LLM (llama-2-7b.Q4_0.gguf) 21%-87% on Intel GPUs (MTL, ARL-H, Arc, Flex, PVC). + |GPU|Base tokens/s|Increased tokens/s|Percent| + |-|-|-|-| + |PVC 1550|39|73|+87%| + |Flex 170|39|50|+28%| + |Arc770|42|55|+30%| + |MTL|13|16|+23%| + |ARL-H|14|17|+21%| + - 2024.11 - Use syclcompat to improve the performance on some platforms. This requires to use oneAPI 2025.0 or newer. @@ -58,7 +68,7 @@ The following release is verified with good quality: - 2024.3 - Release binary files of Windows. - A blog is published: **Run LLM on all Intel GPUs Using llama.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-llama-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-llama-cpp-fd2e2dcbd9bd). - - New base line is ready: [tag b2437](https://github.com/ggerganov/llama.cpp/tree/b2437). + - New base line is ready: [tag b2437](https://github.com/ggml-org/llama.cpp/tree/b2437). - Support multiple cards: **--split-mode**: [none|layer]; not support [row], it's on developing. - Support to assign main GPU by **--main-gpu**, replace $GGML_SYCL_DEVICE. - Support detecting all GPUs with level-zero and same top **Max compute units**. @@ -97,8 +107,8 @@ SYCL backend supports Intel GPU Family: | Intel Data Center Max Series | Support | Max 1550, 1100 | | Intel Data Center Flex Series | Support | Flex 170 | | Intel Arc Series | Support | Arc 770, 730M, Arc A750 | -| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake | -| Intel iGPU | Support | iGPU in 13700k, i5-1250P, i7-1260P, i7-1165G7 | +| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake, Arrow Lake | +| Intel iGPU | Support | iGPU in 13700k,iGPU in 13400, i5-1250P, i7-1260P, i7-1165G7 | *Notes:* @@ -660,8 +670,10 @@ use 1 SYCL GPUs: [0] with Max compute units:512 | Name | Value | Function | |-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------| | GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG | +| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features based on Intel GPU type, to compare the performance increase | | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.
Recommended to use when --split-mode = layer | + ## Known Issues - `Split-mode:[row]` is not supported. diff --git a/docs/build.md b/docs/build.md index afb7a040218db..b3ecf043d7e48 100644 --- a/docs/build.md +++ b/docs/build.md @@ -3,7 +3,7 @@ **To get the Code:** ```bash -git clone https://github.com/ggerganov/llama.cpp +git clone https://github.com/ggml-org/llama.cpp cd llama.cpp ``` @@ -46,7 +46,7 @@ cmake --build build --config Release ``` - Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers: - - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...): + - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...): - Tab Workload: Desktop-development with C++ - Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang) - Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test @@ -206,6 +206,14 @@ This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GP cmake --build build --config Release ``` + For static build: + + ```bash + cmake -B build -DGGML_MUSA=ON \ + -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON + cmake --build build --config Release + ``` + The environment variable [`MUSA_VISIBLE_DEVICES`](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) can be used to specify which GPU(s) will be used. The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. diff --git a/docs/cuda-fedora.md b/docs/cuda-fedora.md index 9c88b769415cf..75cd2b499d086 100644 --- a/docs/cuda-fedora.md +++ b/docs/cuda-fedora.md @@ -248,7 +248,7 @@ You have successfully set up CUDA on Fedora within a toolbox environment using t - **Building `llama.cpp`:** - - With CUDA installed, you can follow these [build instructions for `llama.cpp`](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md) to compile it with CUDA support. + - With CUDA installed, you can follow these [build instructions for `llama.cpp`](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md) to compile it with CUDA support. - Ensure that any CUDA-specific build flags or paths are correctly set in your build configuration. 
- **Using the Toolbox Environment:** diff --git a/docs/development/HOWTO-add-model.md b/docs/development/HOWTO-add-model.md index 8fcd7081130f2..78c6f76077a2b 100644 --- a/docs/development/HOWTO-add-model.md +++ b/docs/development/HOWTO-add-model.md @@ -104,16 +104,16 @@ Note: to debug the inference graph: you can use [llama-eval-callback](/examples/ ## GGUF specification -https://github.com/ggerganov/ggml/blob/master/docs/gguf.md +https://github.com/ggml-org/ggml/blob/master/docs/gguf.md ## Resources -- YaRN RoPE scaling https://github.com/ggerganov/llama.cpp/pull/2268 -- support Baichuan serial models https://github.com/ggerganov/llama.cpp/pull/3009 -- support attention bias https://github.com/ggerganov/llama.cpp/pull/4283 -- Mixtral support https://github.com/ggerganov/llama.cpp/pull/4406 -- BERT embeddings https://github.com/ggerganov/llama.cpp/pull/5423 -- Grok-1 support https://github.com/ggerganov/llama.cpp/pull/6204 -- Command R Plus support https://github.com/ggerganov/llama.cpp/pull/6491 -- support arch DBRX https://github.com/ggerganov/llama.cpp/pull/6515 -- How to convert HuggingFace model to GGUF format https://github.com/ggerganov/llama.cpp/discussions/2948 +- YaRN RoPE scaling https://github.com/ggml-org/llama.cpp/pull/2268 +- support Baichuan serial models https://github.com/ggml-org/llama.cpp/pull/3009 +- support attention bias https://github.com/ggml-org/llama.cpp/pull/4283 +- Mixtral support https://github.com/ggml-org/llama.cpp/pull/4406 +- BERT embeddings https://github.com/ggml-org/llama.cpp/pull/5423 +- Grok-1 support https://github.com/ggml-org/llama.cpp/pull/6204 +- Command R Plus support https://github.com/ggml-org/llama.cpp/pull/6491 +- support arch DBRX https://github.com/ggml-org/llama.cpp/pull/6515 +- How to convert HuggingFace model to GGUF format https://github.com/ggml-org/llama.cpp/discussions/2948 diff --git a/docs/docker.md b/docs/docker.md index cab5ae9572349..343146dbd214f 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -7,21 +7,21 @@ ## Images We have three Docker images available for this project: -1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`) -2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`) -3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`) +1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`) +2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`) +3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`) Additionally, there the following images, similar to the above: -- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`) -- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`) -- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. 
(platforms: `linux/amd64`) -- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) -- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) -- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) -- `ghcr.io/ggerganov/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`) -- `ghcr.io/ggerganov/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`) -- `ghcr.io/ggerganov/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`) +- `ghcr.io/ggml-org/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`) +- `ghcr.io/ggml-org/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`) +- `ghcr.io/ggml-org/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`) +- `ghcr.io/ggml-org/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) +- `ghcr.io/ggml-org/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) +- `ghcr.io/ggml-org/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) +- `ghcr.io/ggml-org/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`) +- `ghcr.io/ggml-org/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`) +- `ghcr.io/ggml-org/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`) The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library, you'll need to build the images locally for now). @@ -32,25 +32,25 @@ The easiest way to download the models, convert them to ggml and optimize them i Replace `/path/to/models` below with the actual path where you downloaded the models. ```bash -docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B +docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --all-in-one "/models/" 7B ``` On completion, you are ready to play! 
```bash -docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 +docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 ``` or with a light image: ```bash -docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 +docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 ``` or with a server image: ```bash -docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 +docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 ``` ## Docker With CUDA @@ -69,7 +69,7 @@ You may want to pass in some different `ARGS`, depending on the CUDA environment The defaults are: -- `CUDA_VERSION` set to `12.6.0` +- `CUDA_VERSION` set to `12.4.0` - `CUDA_DOCKER_ARCH` set to the cmake build default, which includes all the supported architectures The resulting images, are essentially the same as the non-CUDA images: diff --git a/docs/install.md b/docs/install.md index 10a568506835b..0e23a2c9e7ae1 100644 --- a/docs/install.md +++ b/docs/install.md @@ -7,7 +7,7 @@ On Mac and Linux, the homebrew package manager can be used via ```sh brew install llama.cpp ``` -The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668 +The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/discussions/7668 ## Nix diff --git a/docs/llguidance.md b/docs/llguidance.md index 792d20704036d..cda787b14de04 100644 --- a/docs/llguidance.md +++ b/docs/llguidance.md @@ -13,13 +13,15 @@ cmake -B build -DLLAMA_LLGUIDANCE=ON make -C build -j ``` +For Windows use `cmake --build build --config Release` instead of `make`. + This requires the Rust compiler and the `cargo` tool to be [installed](https://www.rust-lang.org/tools/install). ## Interface There are no new command-line arguments or modifications to `common_params`. When enabled, grammars starting with `%llguidance` are passed to LLGuidance instead of the [current](../grammars/README.md) llama.cpp grammars. Additionally, JSON Schema requests (e.g., using the `-j` argument in `llama-cli`) are also passed to LLGuidance. -For your existing GBNF grammars, you can use [gbnf_to_lark.py script](https://github.com/guidance-ai/llguidance/blob/main/scripts/gbnf_to_lark.py) to convert them to LLGuidance Lark-like format. +For your existing GBNF grammars, you can use [gbnf_to_lark.py script](https://github.com/guidance-ai/llguidance/blob/main/python/llguidance/gbnf_to_lark.py) to convert them to LLGuidance Lark-like format. ## Performance diff --git a/examples/cvector-generator/README.md b/examples/cvector-generator/README.md index be4dd5250f15f..6d5fd74ad8ca0 100644 --- a/examples/cvector-generator/README.md +++ b/examples/cvector-generator/README.md @@ -3,9 +3,9 @@ This example demonstrates how to generate a control vector using gguf models. 
Related PRs: -- [Add support for control vectors](https://github.com/ggerganov/llama.cpp/pull/5970) -- (Issue) [Generate control vector using llama.cpp](https://github.com/ggerganov/llama.cpp/issues/6880) -- [Add cvector-generator example](https://github.com/ggerganov/llama.cpp/pull/7514) +- [Add support for control vectors](https://github.com/ggml-org/llama.cpp/pull/5970) +- (Issue) [Generate control vector using llama.cpp](https://github.com/ggml-org/llama.cpp/issues/6880) +- [Add cvector-generator example](https://github.com/ggml-org/llama.cpp/pull/7514) ## Examples diff --git a/examples/imatrix/README.md b/examples/imatrix/README.md index 9c056986b3ed6..9aa2b20347927 100644 --- a/examples/imatrix/README.md +++ b/examples/imatrix/README.md @@ -1,7 +1,7 @@ # llama.cpp/examples/imatrix -Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantized models. -More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861 +Compute an importance matrix for a model and given text dataset. Can be used during quantization to enhance the quality of the quantized models. +More information is available here: https://github.com/ggml-org/llama.cpp/pull/4861 ## Usage diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 395e2aa47c301..4edc0bfacf125 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -100,7 +100,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * const float * data = is_host ? (const float *) src1->data : m_src1_data.data(); // this has been adapted to the new format of storing merged experts in a single 3d tensor - // ref: https://github.com/ggerganov/llama.cpp/pull/6387 + // ref: https://github.com/ggml-org/llama.cpp/pull/6387 if (t->op == GGML_OP_MUL_MAT_ID) { // ids -> [n_experts_used, n_tokens] // src1 -> [cols, n_expert_used, n_tokens] diff --git a/examples/llama.android/llama/src/main/cpp/CMakeLists.txt b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt index 2de496574f54a..6119fe09b0cb6 100644 --- a/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +++ b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt @@ -14,7 +14,7 @@ project("llama-android") #include(FetchContent) #FetchContent_Declare( # llama -# GIT_REPOSITORY https://github.com/ggerganov/llama.cpp +# GIT_REPOSITORY https://github.com/ggml-org/llama.cpp # GIT_TAG master #) diff --git a/examples/llama.swiftui/README.md b/examples/llama.swiftui/README.md index 96cf743d48202..f717886d661ce 100644 --- a/examples/llama.swiftui/README.md +++ b/examples/llama.swiftui/README.md @@ -3,9 +3,9 @@ Local inference of llama.cpp on an iPhone. This is a sample app that can be used as a starting point for more advanced projects. 
-For usage instructions and performance stats, check the following discussion: https://github.com/ggerganov/llama.cpp/discussions/4508 +For usage instructions and performance stats, check the following discussion: https://github.com/ggml-org/llama.cpp/discussions/4508 -![image](https://github.com/ggerganov/llama.cpp/assets/1991296/2b40284f-8421-47a2-b634-74eece09a299) +![image](https://github.com/ggml-org/llama.cpp/assets/1991296/2b40284f-8421-47a2-b634-74eece09a299) Video demonstration: diff --git a/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift b/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift index 30c2dc4310210..1c3cd9d2efc73 100644 --- a/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift +++ b/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift @@ -124,15 +124,26 @@ struct ContentView: View { } } }.sheet(isPresented: $showingHelp) { // Sheet for help modal - VStack(alignment: .leading) { + NavigationView { VStack(alignment: .leading) { - Text("1. Make sure the model is in GGUF Format") - .padding() - Text("2. Copy the download link of the quantized model") - .padding() + VStack(alignment: .leading) { + Text("1. Make sure the model is in GGUF Format") + .padding() + Text("2. Copy the download link of the quantized model") + .padding() + } + Spacer() + } + .navigationTitle("Help") + .navigationBarTitleDisplayMode(.inline) + .toolbar { + ToolbarItem(placement: .navigationBarTrailing) { + Button("Done") { + showingHelp = false + } + } } - Spacer() - } + } } } } diff --git a/examples/llama.vim b/examples/llama.vim index 57eb2a9772d51..af3fd3935d765 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -39,7 +39,7 @@ " " :call llama#init() " -" more info: https://github.com/ggerganov/llama.cpp/pull/9787 +" more info: https://github.com/ggml-org/llama.cpp/pull/9787 " " colors (adjust to your liking) diff --git a/examples/llava/README-minicpmo2.6.md b/examples/llava/README-minicpmo2.6.md index 8713a43d64fd8..8f591506dbbb0 100644 --- a/examples/llava/README-minicpmo2.6.md +++ b/examples/llava/README-minicpmo2.6.md @@ -26,7 +26,7 @@ python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model ``` Build llama.cpp using `CMake`: -https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md +https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md ```bash cmake -B build diff --git a/examples/llava/README-minicpmv2.5.md b/examples/llava/README-minicpmv2.5.md index 1c8498ff9e151..b0e72a0fa7a78 100644 --- a/examples/llava/README-minicpmv2.5.md +++ b/examples/llava/README-minicpmv2.5.md @@ -6,7 +6,7 @@ Download [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V- Clone llama.cpp: ```bash -git clone https://github.com/ggerganov/llama.cpp +git clone https://github.com/ggml-org/llama.cpp cd llama.cpp ``` diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 271cf2a2ac1ad..e2150c06d3204 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1729,11 +1729,11 @@ void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { } } -static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) { +void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) { img->nx = nx; img->ny = ny; img->buf.resize(3 * nx * ny); - memcpy(img->buf.data(), data, img->buf.size()); + memcpy(img->buf.data(), rgb_pixels, img->buf.size()); } bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) { @@ -1743,7 +1743,7 @@ bool 
clip_image_load_from_file(const char * fname, clip_image_u8 * img) { LOG_ERR("%s: failed to load image '%s'\n", __func__, fname); return false; } - build_clip_img_from_data(data, nx, ny, img); + clip_build_img_from_pixels(data, nx, ny, img); stbi_image_free(data); return true; } @@ -1755,7 +1755,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length LOG_ERR("%s: failed to decode image bytes\n", __func__); return false; } - build_clip_img_from_data(data, nx, ny, img); + clip_build_img_from_pixels(data, nx, ny, img); stbi_image_free(data); return true; } @@ -2712,9 +2712,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima if (!ctx->has_glm_projector) { struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches"); + // The patches vector is used to get rows to index into the embeds with; + // we should skip dim 0 only if we have CLS to avoid going out of bounds + // when retrieving the rows. + int patch_offset = ctx->has_class_embedding ? 1 : 0; int* patches_data = (int*)malloc(ggml_nbytes(patches)); for (int i = 0; i < num_patches; i++) { - patches_data[i] = i + 1; + patches_data[i] = i + patch_offset; } ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); free(patches_data); diff --git a/examples/llava/clip.h b/examples/llava/clip.h index 841b4f6f9098f..f557122008ff1 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -73,6 +73,9 @@ CLIP_API void clip_image_f32_free(struct clip_image_f32 * img); CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch); CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch); +/** build image from pixels decoded by other libraries instead of stb_image.h for better performance. The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes */ +CLIP_API void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img); + CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); /** interpret bytes as an image file with length bytes_length, and use the result to populate img */ diff --git a/examples/lookahead/README.md b/examples/lookahead/README.md index a69a471b47d39..aab3cd0ca49b9 100644 --- a/examples/lookahead/README.md +++ b/examples/lookahead/README.md @@ -4,4 +4,4 @@ Demonstration of lookahead decoding technique: https://lmsys.org/blog/2023-11-21-lookahead-decoding/ -More info: https://github.com/ggerganov/llama.cpp/pull/4207 +More info: https://github.com/ggml-org/llama.cpp/pull/4207 diff --git a/examples/lookup/README.md b/examples/lookup/README.md index 71c345c037a2f..07d73849b0601 100644 --- a/examples/lookup/README.md +++ b/examples/lookup/README.md @@ -8,5 +8,5 @@ The key parameters for lookup decoding are `ngram_min`, `ngram_max` and `n_draft More info: -https://github.com/ggerganov/llama.cpp/pull/4484 -https://github.com/ggerganov/llama.cpp/issues/4226 +https://github.com/ggml-org/llama.cpp/pull/4484 +https://github.com/ggml-org/llama.cpp/issues/4226 diff --git a/examples/main/README.md b/examples/main/README.md index ea71591bd939d..f7c2497294ab5 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -1,6 +1,6 @@ # llama.cpp/examples/main -This example program allows you to use various LLaMA language models easily and efficiently. 
It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts. +This example program allows you to use various LLaMA language models easily and efficiently. It is specifically designed to work with the [llama.cpp](https://github.com/ggml-org/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts. ## Table of Contents @@ -121,7 +121,7 @@ When --in-prefix or --in-suffix options are enabled the chat template ( --chat-t ### Chat templates - `--chat-template JINJA_TEMPLATE`: This option sets a custom jinja chat template. It accepts a string, not a file name. Default: template taken from model's metadata. Llama.cpp only supports [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template). These include llama2, llama3, gemma, monarch, chatml, orion, vicuna, vicuna-orca, deepseek, command-r, zephyr. When --in-prefix or --in-suffix options are enabled the chat template ( --chat-template ) is disabled. + `--chat-template JINJA_TEMPLATE`: This option sets a custom jinja chat template. It accepts a string, not a file name. Default: template taken from model's metadata. Llama.cpp only supports [some pre-defined templates](https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template). These include llama2, llama3, gemma, monarch, chatml, orion, vicuna, vicuna-orca, deepseek, command-r, zephyr. When --in-prefix or --in-suffix options are enabled the chat template ( --chat-template ) is disabled. 
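As a concrete illustration of the option described above (the short form follows right after this sketch), a full command line could look like the following; the model file is a placeholder, and `-cnv` simply forces conversation mode so the selected template is exercised.

```bash
# Chat using the built-in gemma template instead of the template stored in
# the model's metadata (model path is illustrative).
./build/bin/llama-cli -m ./models/gemma-7b-it.Q4_K_M.gguf -cnv --chat-template gemma
```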
Example usage: `--chat-template gemma` diff --git a/examples/main/main.cpp b/examples/main/main.cpp index e654d3542c6c3..cf8659b037ee3 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -4,7 +4,7 @@ #include "log.h" #include "sampling.h" #include "llama.h" -#include "chat-template.hpp" +#include "chat.h" #include #include @@ -158,7 +158,7 @@ int main(int argc, char ** argv) { } const llama_vocab * vocab = llama_model_get_vocab(model); - auto chat_templates = common_chat_templates_from_model(model, params.chat_template); + auto chat_templates = common_chat_templates_init(model, params.chat_template); LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads); @@ -201,7 +201,7 @@ int main(int argc, char ** argv) { } // auto enable conversation mode if chat template is available - const bool has_chat_template = chat_templates.has_explicit_template && chat_templates.template_default; + const bool has_chat_template = common_chat_templates_was_explicit(chat_templates.get()); if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO) { if (has_chat_template) { LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__); @@ -219,7 +219,7 @@ int main(int argc, char ** argv) { // print chat template example in conversation mode if (params.conversation_mode) { if (params.enable_chat_template) { - LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(*chat_templates.template_default, params.use_jinja).c_str()); + LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(chat_templates.get(), params.use_jinja).c_str()); } else { LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__); } @@ -264,9 +264,11 @@ int main(int argc, char ** argv) { std::vector embd_inp; auto chat_add_and_format = [&chat_msgs, &chat_templates](const std::string & role, const std::string & content) { - common_chat_msg new_msg{role, content, {}}; - auto formatted = common_chat_format_single(*chat_templates.template_default, chat_msgs, new_msg, role == "user", g_params->use_jinja); - chat_msgs.push_back({role, content, {}}); + common_chat_msg new_msg; + new_msg.role = role; + new_msg.content = content; + auto formatted = common_chat_format_single(chat_templates.get(), chat_msgs, new_msg, role == "user", g_params->use_jinja); + chat_msgs.push_back(new_msg); LOG_DBG("formatted: '%s'\n", formatted.c_str()); return formatted; }; @@ -755,11 +757,14 @@ int main(int argc, char ** argv) { // check for reverse prompt using special tokens llama_token last_token = common_sampler_last(smpl); - if (std::find(antiprompt_token.begin(), antiprompt_token.end(), last_token) != antiprompt_token.end()) { - if (params.interactive) { - is_interacting = true; + for (auto token : antiprompt_token) { + if (token == last_token) { + if (params.interactive) { + is_interacting = true; + } + is_antiprompt = true; + break; } - is_antiprompt = true; } if (is_antiprompt) { diff --git a/examples/passkey/README.md b/examples/passkey/README.md index 2b8e910f9658d..2f19597c48d7f 100644 --- a/examples/passkey/README.md +++ b/examples/passkey/README.md @@ -5,8 +5,8 @@ models ability to recall information from long contexts. 
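The passkey hunk below only retargets its reference links, so a usage sketch may help: `--junk` sets how many blocks of filler text surround the hidden passkey, and the model path is an illustrative assumption.

```bash
# Bury a passkey in ~250 blocks of junk text and test whether the model can
# recall it from the long context (model path is illustrative).
./build/bin/llama-passkey -m ./models/llama-7b-v2/ggml-model-f16.gguf --junk 250
```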
See the following PRs for more info: -- https://github.com/ggerganov/llama.cpp/pull/3856 -- https://github.com/ggerganov/llama.cpp/pull/4810 +- https://github.com/ggml-org/llama.cpp/pull/3856 +- https://github.com/ggml-org/llama.cpp/pull/4810 ### Usage diff --git a/examples/pydantic_models_to_grammar_examples.py b/examples/pydantic_models_to_grammar_examples.py index eb000d5ccba24..f94b82ca47570 100755 --- a/examples/pydantic_models_to_grammar_examples.py +++ b/examples/pydantic_models_to_grammar_examples.py @@ -23,7 +23,7 @@ def create_completion(host, prompt, gbnf_grammar): """Calls the /completion API on llama-server. See - https://github.com/ggerganov/llama.cpp/tree/HEAD/examples/server#api-endpoints + https://github.com/ggml-org/llama.cpp/tree/HEAD/examples/server#api-endpoints """ print(f" Request:\n Grammar:\n{textwrap.indent(gbnf_grammar, ' ')}\n Prompt:\n{textwrap.indent(prompt.rstrip(), ' ')}") headers = {"Content-Type": "application/json"} diff --git a/examples/quantize/README.md b/examples/quantize/README.md index f9cce7b213334..992d00e21b4fe 100644 --- a/examples/quantize/README.md +++ b/examples/quantize/README.md @@ -69,22 +69,22 @@ Several quantization methods are supported. They differ in the resulting model d | 13B | ms/tok @ 8th | - | 73 | 82 | 98 | 105 | 128 | | 13B | bits/weight | 16.0 | 4.5 | 5.0 | 5.5 | 6.0 | 8.5 | -- [k-quants](https://github.com/ggerganov/llama.cpp/pull/1684) +- [k-quants](https://github.com/ggml-org/llama.cpp/pull/1684) - recent k-quants improvements and new i-quants - - [#2707](https://github.com/ggerganov/llama.cpp/pull/2707) - - [#2807](https://github.com/ggerganov/llama.cpp/pull/2807) - - [#4773 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4773) - - [#4856 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4856) - - [#4861 - importance matrix](https://github.com/ggerganov/llama.cpp/pull/4861) - - [#4872 - MoE models](https://github.com/ggerganov/llama.cpp/pull/4872) - - [#4897 - 2-bit quantization](https://github.com/ggerganov/llama.cpp/pull/4897) - - [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930) - - [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957) - - [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969) - - [#4996 - k-quants tuning](https://github.com/ggerganov/llama.cpp/pull/4996) - - [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060) - - [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196) - - [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361) + - [#2707](https://github.com/ggml-org/llama.cpp/pull/2707) + - [#2807](https://github.com/ggml-org/llama.cpp/pull/2807) + - [#4773 - 2-bit i-quants (inference)](https://github.com/ggml-org/llama.cpp/pull/4773) + - [#4856 - 2-bit i-quants (inference)](https://github.com/ggml-org/llama.cpp/pull/4856) + - [#4861 - importance matrix](https://github.com/ggml-org/llama.cpp/pull/4861) + - [#4872 - MoE models](https://github.com/ggml-org/llama.cpp/pull/4872) + - [#4897 - 2-bit quantization](https://github.com/ggml-org/llama.cpp/pull/4897) + - [#4930 - imatrix for all k-quants](https://github.com/ggml-org/llama.cpp/pull/4930) + - [#4951 - imatrix on the GPU](https://github.com/ggml-org/llama.cpp/pull/4957) + - [#4969 - imatrix for legacy 
quants](https://github.com/ggml-org/llama.cpp/pull/4969) + - [#4996 - k-quants tuning](https://github.com/ggml-org/llama.cpp/pull/4996) + - [#5060 - Q3_K_XS](https://github.com/ggml-org/llama.cpp/pull/5060) + - [#5196 - 3-bit i-quants](https://github.com/ggml-org/llama.cpp/pull/5196) + - [quantization tuning](https://github.com/ggml-org/llama.cpp/pull/5320), [another one](https://github.com/ggml-org/llama.cpp/pull/5334), and [another one](https://github.com/ggml-org/llama.cpp/pull/5361) **Llama 2 7B** diff --git a/examples/retrieval/README.md b/examples/retrieval/README.md index bc5f22e2ff156..6938a1e96ee35 100644 --- a/examples/retrieval/README.md +++ b/examples/retrieval/README.md @@ -3,7 +3,7 @@ Demonstration of simple retrieval technique based on cosine similarity More info: -https://github.com/ggerganov/llama.cpp/pull/6193 +https://github.com/ggml-org/llama.cpp/pull/6193 ### How to use diff --git a/examples/run/run.cpp b/examples/run/run.cpp index 9362da22083d3..38407d5190923 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -24,7 +24,7 @@ #include #include -#include "chat-template.hpp" +#include "chat.h" #include "common.h" #include "json.hpp" #include "linenoise.cpp/linenoise.h" @@ -113,6 +113,7 @@ class Opt { llama_context_params ctx_params; llama_model_params model_params; std::string model_; + std::string chat_template_file; std::string user; bool use_jinja = false; int context_size = -1, ngl = -1; @@ -148,6 +149,16 @@ class Opt { return 0; } + int handle_option_with_value(int argc, const char ** argv, int & i, std::string & option_value) { + if (i + 1 >= argc) { + return 1; + } + + option_value = argv[++i]; + + return 0; + } + int parse(int argc, const char ** argv) { bool options_parsing = true; for (int i = 1, positional_args_i = 0; i < argc; ++i) { @@ -169,6 +180,11 @@ class Opt { verbose = true; } else if (options_parsing && strcmp(argv[i], "--jinja") == 0) { use_jinja = true; + } else if (options_parsing && strcmp(argv[i], "--chat-template-file") == 0){ + if (handle_option_with_value(argc, argv, i, chat_template_file) == 1) { + return 1; + } + use_jinja = true; } else if (options_parsing && parse_flag(argv, i, "-h", "--help")) { help = true; return 0; @@ -207,6 +223,11 @@ class Opt { "Options:\n" " -c, --context-size \n" " Context size (default: %d)\n" + " --chat-template-file \n" + " Path to the file containing the chat template to use with the model.\n" + " Only supports jinja templates and implicitly sets the --jinja flag.\n" + " --jinja\n" + " Use jinja templating for the chat template of the model\n" " -n, -ngl, --ngl \n" " Number of GPU layers (default: %d)\n" " --temp \n" @@ -261,13 +282,12 @@ static int get_terminal_width() { #endif } -#ifdef LLAMA_USE_CURL class File { public: FILE * file = nullptr; FILE * open(const std::string & filename, const char * mode) { - file = fopen(filename.c_str(), mode); + file = ggml_fopen(filename.c_str(), mode); return file; } @@ -303,6 +323,20 @@ class File { return 0; } + std::string to_string() { + fseek(file, 0, SEEK_END); + const size_t size = ftell(file); + fseek(file, 0, SEEK_SET); + std::string out; + out.resize(size); + const size_t read_size = fread(&out[0], 1, size, file); + if (read_size != size) { + printe("Error reading file: %s", strerror(errno)); + } + + return out; + } + ~File() { if (fd >= 0) { # ifdef _WIN32 @@ -327,6 +361,7 @@ class File { # endif }; +#ifdef LLAMA_USE_CURL class HttpClient { public: int init(const std::string & url, const std::vector & headers, const std::string & output_file, @@ 
-557,7 +592,7 @@ class LlamaData { llama_model_ptr model; llama_sampler_ptr sampler; llama_context_ptr context; - std::vector messages; + std::vector messages; // TODO: switch to common_chat_msg std::list msg_strs; std::vector fmtted; @@ -834,44 +869,23 @@ static void add_message(const char * role, const std::string & text, LlamaData & } // Function to apply the chat template and resize `formatted` if needed -static int apply_chat_template(const common_chat_template & tmpl, LlamaData & llama_data, const bool append, bool use_jinja) { - if (use_jinja) { - json messages = json::array(); - for (const auto & msg : llama_data.messages) { - messages.push_back({ - {"role", msg.role}, - {"content", msg.content}, - }); - } - try { - minja::chat_template_inputs tmpl_inputs; - tmpl_inputs.messages = messages; - tmpl_inputs.add_generation_prompt = append; - - minja::chat_template_options tmpl_opts; - tmpl_opts.use_bos_token = false; - tmpl_opts.use_eos_token = false; - - auto result = tmpl.apply(tmpl_inputs, tmpl_opts); - llama_data.fmtted.resize(result.size() + 1); - memcpy(llama_data.fmtted.data(), result.c_str(), result.size() + 1); - return result.size(); - } catch (const std::exception & e) { - printe("failed to render the chat template: %s\n", e.what()); - return -1; - } - } - int result = llama_chat_apply_template( - tmpl.source().c_str(), llama_data.messages.data(), llama_data.messages.size(), append, - append ? llama_data.fmtted.data() : nullptr, append ? llama_data.fmtted.size() : 0); - if (append && result > static_cast(llama_data.fmtted.size())) { - llama_data.fmtted.resize(result); - result = llama_chat_apply_template(tmpl.source().c_str(), llama_data.messages.data(), - llama_data.messages.size(), append, llama_data.fmtted.data(), - llama_data.fmtted.size()); - } - - return result; +static int apply_chat_template(const struct common_chat_templates * tmpls, LlamaData & llama_data, const bool append, bool use_jinja) { + common_chat_templates_inputs inputs; + for (const auto & msg : llama_data.messages) { + common_chat_msg cmsg; + cmsg.role = msg.role; + cmsg.content = msg.content; + inputs.messages.push_back(cmsg); + } + inputs.add_generation_prompt = append; + inputs.use_jinja = use_jinja; + + auto chat_params = common_chat_templates_apply(tmpls, inputs); + // TODO: use other params for tool calls. + auto result = chat_params.prompt; + llama_data.fmtted.resize(result.size() + 1); + memcpy(llama_data.fmtted.data(), result.c_str(), result.size() + 1); + return result.size(); } // Function to tokenize the prompt @@ -963,7 +977,8 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str } static int read_user_input(std::string & user_input) { - static const char * prompt_prefix = "> "; + static const char * prompt_prefix_env = std::getenv("LLAMA_PROMPT_PREFIX"); + static const char * prompt_prefix = prompt_prefix_env ? 
prompt_prefix_env : "> "; #ifdef WIN32 printf("\r" LOG_CLR_TO_EOL LOG_COL_DEFAULT "%s", prompt_prefix); @@ -1015,8 +1030,8 @@ static int generate_response(LlamaData & llama_data, const std::string & prompt, } // Helper function to apply the chat template and handle errors -static int apply_chat_template_with_error_handling(const common_chat_template & tmpl, LlamaData & llama_data, const bool append, int & output_length, bool use_jinja) { - const int new_len = apply_chat_template(tmpl, llama_data, append, use_jinja); +static int apply_chat_template_with_error_handling(const common_chat_templates * tmpls, LlamaData & llama_data, const bool append, int & output_length, bool use_jinja) { + const int new_len = apply_chat_template(tmpls, llama_data, append, use_jinja); if (new_len < 0) { printe("failed to apply the chat template\n"); return -1; @@ -1074,40 +1089,68 @@ static int get_user_input(std::string & user_input, const std::string & user) { return 0; } +// Reads a chat template file to be used +static std::string read_chat_template_file(const std::string & chat_template_file) { + File file; + if (!file.open(chat_template_file, "r")) { + printe("Error opening chat template file '%s': %s", chat_template_file.c_str(), strerror(errno)); + return ""; + } + + return file.to_string(); +} + +static int process_user_message(const Opt & opt, const std::string & user_input, LlamaData & llama_data, + const common_chat_templates_ptr & chat_templates, int & prev_len, + const bool stdout_a_terminal) { + add_message("user", opt.user.empty() ? user_input : opt.user, llama_data); + int new_len; + if (apply_chat_template_with_error_handling(chat_templates.get(), llama_data, true, new_len, opt.use_jinja) < 0) { + return 1; + } + + std::string prompt(llama_data.fmtted.begin() + prev_len, llama_data.fmtted.begin() + new_len); + std::string response; + if (generate_response(llama_data, prompt, response, stdout_a_terminal)) { + return 1; + } + + if (!opt.user.empty()) { + return 2; + } + + add_message("assistant", response, llama_data); + if (apply_chat_template_with_error_handling(chat_templates.get(), llama_data, false, prev_len, opt.use_jinja) < 0) { + return 1; + } + + return 0; +} + // Main chat loop function -static int chat_loop(LlamaData & llama_data, const std::string & user, bool use_jinja) { +static int chat_loop(LlamaData & llama_data, const Opt & opt) { int prev_len = 0; llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get())); - auto chat_templates = common_chat_templates_from_model(llama_data.model.get(), ""); - GGML_ASSERT(chat_templates.template_default); + std::string chat_template; + if (!opt.chat_template_file.empty()) { + chat_template = read_chat_template_file(opt.chat_template_file); + } + + common_chat_templates_ptr chat_templates = common_chat_templates_init(llama_data.model.get(), chat_template); static const bool stdout_a_terminal = is_stdout_a_terminal(); while (true) { // Get user input std::string user_input; - if (get_user_input(user_input, user) == 1) { + if (get_user_input(user_input, opt.user) == 1) { return 0; } - add_message("user", user.empty() ? 
user_input : user, llama_data); - int new_len; - if (apply_chat_template_with_error_handling(*chat_templates.template_default, llama_data, true, new_len, use_jinja) < 0) { - return 1; - } - - std::string prompt(llama_data.fmtted.begin() + prev_len, llama_data.fmtted.begin() + new_len); - std::string response; - if (generate_response(llama_data, prompt, response, stdout_a_terminal)) { + const int ret = process_user_message(opt, user_input, llama_data, chat_templates, prev_len, stdout_a_terminal); + if (ret == 1) { return 1; - } - - if (!user.empty()) { + } else if (ret == 2) { break; } - - add_message("assistant", response, llama_data); - if (apply_chat_template_with_error_handling(*chat_templates.template_default, llama_data, false, prev_len, use_jinja) < 0) { - return 1; - } } return 0; @@ -1165,7 +1208,7 @@ int main(int argc, const char ** argv) { return 1; } - if (chat_loop(llama_data, opt.user, opt.use_jinja)) { + if (chat_loop(llama_data, opt)) { return 1; } diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index 1b7cc8c1328e4..aee90388e4fb3 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -5,7 +5,7 @@ option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF) include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) if (MINGW) - # fix: https://github.com/ggerganov/llama.cpp/actions/runs/9651004652/job/26617901362?pr=8006 + # fix: https://github.com/ggml-org/llama.cpp/actions/runs/9651004652/job/26617901362?pr=8006 add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER}) endif() diff --git a/examples/server/README.md b/examples/server/README.md index 751d4db9ee9d7..a2ae614d7c489 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -7,14 +7,14 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp. **Features:** * LLM inference of F16 and quantized models on GPU and CPU * [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes - * Reranking endoint (WIP: https://github.com/ggerganov/llama.cpp/pull/9510) + * Reranking endoint (WIP: https://github.com/ggml-org/llama.cpp/pull/9510) * Parallel decoding with multi-user support * Continuous batching * Multimodal (wip) * Monitoring endpoints * Schema-constrained JSON response format -The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216). +The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggml-org/llama.cpp/issues/4216). ## Usage @@ -65,7 +65,7 @@ The project is under active development, and we are [looking for feedback and co | `-np, --parallel N` | number of parallel sequences to decode (default: 1)
(env: LLAMA_ARG_N_PARALLEL) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | | `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock)
(env: LLAMA_ARG_NO_MMAP) | -| `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggerganov/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | +| `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggml-org/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | | `-dev, --device ` | comma-separated list of devices to use for offloading (none = don't offload)
use --list-devices to see a list of available devices
(env: LLAMA_ARG_DEVICE) | | `--list-devices` | print list of available devices and exit | | `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM
(env: LLAMA_ARG_N_GPU_LAYERS) | @@ -178,7 +178,7 @@ Example usage of docker compose with environment variables: ```yml services: llamacpp-server: - image: ghcr.io/ggerganov/llama.cpp:server + image: ghcr.io/ggml-org/llama.cpp:server ports: - 8080:8080 volumes: @@ -273,10 +273,10 @@ You can consume the endpoints with Postman or NodeJS with axios library. You can ### Docker ```bash -docker run -p 8080:8080 -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:server -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 +docker run -p 8080:8080 -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:server -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 # or, with CUDA: -docker run -p 8080:8080 -v /path/to/models:/models --gpus all ghcr.io/ggerganov/llama.cpp:server-cuda -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 --n-gpu-layers 99 +docker run -p 8080:8080 -v /path/to/models:/models --gpus all ghcr.io/ggml-org/llama.cpp:server-cuda -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 --n-gpu-layers 99 ``` ## Testing with CURL @@ -1066,7 +1066,7 @@ print(completion.choices[0].text) ### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API -Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used. +Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used. *Options:* @@ -1120,7 +1120,7 @@ curl http://localhost:8080/v1/chat/completions \ *Tool call support* -[Function calling](https://platform.openai.com/docs/guides/function-calling) is supported for all models (see https://github.com/ggerganov/llama.cpp/pull/9639): +[Function calling](https://platform.openai.com/docs/guides/function-calling) is supported for all models (see https://github.com/ggml-org/llama.cpp/pull/9639): - Requires `--jinja` flag - Native tool call formats supported: @@ -1599,7 +1599,7 @@ Apart from error types supported by OAI, we also have custom types that are spec ### Legacy completion web UI -A new chat-based UI has replaced the old completion-based since [this PR](https://github.com/ggerganov/llama.cpp/pull/10175). If you want to use the old completion, start the server with `--path ./examples/server/public_legacy` +A new chat-based UI has replaced the old completion-based since [this PR](https://github.com/ggml-org/llama.cpp/pull/10175). 
If you want to use the old completion, start the server with `--path ./examples/server/public_legacy` For example: diff --git a/examples/server/httplib.h b/examples/server/httplib.h index c2f12dd2ac8d6..593beb50150b1 100644 --- a/examples/server/httplib.h +++ b/examples/server/httplib.h @@ -1,14 +1,14 @@ // // httplib.h // -// Copyright (c) 2024 Yuji Hirose. All rights reserved. +// Copyright (c) 2025 Yuji Hirose. All rights reserved. // MIT License // #ifndef CPPHTTPLIB_HTTPLIB_H #define CPPHTTPLIB_HTTPLIB_H -#define CPPHTTPLIB_VERSION "0.18.5" +#define CPPHTTPLIB_VERSION "0.19.0" /* * Configuration @@ -66,6 +66,10 @@ #define CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_USECOND 0 #endif +#ifndef CPPHTTPLIB_CLIENT_MAX_TIMEOUT_MSECOND +#define CPPHTTPLIB_CLIENT_MAX_TIMEOUT_MSECOND 0 +#endif + #ifndef CPPHTTPLIB_IDLE_INTERVAL_SECOND #define CPPHTTPLIB_IDLE_INTERVAL_SECOND 0 #endif @@ -189,6 +193,7 @@ using ssize_t = long; #endif using socket_t = SOCKET; +using socklen_t = int; #ifdef CPPHTTPLIB_USE_POLL #define poll(fds, nfds, timeout) WSAPoll(fds, nfds, timeout) #endif @@ -218,7 +223,9 @@ using socket_t = SOCKET; #include #include #include +#ifndef __VMS #include +#endif #include #include #include @@ -662,6 +669,8 @@ struct Request { ContentProvider content_provider_; bool is_chunked_content_provider_ = false; size_t authorization_count_ = 0; + std::chrono::time_point start_time_ = + std::chrono::steady_clock::time_point::min(); }; struct Response { @@ -735,6 +744,8 @@ class Stream { virtual void get_local_ip_and_port(std::string &ip, int &port) const = 0; virtual socket_t socket() const = 0; + virtual time_t duration() const = 0; + ssize_t write(const char *ptr); ssize_t write(const std::string &s); }; @@ -838,6 +849,16 @@ using Logger = std::function; using SocketOptions = std::function; +namespace detail { + +bool set_socket_opt_impl(socket_t sock, int level, int optname, + const void *optval, socklen_t optlen); +bool set_socket_opt(socket_t sock, int level, int optname, int opt); +bool set_socket_opt_time(socket_t sock, int level, int optname, time_t sec, + time_t usec); + +} // namespace detail + void default_socket_options(socket_t sock); const char *status_message(int status); @@ -1421,6 +1442,10 @@ class ClientImpl { template void set_write_timeout(const std::chrono::duration &duration); + void set_max_timeout(time_t msec); + template + void set_max_timeout(const std::chrono::duration &duration); + void set_basic_auth(const std::string &username, const std::string &password); void set_bearer_token_auth(const std::string &token); #ifdef CPPHTTPLIB_OPENSSL_SUPPORT @@ -1529,6 +1554,7 @@ class ClientImpl { time_t read_timeout_usec_ = CPPHTTPLIB_CLIENT_READ_TIMEOUT_USECOND; time_t write_timeout_sec_ = CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_SECOND; time_t write_timeout_usec_ = CPPHTTPLIB_CLIENT_WRITE_TIMEOUT_USECOND; + time_t max_timeout_msec_ = CPPHTTPLIB_CLIENT_MAX_TIMEOUT_MSECOND; std::string basic_auth_username_; std::string basic_auth_password_; @@ -1583,9 +1609,6 @@ class ClientImpl { bool send_(Request &req, Response &res, Error &error); Result send_(Request &&req); -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - bool is_ssl_peer_could_be_closed(SSL *ssl) const; -#endif socket_t create_client_socket(Error &error) const; bool read_response_line(Stream &strm, const Request &req, Response &res) const; @@ -1611,8 +1634,10 @@ class ClientImpl { std::string adjust_host_string(const std::string &host) const; - virtual bool process_socket(const Socket &socket, - std::function callback); + virtual bool + 
process_socket(const Socket &socket, + std::chrono::time_point start_time, + std::function callback); virtual bool is_ssl() const; }; @@ -1854,6 +1879,10 @@ class Client { template void set_write_timeout(const std::chrono::duration &duration); + void set_max_timeout(time_t msec); + template + void set_max_timeout(const std::chrono::duration &duration); + void set_basic_auth(const std::string &username, const std::string &password); void set_bearer_token_auth(const std::string &token); #ifdef CPPHTTPLIB_OPENSSL_SUPPORT @@ -1971,12 +2000,16 @@ class SSLClient final : public ClientImpl { void shutdown_ssl(Socket &socket, bool shutdown_gracefully) override; void shutdown_ssl_impl(Socket &socket, bool shutdown_gracefully); - bool process_socket(const Socket &socket, - std::function callback) override; + bool + process_socket(const Socket &socket, + std::chrono::time_point start_time, + std::function callback) override; bool is_ssl() const override; - bool connect_with_proxy(Socket &sock, Response &res, bool &success, - Error &error); + bool connect_with_proxy( + Socket &sock, + std::chrono::time_point start_time, + Response &res, bool &success, Error &error); bool initialize_ssl(Socket &socket, Error &error); bool load_certs(); @@ -2053,20 +2086,45 @@ inline uint64_t Response::get_header_value_u64(const std::string &key, return detail::get_header_value_u64(headers, key, def, id); } -inline void default_socket_options(socket_t sock) { - int opt = 1; +namespace detail { + +inline bool set_socket_opt_impl(socket_t sock, int level, int optname, + const void *optval, socklen_t optlen) { + return setsockopt(sock, level, optname, #ifdef _WIN32 - setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, - reinterpret_cast(&opt), sizeof(opt)); + reinterpret_cast(optval), #else -#ifdef SO_REUSEPORT - setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, - reinterpret_cast(&opt), sizeof(opt)); + optval, +#endif + optlen) == 0; +} + +inline bool set_socket_opt(socket_t sock, int level, int optname, int optval) { + return set_socket_opt_impl(sock, level, optname, &optval, sizeof(optval)); +} + +inline bool set_socket_opt_time(socket_t sock, int level, int optname, + time_t sec, time_t usec) { +#ifdef _WIN32 + auto timeout = static_cast(sec * 1000 + usec / 1000); #else - setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, - reinterpret_cast(&opt), sizeof(opt)); + timeval timeout; + timeout.tv_sec = static_cast(sec); + timeout.tv_usec = static_cast(usec); #endif + return set_socket_opt_impl(sock, level, optname, &timeout, sizeof(timeout)); +} + +} // namespace detail + +inline void default_socket_options(socket_t sock) { + detail::set_socket_opt(sock, SOL_SOCKET, +#ifdef SO_REUSEPORT + SO_REUSEPORT, +#else + SO_REUSEADDR, #endif + 1); } inline const char *status_message(int status) { @@ -2238,6 +2296,14 @@ inline void ClientImpl::set_write_timeout( duration, [&](time_t sec, time_t usec) { set_write_timeout(sec, usec); }); } +template +inline void ClientImpl::set_max_timeout( + const std::chrono::duration &duration) { + auto msec = + std::chrono::duration_cast(duration).count(); + set_max_timeout(msec); +} + template inline void Client::set_connection_timeout( const std::chrono::duration &duration) { @@ -2256,6 +2322,12 @@ Client::set_write_timeout(const std::chrono::duration &duration) { cli_->set_write_timeout(duration); } +template +inline void +Client::set_max_timeout(const std::chrono::duration &duration) { + cli_->set_max_timeout(duration); +} + /* * Forward declarations and types that will be part of the .h file if split into * .h + .cc. 
@@ -2330,10 +2402,12 @@ void split(const char *b, const char *e, char d, void split(const char *b, const char *e, char d, size_t m, std::function fn); -bool process_client_socket(socket_t sock, time_t read_timeout_sec, - time_t read_timeout_usec, time_t write_timeout_sec, - time_t write_timeout_usec, - std::function callback); +bool process_client_socket( + socket_t sock, time_t read_timeout_sec, time_t read_timeout_usec, + time_t write_timeout_sec, time_t write_timeout_usec, + time_t max_timeout_msec, + std::chrono::time_point start_time, + std::function callback); socket_t create_client_socket(const std::string &host, const std::string &ip, int port, int address_family, bool tcp_nodelay, @@ -2381,6 +2455,7 @@ class BufferStream final : public Stream { void get_remote_ip_and_port(std::string &ip, int &port) const override; void get_local_ip_and_port(std::string &ip, int &port) const override; socket_t socket() const override; + time_t duration() const override; const std::string &get_buffer() const; @@ -2547,7 +2622,7 @@ inline bool is_obs_text(char c) { return 128 <= static_cast(c); } inline bool is_field_vchar(char c) { return is_vchar(c) || is_obs_text(c); } inline bool is_field_content(const std::string &s) { - if (s.empty()) { return false; } + if (s.empty()) { return true; } if (s.size() == 1) { return is_field_vchar(s[0]); @@ -3171,60 +3246,43 @@ inline ssize_t send_socket(socket_t sock, const void *ptr, size_t size, }); } -inline ssize_t select_read(socket_t sock, time_t sec, time_t usec) { +template +inline ssize_t select_impl(socket_t sock, time_t sec, time_t usec) { #ifdef CPPHTTPLIB_USE_POLL - struct pollfd pfd_read; - pfd_read.fd = sock; - pfd_read.events = POLLIN; + struct pollfd pfd; + pfd.fd = sock; + pfd.events = (Read ? POLLIN : POLLOUT); auto timeout = static_cast(sec * 1000 + usec / 1000); - return handle_EINTR([&]() { return poll(&pfd_read, 1, timeout); }); + return handle_EINTR([&]() { return poll(&pfd, 1, timeout); }); #else #ifndef _WIN32 if (sock >= FD_SETSIZE) { return -1; } #endif - fd_set fds; + fd_set fds, *rfds, *wfds; FD_ZERO(&fds); FD_SET(sock, &fds); + rfds = (Read ? &fds : nullptr); + wfds = (Read ? 
nullptr : &fds); timeval tv; tv.tv_sec = static_cast(sec); tv.tv_usec = static_cast(usec); return handle_EINTR([&]() { - return select(static_cast(sock + 1), &fds, nullptr, nullptr, &tv); + return select(static_cast(sock + 1), rfds, wfds, nullptr, &tv); }); #endif } -inline ssize_t select_write(socket_t sock, time_t sec, time_t usec) { -#ifdef CPPHTTPLIB_USE_POLL - struct pollfd pfd_read; - pfd_read.fd = sock; - pfd_read.events = POLLOUT; - - auto timeout = static_cast(sec * 1000 + usec / 1000); - - return handle_EINTR([&]() { return poll(&pfd_read, 1, timeout); }); -#else -#ifndef _WIN32 - if (sock >= FD_SETSIZE) { return -1; } -#endif - - fd_set fds; - FD_ZERO(&fds); - FD_SET(sock, &fds); - - timeval tv; - tv.tv_sec = static_cast(sec); - tv.tv_usec = static_cast(usec); +inline ssize_t select_read(socket_t sock, time_t sec, time_t usec) { + return select_impl(sock, sec, usec); +} - return handle_EINTR([&]() { - return select(static_cast(sock + 1), nullptr, &fds, nullptr, &tv); - }); -#endif +inline ssize_t select_write(socket_t sock, time_t sec, time_t usec) { + return select_impl(sock, sec, usec); } inline Error wait_until_socket_is_ready(socket_t sock, time_t sec, @@ -3298,7 +3356,10 @@ inline bool is_socket_alive(socket_t sock) { class SocketStream final : public Stream { public: SocketStream(socket_t sock, time_t read_timeout_sec, time_t read_timeout_usec, - time_t write_timeout_sec, time_t write_timeout_usec); + time_t write_timeout_sec, time_t write_timeout_usec, + time_t max_timeout_msec = 0, + std::chrono::time_point start_time = + std::chrono::steady_clock::time_point::min()); ~SocketStream() override; bool is_readable() const override; @@ -3308,6 +3369,7 @@ class SocketStream final : public Stream { void get_remote_ip_and_port(std::string &ip, int &port) const override; void get_local_ip_and_port(std::string &ip, int &port) const override; socket_t socket() const override; + time_t duration() const override; private: socket_t sock_; @@ -3315,6 +3377,8 @@ class SocketStream final : public Stream { time_t read_timeout_usec_; time_t write_timeout_sec_; time_t write_timeout_usec_; + time_t max_timeout_msec_; + const std::chrono::time_point start_time; std::vector read_buff_; size_t read_buff_off_ = 0; @@ -3326,9 +3390,12 @@ class SocketStream final : public Stream { #ifdef CPPHTTPLIB_OPENSSL_SUPPORT class SSLSocketStream final : public Stream { public: - SSLSocketStream(socket_t sock, SSL *ssl, time_t read_timeout_sec, - time_t read_timeout_usec, time_t write_timeout_sec, - time_t write_timeout_usec); + SSLSocketStream( + socket_t sock, SSL *ssl, time_t read_timeout_sec, + time_t read_timeout_usec, time_t write_timeout_sec, + time_t write_timeout_usec, time_t max_timeout_msec = 0, + std::chrono::time_point start_time = + std::chrono::steady_clock::time_point::min()); ~SSLSocketStream() override; bool is_readable() const override; @@ -3338,6 +3405,7 @@ class SSLSocketStream final : public Stream { void get_remote_ip_and_port(std::string &ip, int &port) const override; void get_local_ip_and_port(std::string &ip, int &port) const override; socket_t socket() const override; + time_t duration() const override; private: socket_t sock_; @@ -3346,6 +3414,8 @@ class SSLSocketStream final : public Stream { time_t read_timeout_usec_; time_t write_timeout_sec_; time_t write_timeout_usec_; + time_t max_timeout_msec_; + const std::chrono::time_point start_time; }; #endif @@ -3416,13 +3486,15 @@ process_server_socket(const std::atomic &svr_sock, socket_t sock, }); } -inline bool 
process_client_socket(socket_t sock, time_t read_timeout_sec, - time_t read_timeout_usec, - time_t write_timeout_sec, - time_t write_timeout_usec, - std::function callback) { +inline bool process_client_socket( + socket_t sock, time_t read_timeout_sec, time_t read_timeout_usec, + time_t write_timeout_sec, time_t write_timeout_usec, + time_t max_timeout_msec, + std::chrono::time_point start_time, + std::function callback) { SocketStream strm(sock, read_timeout_sec, read_timeout_usec, - write_timeout_sec, write_timeout_usec); + write_timeout_sec, write_timeout_usec, max_timeout_msec, + start_time); return callback(strm); } @@ -3569,26 +3641,10 @@ socket_t create_socket(const std::string &host, const std::string &ip, int port, } #endif - if (tcp_nodelay) { - auto opt = 1; -#ifdef _WIN32 - setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, - reinterpret_cast(&opt), sizeof(opt)); -#else - setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, - reinterpret_cast(&opt), sizeof(opt)); -#endif - } + if (tcp_nodelay) { set_socket_opt(sock, IPPROTO_TCP, TCP_NODELAY, 1); } if (rp->ai_family == AF_INET6) { - auto opt = ipv6_v6only ? 1 : 0; -#ifdef _WIN32 - setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, - reinterpret_cast(&opt), sizeof(opt)); -#else - setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, - reinterpret_cast(&opt), sizeof(opt)); -#endif + set_socket_opt(sock, IPPROTO_IPV6, IPV6_V6ONLY, ipv6_v6only ? 1 : 0); } if (socket_options) { socket_options(sock); } @@ -3731,36 +3787,10 @@ inline socket_t create_client_socket( } set_nonblocking(sock2, false); - - { -#ifdef _WIN32 - auto timeout = static_cast(read_timeout_sec * 1000 + - read_timeout_usec / 1000); - setsockopt(sock2, SOL_SOCKET, SO_RCVTIMEO, - reinterpret_cast(&timeout), sizeof(timeout)); -#else - timeval tv; - tv.tv_sec = static_cast(read_timeout_sec); - tv.tv_usec = static_cast(read_timeout_usec); - setsockopt(sock2, SOL_SOCKET, SO_RCVTIMEO, - reinterpret_cast(&tv), sizeof(tv)); -#endif - } - { - -#ifdef _WIN32 - auto timeout = static_cast(write_timeout_sec * 1000 + - write_timeout_usec / 1000); - setsockopt(sock2, SOL_SOCKET, SO_SNDTIMEO, - reinterpret_cast(&timeout), sizeof(timeout)); -#else - timeval tv; - tv.tv_sec = static_cast(write_timeout_sec); - tv.tv_usec = static_cast(write_timeout_usec); - setsockopt(sock2, SOL_SOCKET, SO_SNDTIMEO, - reinterpret_cast(&tv), sizeof(tv)); -#endif - } + set_socket_opt_time(sock2, SOL_SOCKET, SO_RCVTIMEO, read_timeout_sec, + read_timeout_usec); + set_socket_opt_time(sock2, SOL_SOCKET, SO_SNDTIMEO, write_timeout_sec, + write_timeout_usec); error = Error::Success; return true; @@ -4212,22 +4242,21 @@ inline bool parse_header(const char *beg, const char *end, T fn) { if (!key_len) { return false; } auto key = std::string(beg, key_end); - auto val = case_ignore::equal(key, "Location") - ? std::string(p, end) - : decode_url(std::string(p, end), false); - - // NOTE: From RFC 9110: - // Field values containing CR, LF, or NUL characters are - // invalid and dangerous, due to the varying ways that - // implementations might parse and interpret those - // characters; a recipient of CR, LF, or NUL within a field - // value MUST either reject the message or replace each of - // those characters with SP before further processing or - // forwarding of that message. - static const std::string CR_LF_NUL("\r\n\0", 3); - if (val.find_first_of(CR_LF_NUL) != std::string::npos) { return false; } - - fn(key, val); + // auto val = (case_ignore::equal(key, "Location") || + // case_ignore::equal(key, "Referer")) + // ? 
std::string(p, end) + // : decode_url(std::string(p, end), false); + auto val = std::string(p, end); + + if (!detail::fields::is_field_value(val)) { return false; } + + if (case_ignore::equal(key, "Location") || + case_ignore::equal(key, "Referer")) { + fn(key, val); + } else { + fn(key, decode_url(val, false)); + } + return true; } @@ -4263,7 +4292,7 @@ inline bool read_headers(Stream &strm, Headers &headers) { auto end = line_reader.ptr() + line_reader.size() - line_terminator_len; if (!parse_header(line_reader.ptr(), end, - [&](const std::string &key, std::string &val) { + [&](const std::string &key, const std::string &val) { headers.emplace(key, val); })) { return false; @@ -4312,7 +4341,7 @@ inline bool read_content_without_length(Stream &strm, uint64_t r = 0; for (;;) { auto n = strm.read(buf, CPPHTTPLIB_RECV_BUFSIZ); - if (n <= 0) { return true; } + if (n <= 0) { return false; } if (!out(buf, static_cast(n), r, 0)) { return false; } r += static_cast(n); @@ -4456,9 +4485,9 @@ bool read_content(Stream &strm, T &x, size_t payload_max_length, int &status, ret = read_content_without_length(strm, out); } else { auto is_invalid_value = false; - auto len = get_header_value_u64(x.headers, "Content-Length", - std::numeric_limits::max(), - 0, is_invalid_value); + auto len = get_header_value_u64( + x.headers, "Content-Length", + (std::numeric_limits::max)(), 0, is_invalid_value); if (is_invalid_value) { ret = false; @@ -5380,10 +5409,14 @@ write_multipart_ranges_data(Stream &strm, const Request &req, Response &res, inline bool expect_content(const Request &req) { if (req.method == "POST" || req.method == "PUT" || req.method == "PATCH" || - req.method == "PRI" || req.method == "DELETE") { + req.method == "DELETE") { + return true; + } + if (req.has_header("Content-Length") && + req.get_header_value_u64("Content-Length") > 0) { return true; } - // TODO: check if Content-Length is set + if (is_chunked_transfer_encoding(req.headers)) { return true; } return false; } @@ -5428,9 +5461,76 @@ inline std::string SHA_256(const std::string &s) { inline std::string SHA_512(const std::string &s) { return message_digest(s, EVP_sha512()); } -#endif -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT +inline std::pair make_digest_authentication_header( + const Request &req, const std::map &auth, + size_t cnonce_count, const std::string &cnonce, const std::string &username, + const std::string &password, bool is_proxy = false) { + std::string nc; + { + std::stringstream ss; + ss << std::setfill('0') << std::setw(8) << std::hex << cnonce_count; + nc = ss.str(); + } + + std::string qop; + if (auth.find("qop") != auth.end()) { + qop = auth.at("qop"); + if (qop.find("auth-int") != std::string::npos) { + qop = "auth-int"; + } else if (qop.find("auth") != std::string::npos) { + qop = "auth"; + } else { + qop.clear(); + } + } + + std::string algo = "MD5"; + if (auth.find("algorithm") != auth.end()) { algo = auth.at("algorithm"); } + + std::string response; + { + auto H = algo == "SHA-256" ? detail::SHA_256 + : algo == "SHA-512" ? detail::SHA_512 + : detail::MD5; + + auto A1 = username + ":" + auth.at("realm") + ":" + password; + + auto A2 = req.method + ":" + req.path; + if (qop == "auth-int") { A2 += ":" + H(req.body); } + + if (qop.empty()) { + response = H(H(A1) + ":" + auth.at("nonce") + ":" + H(A2)); + } else { + response = H(H(A1) + ":" + auth.at("nonce") + ":" + nc + ":" + cnonce + + ":" + qop + ":" + H(A2)); + } + } + + auto opaque = (auth.find("opaque") != auth.end()) ? 
auth.at("opaque") : ""; + + auto field = "Digest username=\"" + username + "\", realm=\"" + + auth.at("realm") + "\", nonce=\"" + auth.at("nonce") + + "\", uri=\"" + req.path + "\", algorithm=" + algo + + (qop.empty() ? ", response=\"" + : ", qop=" + qop + ", nc=" + nc + ", cnonce=\"" + + cnonce + "\", response=\"") + + response + "\"" + + (opaque.empty() ? "" : ", opaque=\"" + opaque + "\""); + + auto key = is_proxy ? "Proxy-Authorization" : "Authorization"; + return std::make_pair(key, field); +} + +inline bool is_ssl_peer_could_be_closed(SSL *ssl, socket_t sock) { + detail::set_nonblocking(sock, true); + auto se = detail::scope_exit([&]() { detail::set_nonblocking(sock, false); }); + + char buf[1]; + return !SSL_peek(ssl, buf, 1) && + SSL_get_error(ssl, 0) == SSL_ERROR_ZERO_RETURN; +} + #ifdef _WIN32 // NOTE: This code came up with the following stackoverflow post: // https://stackoverflow.com/questions/9507184/can-openssl-on-windows-use-the-system-certificate-store @@ -5569,68 +5669,6 @@ class WSInit { static WSInit wsinit_; #endif -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT -inline std::pair make_digest_authentication_header( - const Request &req, const std::map &auth, - size_t cnonce_count, const std::string &cnonce, const std::string &username, - const std::string &password, bool is_proxy = false) { - std::string nc; - { - std::stringstream ss; - ss << std::setfill('0') << std::setw(8) << std::hex << cnonce_count; - nc = ss.str(); - } - - std::string qop; - if (auth.find("qop") != auth.end()) { - qop = auth.at("qop"); - if (qop.find("auth-int") != std::string::npos) { - qop = "auth-int"; - } else if (qop.find("auth") != std::string::npos) { - qop = "auth"; - } else { - qop.clear(); - } - } - - std::string algo = "MD5"; - if (auth.find("algorithm") != auth.end()) { algo = auth.at("algorithm"); } - - std::string response; - { - auto H = algo == "SHA-256" ? detail::SHA_256 - : algo == "SHA-512" ? detail::SHA_512 - : detail::MD5; - - auto A1 = username + ":" + auth.at("realm") + ":" + password; - - auto A2 = req.method + ":" + req.path; - if (qop == "auth-int") { A2 += ":" + H(req.body); } - - if (qop.empty()) { - response = H(H(A1) + ":" + auth.at("nonce") + ":" + H(A2)); - } else { - response = H(H(A1) + ":" + auth.at("nonce") + ":" + nc + ":" + cnonce + - ":" + qop + ":" + H(A2)); - } - } - - auto opaque = (auth.find("opaque") != auth.end()) ? auth.at("opaque") : ""; - - auto field = "Digest username=\"" + username + "\", realm=\"" + - auth.at("realm") + "\", nonce=\"" + auth.at("nonce") + - "\", uri=\"" + req.path + "\", algorithm=" + algo + - (qop.empty() ? ", response=\"" - : ", qop=" + qop + ", nc=" + nc + ", cnonce=\"" + - cnonce + "\", response=\"") + - response + "\"" + - (opaque.empty() ? "" : ", opaque=\"" + opaque + "\""); - - auto key = is_proxy ? 
"Proxy-Authorization" : "Authorization"; - return std::make_pair(key, field); -} -#endif - inline bool parse_www_authenticate(const Response &res, std::map &auth, bool is_proxy) { @@ -5949,20 +5987,45 @@ inline ssize_t Stream::write(const std::string &s) { namespace detail { +inline void calc_actual_timeout(time_t max_timeout_msec, + time_t duration_msec, time_t timeout_sec, + time_t timeout_usec, time_t &actual_timeout_sec, + time_t &actual_timeout_usec) { + auto timeout_msec = (timeout_sec * 1000) + (timeout_usec / 1000); + + auto actual_timeout_msec = + std::min(max_timeout_msec - duration_msec, timeout_msec); + + actual_timeout_sec = actual_timeout_msec / 1000; + actual_timeout_usec = (actual_timeout_msec % 1000) * 1000; +} + // Socket stream implementation -inline SocketStream::SocketStream(socket_t sock, time_t read_timeout_sec, - time_t read_timeout_usec, - time_t write_timeout_sec, - time_t write_timeout_usec) +inline SocketStream::SocketStream( + socket_t sock, time_t read_timeout_sec, time_t read_timeout_usec, + time_t write_timeout_sec, time_t write_timeout_usec, + time_t max_timeout_msec, + std::chrono::time_point start_time) : sock_(sock), read_timeout_sec_(read_timeout_sec), read_timeout_usec_(read_timeout_usec), write_timeout_sec_(write_timeout_sec), - write_timeout_usec_(write_timeout_usec), read_buff_(read_buff_size_, 0) {} + write_timeout_usec_(write_timeout_usec), + max_timeout_msec_(max_timeout_msec), start_time(start_time), + read_buff_(read_buff_size_, 0) {} inline SocketStream::~SocketStream() = default; inline bool SocketStream::is_readable() const { - return select_read(sock_, read_timeout_sec_, read_timeout_usec_) > 0; + if (max_timeout_msec_ <= 0) { + return select_read(sock_, read_timeout_sec_, read_timeout_usec_) > 0; + } + + time_t read_timeout_sec; + time_t read_timeout_usec; + calc_actual_timeout(max_timeout_msec_, duration(), read_timeout_sec_, + read_timeout_usec_, read_timeout_sec, read_timeout_usec); + + return select_read(sock_, read_timeout_sec, read_timeout_usec) > 0; } inline bool SocketStream::is_writable() const { @@ -6039,6 +6102,12 @@ inline void SocketStream::get_local_ip_and_port(std::string &ip, inline socket_t SocketStream::socket() const { return sock_; } +inline time_t SocketStream::duration() const { + return std::chrono::duration_cast( + std::chrono::steady_clock::now() - start_time) + .count(); +} + // Buffer stream implementation inline bool BufferStream::is_readable() const { return true; } @@ -6067,6 +6136,8 @@ inline void BufferStream::get_local_ip_and_port(std::string & /*ip*/, inline socket_t BufferStream::socket() const { return 0; } +inline time_t BufferStream::duration() const { return 0; } + inline const std::string &BufferStream::get_buffer() const { return buffer; } inline PathParamsMatcher::PathParamsMatcher(const std::string &pattern) { @@ -6869,35 +6940,10 @@ inline bool Server::listen_internal() { break; } - { -#ifdef _WIN32 - auto timeout = static_cast(read_timeout_sec_ * 1000 + - read_timeout_usec_ / 1000); - setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, - reinterpret_cast(&timeout), sizeof(timeout)); -#else - timeval tv; - tv.tv_sec = static_cast(read_timeout_sec_); - tv.tv_usec = static_cast(read_timeout_usec_); - setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, - reinterpret_cast(&tv), sizeof(tv)); -#endif - } - { - -#ifdef _WIN32 - auto timeout = static_cast(write_timeout_sec_ * 1000 + - write_timeout_usec_ / 1000); - setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, - reinterpret_cast(&timeout), sizeof(timeout)); -#else - timeval tv; 
@@ -6039,6 +6102,12 @@ inline void SocketStream::get_local_ip_and_port(std::string &ip,
 
 inline socket_t SocketStream::socket() const { return sock_; }
 
+inline time_t SocketStream::duration() const {
+  return std::chrono::duration_cast<std::chrono::milliseconds>(
+             std::chrono::steady_clock::now() - start_time)
+      .count();
+}
+
 // Buffer stream implementation
 inline bool BufferStream::is_readable() const { return true; }
 
@@ -6067,6 +6136,8 @@ inline void BufferStream::get_local_ip_and_port(std::string & /*ip*/,
 
 inline socket_t BufferStream::socket() const { return 0; }
 
+inline time_t BufferStream::duration() const { return 0; }
+
 inline const std::string &BufferStream::get_buffer() const { return buffer; }
 
 inline PathParamsMatcher::PathParamsMatcher(const std::string &pattern) {
@@ -6869,35 +6940,10 @@ inline bool Server::listen_internal() {
       break;
     }
 
-    {
-#ifdef _WIN32
-      auto timeout = static_cast<uint32_t>(read_timeout_sec_ * 1000 +
-                                           read_timeout_usec_ / 1000);
-      setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO,
-                 reinterpret_cast<const char *>(&timeout), sizeof(timeout));
-#else
-      timeval tv;
-      tv.tv_sec = static_cast<long>(read_timeout_sec_);
-      tv.tv_usec = static_cast<decltype(tv.tv_usec)>(read_timeout_usec_);
-      setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO,
-                 reinterpret_cast<const void *>(&tv), sizeof(tv));
-#endif
-    }
-    {
-
-#ifdef _WIN32
-      auto timeout = static_cast<uint32_t>(write_timeout_sec_ * 1000 +
-                                           write_timeout_usec_ / 1000);
-      setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO,
-                 reinterpret_cast<const char *>(&timeout), sizeof(timeout));
-#else
-      timeval tv;
-      tv.tv_sec = static_cast<long>(write_timeout_sec_);
-      tv.tv_usec = static_cast<decltype(tv.tv_usec)>(write_timeout_usec_);
-      setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO,
-                 reinterpret_cast<const void *>(&tv), sizeof(tv));
-#endif
-    }
+    detail::set_socket_opt_time(sock, SOL_SOCKET, SO_RCVTIMEO,
+                                read_timeout_sec_, read_timeout_usec_);
+    detail::set_socket_opt_time(sock, SOL_SOCKET, SO_SNDTIMEO,
+                                write_timeout_sec_, write_timeout_usec_);
 
     if (!task_queue->enqueue(
             [this, sock]() { process_and_close_socket(sock); })) {
@@ -7157,14 +7203,6 @@ Server::process_request(Stream &strm, const std::string &remote_addr,
 #endif
 #endif
 
-  // Check if the request URI doesn't exceed the limit
-  if (line_reader.size() > CPPHTTPLIB_REQUEST_URI_MAX_LENGTH) {
-    Headers dummy;
-    detail::read_headers(strm, dummy);
-    res.status = StatusCode::UriTooLong_414;
-    return write_response(strm, close_connection, req, res);
-  }
-
   // Request line and headers
   if (!parse_request_line(line_reader.ptr(), req) ||
       !detail::read_headers(strm, req.headers)) {
@@ -7172,6 +7210,14 @@ Server::process_request(Stream &strm, const std::string &remote_addr,
     return write_response(strm, close_connection, req, res);
   }
 
+  // Check if the request URI doesn't exceed the limit
+  if (req.target.size() > CPPHTTPLIB_REQUEST_URI_MAX_LENGTH) {
+    Headers dummy;
+    detail::read_headers(strm, dummy);
+    res.status = StatusCode::UriTooLong_414;
+    return write_response(strm, close_connection, req, res);
+  }
+
   if (req.get_header_value("Connection") == "close") {
     connection_closed = true;
   }
@@ -7363,6 +7409,7 @@ inline void ClientImpl::copy_settings(const ClientImpl &rhs) {
   read_timeout_usec_ = rhs.read_timeout_usec_;
   write_timeout_sec_ = rhs.write_timeout_sec_;
   write_timeout_usec_ = rhs.write_timeout_usec_;
+  max_timeout_msec_ = rhs.max_timeout_msec_;
   basic_auth_username_ = rhs.basic_auth_username_;
   basic_auth_password_ = rhs.basic_auth_password_;
   bearer_token_auth_token_ = rhs.bearer_token_auth_token_;
@@ -7509,18 +7556,6 @@ inline bool ClientImpl::send(Request &req, Response &res, Error &error) {
   return ret;
 }
 
-#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-inline bool ClientImpl::is_ssl_peer_could_be_closed(SSL *ssl) const {
-  detail::set_nonblocking(socket_.sock, true);
-  auto se = detail::scope_exit(
-      [&]() { detail::set_nonblocking(socket_.sock, false); });
-
-  char buf[1];
-  return !SSL_peek(ssl, buf, 1) &&
-         SSL_get_error(ssl, 0) == SSL_ERROR_ZERO_RETURN;
-}
-#endif
-
 inline bool ClientImpl::send_(Request &req, Response &res, Error &error) {
   {
     std::lock_guard<std::mutex> guard(socket_mutex_);
@@ -7535,7 +7570,9 @@ inline bool ClientImpl::send_(Request &req, Response &res, Error &error) {
 
 #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
     if (is_alive && is_ssl()) {
-      if (is_ssl_peer_could_be_closed(socket_.ssl)) { is_alive = false; }
+      if (detail::is_ssl_peer_could_be_closed(socket_.ssl, socket_.sock)) {
+        is_alive = false;
+      }
     }
 #endif
 
@@ -7560,7 +7597,8 @@ inline bool ClientImpl::send_(Request &req, Response &res, Error &error) {
       auto &scli = static_cast<SSLClient &>(*this);
       if (!proxy_host_.empty() && proxy_port_ != -1) {
         auto success = false;
-        if (!scli.connect_with_proxy(socket_, res, success, error)) {
+        if (!scli.connect_with_proxy(socket_, req.start_time_, res, success,
+                                     error)) {
           return success;
         }
       }
@@ -7606,7 +7644,7 @@ inline bool ClientImpl::send_(Request &req, Response &res, Error &error) {
     }
   });
 
-  ret = process_socket(socket_, [&](Stream &strm) {
+  ret = process_socket(socket_, req.start_time_, [&](Stream &strm) {
     return handle_request(strm, req, res, close_connection, error);
   });
 
@@ -8015,6 +8053,9 @@ inline Result
 ClientImpl::send_with_content_provider(
   req.headers = headers;
   req.path = path;
   req.progress = progress;
+  if (max_timeout_msec_ > 0) {
+    req.start_time_ = std::chrono::steady_clock::now();
+  }
 
   auto error = Error::Success;
 
@@ -8041,7 +8082,7 @@ inline bool ClientImpl::process_request(Stream &strm, Request &req,
   if (is_ssl()) {
     auto is_proxy_enabled = !proxy_host_.empty() && proxy_port_ != -1;
     if (!is_proxy_enabled) {
-      if (is_ssl_peer_could_be_closed(socket_.ssl)) {
+      if (detail::is_ssl_peer_could_be_closed(socket_.ssl, socket_.sock)) {
         error = Error::SSLPeerCouldBeClosed_;
         return false;
       }
@@ -8166,12 +8207,14 @@ inline ContentProviderWithoutLength ClientImpl::get_multipart_content_provider(
   };
 }
 
-inline bool
-ClientImpl::process_socket(const Socket &socket,
-                           std::function<bool(Stream &strm)> callback) {
+inline bool ClientImpl::process_socket(
+    const Socket &socket,
+    std::chrono::time_point<std::chrono::steady_clock> start_time,
+    std::function<bool(Stream &strm)> callback) {
   return detail::process_client_socket(
       socket.sock, read_timeout_sec_, read_timeout_usec_, write_timeout_sec_,
-      write_timeout_usec_, std::move(callback));
+      write_timeout_usec_, max_timeout_msec_, start_time,
+      std::move(callback));
 }
 
 inline bool ClientImpl::is_ssl() const { return false; }
@@ -8195,6 +8238,9 @@ inline Result ClientImpl::Get(const std::string &path, const Headers &headers,
   req.path = path;
   req.headers = headers;
   req.progress = std::move(progress);
+  if (max_timeout_msec_ > 0) {
+    req.start_time_ = std::chrono::steady_clock::now();
+  }
 
   return send_(std::move(req));
 }
@@ -8260,6 +8306,9 @@ inline Result ClientImpl::Get(const std::string &path, const Headers &headers,
     return content_receiver(data, data_length);
   };
   req.progress = std::move(progress);
+  if (max_timeout_msec_ > 0) {
+    req.start_time_ = std::chrono::steady_clock::now();
+  }
 
   return send_(std::move(req));
 }
@@ -8305,6 +8354,9 @@ inline Result ClientImpl::Head(const std::string &path,
   req.method = "HEAD";
   req.headers = headers;
   req.path = path;
+  if (max_timeout_msec_ > 0) {
+    req.start_time_ = std::chrono::steady_clock::now();
+  }
 
   return send_(std::move(req));
 }
@@ -8722,6 +8774,9 @@ inline Result ClientImpl::Delete(const std::string &path,
   req.headers = headers;
   req.path = path;
   req.progress = progress;
+  if (max_timeout_msec_ > 0) {
+    req.start_time_ = std::chrono::steady_clock::now();
+  }
 
   if (!content_type.empty()) { req.set_header("Content-Type", content_type); }
   req.body.assign(body, content_length);
@@ -8769,6 +8824,9 @@ inline Result ClientImpl::Options(const std::string &path,
   req.method = "OPTIONS";
   req.headers = headers;
   req.path = path;
+  if (max_timeout_msec_ > 0) {
+    req.start_time_ = std::chrono::steady_clock::now();
+  }
 
   return send_(std::move(req));
 }
@@ -8822,6 +8880,10 @@ inline void ClientImpl::set_write_timeout(time_t sec, time_t usec) {
   write_timeout_usec_ = usec;
 }
 
+inline void ClientImpl::set_max_timeout(time_t msec) {
+  max_timeout_msec_ = msec;
+}
+
 inline void ClientImpl::set_basic_auth(const std::string &username,
                                        const std::string &password) {
   basic_auth_username_ = username;
@@ -9004,11 +9066,7 @@ inline void ssl_delete(std::mutex &ctx_mutex, SSL *ssl, socket_t sock,
   (void)(sock);
   SSL_shutdown(ssl);
 #else
-  timeval tv;
-  tv.tv_sec = 1;
-  tv.tv_usec = 0;
-  setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO,
-             reinterpret_cast<const void *>(&tv), sizeof(tv));
+  detail::set_socket_opt_time(sock, SOL_SOCKET, SO_RCVTIMEO, 1, 0);
 
   auto ret = SSL_shutdown(ssl);
   while (ret == 0) {
@@ -9060,12 +9118,14 @@ inline bool process_server_socket_ssl(
 }
 
 template <typename T>
-inline bool
-process_client_socket_ssl(SSL *ssl, socket_t sock, time_t read_timeout_sec,
-                          time_t read_timeout_usec, time_t write_timeout_sec,
-                          time_t write_timeout_usec, T callback) {
+inline bool process_client_socket_ssl(
+    SSL *ssl, socket_t sock, time_t read_timeout_sec, time_t read_timeout_usec,
+    time_t write_timeout_sec, time_t write_timeout_usec,
+    time_t max_timeout_msec,
+    std::chrono::time_point<std::chrono::steady_clock> start_time, T callback) {
   SSLSocketStream strm(sock, ssl, read_timeout_sec, read_timeout_usec,
-                       write_timeout_sec, write_timeout_usec);
+                       write_timeout_sec, write_timeout_usec,
+                       max_timeout_msec, start_time);
   return callback(strm);
 }
 
@@ -9078,27 +9138,37 @@ class SSLInit {
 };
 
 // SSL socket stream implementation
-inline SSLSocketStream::SSLSocketStream(socket_t sock, SSL *ssl,
-                                        time_t read_timeout_sec,
-                                        time_t read_timeout_usec,
-                                        time_t write_timeout_sec,
-                                        time_t write_timeout_usec)
+inline SSLSocketStream::SSLSocketStream(
+    socket_t sock, SSL *ssl, time_t read_timeout_sec, time_t read_timeout_usec,
+    time_t write_timeout_sec, time_t write_timeout_usec,
+    time_t max_timeout_msec,
+    std::chrono::time_point<std::chrono::steady_clock> start_time)
    : sock_(sock), ssl_(ssl), read_timeout_sec_(read_timeout_sec),
      read_timeout_usec_(read_timeout_usec),
      write_timeout_sec_(write_timeout_sec),
-      write_timeout_usec_(write_timeout_usec) {
+      write_timeout_usec_(write_timeout_usec),
+      max_timeout_msec_(max_timeout_msec), start_time(start_time) {
   SSL_clear_mode(ssl, SSL_MODE_AUTO_RETRY);
 }
 
 inline SSLSocketStream::~SSLSocketStream() = default;
 
 inline bool SSLSocketStream::is_readable() const {
-  return detail::select_read(sock_, read_timeout_sec_, read_timeout_usec_) > 0;
+  if (max_timeout_msec_ <= 0) {
+    return select_read(sock_, read_timeout_sec_, read_timeout_usec_) > 0;
+  }
+
+  time_t read_timeout_sec;
+  time_t read_timeout_usec;
+  calc_actual_timeout(max_timeout_msec_, duration(), read_timeout_sec_,
+                      read_timeout_usec_, read_timeout_sec, read_timeout_usec);
+
+  return select_read(sock_, read_timeout_sec, read_timeout_usec) > 0;
 }
 
 inline bool SSLSocketStream::is_writable() const {
   return select_write(sock_, write_timeout_sec_, write_timeout_usec_) > 0 &&
-         is_socket_alive(sock_);
+         is_socket_alive(sock_) && !is_ssl_peer_could_be_closed(ssl_, sock_);
 }
 
 inline ssize_t SSLSocketStream::read(char *ptr, size_t size) {
@@ -9129,8 +9199,9 @@ inline ssize_t SSLSocketStream::read(char *ptr, size_t size) {
       }
     }
     return ret;
+  } else {
+    return -1;
   }
-  return -1;
 }
 
 inline ssize_t SSLSocketStream::write(const char *ptr, size_t size) {
@@ -9176,6 +9247,12 @@ inline void SSLSocketStream::get_local_ip_and_port(std::string &ip,
 
 inline socket_t SSLSocketStream::socket() const { return sock_; }
 
+inline time_t SSLSocketStream::duration() const {
+  return std::chrono::duration_cast<std::chrono::milliseconds>(
+             std::chrono::steady_clock::now() - start_time)
+      .count();
+}
+
 static SSLInit sslinit_;
 
 } // namespace detail
@@ -9416,16 +9493,22 @@ inline bool SSLClient::create_and_connect_socket(Socket &socket, Error &error) {
 }
 
 // Assumes that socket_mutex_ is locked and that there are no requests in flight
-inline bool SSLClient::connect_with_proxy(Socket &socket, Response &res,
-                                          bool &success, Error &error) {
+inline bool SSLClient::connect_with_proxy(
+    Socket &socket,
+    std::chrono::time_point<std::chrono::steady_clock> start_time,
+    Response &res, bool &success, Error &error) {
   success = true;
   Response proxy_res;
   if (!detail::process_client_socket(
          socket.sock, read_timeout_sec_, read_timeout_usec_,
-          write_timeout_sec_, write_timeout_usec_, [&](Stream &strm) {
+          write_timeout_sec_, write_timeout_usec_, max_timeout_msec_,
+          start_time, [&](Stream &strm) {
            Request req2;
            req2.method = "CONNECT";
            req2.path = host_and_port_;
+            if (max_timeout_msec_ > 0) {
+              req2.start_time_ = std::chrono::steady_clock::now();
+            }
            return process_request(strm, req2, proxy_res, false, error);
          })) {
    // Thread-safe to close everything because we are assuming there are no
@@ -9445,7 +9528,8 @@ inline bool SSLClient::connect_with_proxy(Socket &socket, Response &res,
     proxy_res = Response();
     if (!detail::process_client_socket(
            socket.sock, read_timeout_sec_, read_timeout_usec_,
-            write_timeout_sec_, write_timeout_usec_, [&](Stream &strm) {
+            write_timeout_sec_, write_timeout_usec_, max_timeout_msec_,
+            start_time, [&](Stream &strm) {
              Request req3;
              req3.method = "CONNECT";
              req3.path = host_and_port_;
@@ -9453,6 +9537,9 @@ inline bool SSLClient::connect_with_proxy(Socket &socket, Response &res,
                  req3, auth, 1, detail::random_string(10),
                  proxy_digest_auth_username_, proxy_digest_auth_password_,
                  true));
+              if (max_timeout_msec_ > 0) {
+                req3.start_time_ = std::chrono::steady_clock::now();
+              }
              return process_request(strm, req3, proxy_res, false, error);
            })) {
      // Thread-safe to close everything because we are assuming there are
@@ -9608,13 +9695,15 @@ inline void SSLClient::shutdown_ssl_impl(Socket &socket,
   assert(socket.ssl == nullptr);
 }
 
-inline bool
-SSLClient::process_socket(const Socket &socket,
-                          std::function<bool(Stream &strm)> callback) {
+inline bool SSLClient::process_socket(
+    const Socket &socket,
+    std::chrono::time_point<std::chrono::steady_clock> start_time,
+    std::function<bool(Stream &strm)> callback) {
   assert(socket.ssl);
   return detail::process_client_socket_ssl(
       socket.ssl, socket.sock, read_timeout_sec_, read_timeout_usec_,
-      write_timeout_sec_, write_timeout_usec_, std::move(callback));
+      write_timeout_sec_, write_timeout_usec_, max_timeout_msec_, start_time,
+      std::move(callback));
 }
 
 inline bool SSLClient::is_ssl() const { return true; }
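Taken together, these httplib hunks let a client bound the wall-clock time of an entire request instead of only individual socket operations. A minimal usage sketch, assuming the public `httplib::Client` wrapper forwards `set_max_timeout()` to `ClientImpl` the same way it forwards the other timeout setters; the URL and path are placeholders:

```cpp
#include "httplib.h"
#include <cstdio>

int main() {
  httplib::Client cli("http://localhost:8080"); // placeholder local server
  cli.set_read_timeout(300, 0);                 // per select() call
  cli.set_max_timeout(10 * 1000);               // whole exchange capped at ~10 s (assumed wrapper forwarding)

  if (auto res = cli.Get("/health")) {
    std::printf("status: %d\n", res->status);
  } else {
    std::printf("request failed: %d\n", static_cast<int>(res.error()));
  }
  return 0;
}
```

If `set_max_timeout` is left at its default of 0, the new code paths above fall back to the existing per-operation timeouts.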
json_value(data, "json_schema", json::object()); @@ -1807,7 +1804,7 @@ struct server_context { // Necessary similarity of prompt for slot selection float slot_prompt_similarity = 0.0f; - common_chat_templates chat_templates; + common_chat_templates_ptr chat_templates; ~server_context() { // Clear any sampling context @@ -1891,45 +1888,17 @@ struct server_context { llama_init_dft.context.reset(); } - if (params_base.chat_template.empty() && !validate_builtin_chat_template(params.use_jinja)) { + chat_templates = common_chat_templates_init(model, params_base.chat_template); + try { + common_chat_format_example(chat_templates.get(), params.use_jinja); + } catch (const std::exception & e) { SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__); - chat_templates = common_chat_templates_from_model(model, "chatml"); - } else { - chat_templates = common_chat_templates_from_model(model, params_base.chat_template); + chat_templates = common_chat_templates_init(model, "chatml"); } - GGML_ASSERT(chat_templates.template_default.get() != nullptr); return true; } - bool validate_builtin_chat_template(bool use_jinja) const { - llama_chat_message chat[] = {{"user", "test"}}; - - if (use_jinja) { - auto templates = common_chat_templates_from_model(model, ""); - common_chat_inputs inputs; - inputs.messages = json::array({{ - {"role", "user"}, - {"content", "test"}, - }}); - GGML_ASSERT(templates.template_default); - try { - common_chat_params_init(*templates.template_default, inputs); - if (templates.template_tool_use) { - common_chat_params_init(*templates.template_tool_use, inputs); - } - return true; - } catch (const std::exception & e) { - SRV_ERR("failed to apply template: %s\n", e.what()); - return false; - } - } else { - const char * tmpl = llama_model_chat_template(model, /* name */ nullptr); - const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0); - return chat_res > 0; - } - } - void init() { const int32_t n_ctx_slot = n_ctx / params_base.n_parallel; @@ -3656,7 +3625,7 @@ int main(int argc, char ** argv) { }, { {"name", "n_busy_slots_per_decode"}, {"help", "Average number of busy slots per llama_decode() call"}, - {"value", (float) res_metrics->n_busy_slots_total / (float) res_metrics->n_decode_total} + {"value", (float) res_metrics->n_busy_slots_total / std::max((float) res_metrics->n_decode_total, 1.f)} }}}, {"gauge", {{ {"name", "prompt_tokens_seconds"}, @@ -3822,13 +3791,15 @@ int main(int argc, char ** argv) { { "default_generation_settings", ctx_server.default_generation_settings_for_props }, { "total_slots", ctx_server.params_base.n_parallel }, { "model_path", ctx_server.params_base.model }, - { "chat_template", ctx_server.chat_templates.template_default->source() }, - { "bos_token", ctx_server.chat_templates.template_default->bos_token() }, - { "eos_token", ctx_server.chat_templates.template_default->eos_token() }, + { "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) }, + { "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)}, + { "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)}, { "build_info", build_info }, }; - if (ctx_server.params_base.use_jinja && ctx_server.chat_templates.template_tool_use) { - data["chat_template_tool_use"] = ctx_server.chat_templates.template_tool_use->source(); + if 
(ctx_server.params_base.use_jinja) { + if (auto tool_use_src = common_chat_templates_source(ctx_server.chat_templates.get(), "tool_use")) { + data["chat_template_tool_use"] = tool_use_src; + } } res_ok(res, data); @@ -4063,7 +4034,7 @@ int main(int argc, char ** argv) { } auto body = json::parse(req.body); - json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates); + json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get()); return handle_completions_impl( SERVER_TASK_TYPE_COMPLETION, @@ -4076,7 +4047,7 @@ int main(int argc, char ** argv) { // same with handle_chat_completions, but without inference part const auto handle_apply_template = [&ctx_server, ¶ms, &res_ok](const httplib::Request & req, httplib::Response & res) { auto body = json::parse(req.body); - json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates); + json data = oaicompat_completion_params_parse(body, params.use_jinja, params.reasoning_format, ctx_server.chat_templates.get()); res_ok(res, {{ "prompt", std::move(data.at("prompt")) }}); }; @@ -4263,6 +4234,11 @@ int main(int argc, char ** argv) { // return; //} + // if true, use TEI API format, otherwise use Jina API format + // Jina: https://jina.ai/reranker/ + // TEI: https://huggingface.github.io/text-embeddings-inference/#/Text%20Embeddings%20Inference/rerank + bool is_tei_format = body.contains("texts"); + json query; if (body.count("query") == 1) { query = body.at("query"); @@ -4275,7 +4251,8 @@ int main(int argc, char ** argv) { return; } - std::vector documents = json_value(body, "documents", std::vector()); + std::vector documents = json_value(body, "documents", + json_value(body, "texts", std::vector())); if (documents.empty()) { res_error(res, format_error_response("\"documents\" must be a non-empty string array", ERROR_TYPE_INVALID_REQUEST)); return; @@ -4320,7 +4297,12 @@ int main(int argc, char ** argv) { } // write JSON response - json root = format_response_rerank(body, responses); + json root = format_response_rerank( + body, + responses, + is_tei_format, + documents); + res_ok(res, root); }; @@ -4482,8 +4464,8 @@ int main(int argc, char ** argv) { // print sample chat example to make it clear which template is used LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__, - ctx_server.chat_templates.template_default->source().c_str(), - common_chat_format_example(*ctx_server.chat_templates.template_default, ctx_server.params_base.use_jinja).c_str()); + common_chat_templates_source(ctx_server.chat_templates.get()), + common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params_base.use_jinja).c_str()); ctx_server.queue_tasks.on_new_task([&ctx_server](const server_task & task) { ctx_server.process_single_task(task); diff --git a/examples/server/tests/README.md b/examples/server/tests/README.md index 1de0eb30e871e..12816bfa8705b 100644 --- a/examples/server/tests/README.md +++ b/examples/server/tests/README.md @@ -48,7 +48,7 @@ DEBUG=1 ./tests.sh -s -v -x To run all the tests in a file: ```shell -./tests.sh unit/test_chat_completion.py.py -v -x +./tests.sh unit/test_chat_completion.py -v -x ``` To run a single test: diff --git a/examples/server/tests/unit/test_chat_completion.py b/examples/server/tests/unit/test_chat_completion.py index f23d5cff49abc..af1dcb5b96554 100644 --- 
a/examples/server/tests/unit/test_chat_completion.py +++ b/examples/server/tests/unit/test_chat_completion.py @@ -21,6 +21,8 @@ def create_server(): (None, "Book", "What is the best book", 8, "^ blue", 23, 8, "length", True, "This is not a chat template, it is"), ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", False, None), ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", True, None), + (None, "Book", [{"type": "text", "text": "What is"}, {"type": "text", "text": "the best book"}], 8, "Whillicter", 79, 8, "length", False, None), + (None, "Book", [{"type": "text", "text": "What is"}, {"type": "text", "text": "the best book"}], 8, "Whillicter", 79, 8, "length", True, None), ] ) def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason, jinja, chat_template): @@ -44,7 +46,7 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte assert res.body["usage"]["completion_tokens"] == n_predicted choice = res.body["choices"][0] assert "assistant" == choice["message"]["role"] - assert match_regex(re_content, choice["message"]["content"]) + assert match_regex(re_content, choice["message"]["content"]), f'Expected {re_content}, got {choice["message"]["content"]}' assert choice["finish_reason"] == finish_reason @@ -169,6 +171,47 @@ def test_completion_with_response_format(response_format: dict, n_predicted: int assert "error" in res.body +@pytest.mark.parametrize("jinja,json_schema,n_predicted,re_content", [ + (False, {"const": "42"}, 6, "\"42\""), + (True, {"const": "42"}, 6, "\"42\""), +]) +def test_completion_with_json_schema(jinja: bool, json_schema: dict, n_predicted: int, re_content: str): + global server + server.jinja = jinja + server.start() + res = server.make_request("POST", "/chat/completions", data={ + "max_tokens": n_predicted, + "messages": [ + {"role": "system", "content": "You are a coding assistant."}, + {"role": "user", "content": "Write an example"}, + ], + "json_schema": json_schema, + }) + assert res.status_code == 200, f'Expected 200, got {res.status_code}' + choice = res.body["choices"][0] + assert match_regex(re_content, choice["message"]["content"]), f'Expected {re_content}, got {choice["message"]["content"]}' + + +@pytest.mark.parametrize("jinja,grammar,n_predicted,re_content", [ + (False, 'root ::= "a"{5,5}', 6, "a{5,5}"), + (True, 'root ::= "a"{5,5}', 6, "a{5,5}"), +]) +def test_completion_with_grammar(jinja: bool, grammar: str, n_predicted: int, re_content: str): + global server + server.jinja = jinja + server.start() + res = server.make_request("POST", "/chat/completions", data={ + "max_tokens": n_predicted, + "messages": [ + {"role": "user", "content": "Does not matter what I say, does it?"}, + ], + "grammar": grammar, + }) + assert res.status_code == 200, res.body + choice = res.body["choices"][0] + assert match_regex(re_content, choice["message"]["content"]), choice["message"]["content"] + + @pytest.mark.parametrize("messages", [ None, "string", diff --git a/examples/server/tests/unit/test_rerank.py b/examples/server/tests/unit/test_rerank.py index 7203d79435702..f4f570ad5ef78 100644 --- a/examples/server/tests/unit/test_rerank.py +++ b/examples/server/tests/unit/test_rerank.py @@ -10,17 +10,20 @@ def create_server(): server = ServerPreset.jina_reranker_tiny() +TEST_DOCUMENTS = [ + "A machine is a 
physical system that uses power to apply forces and control movement to perform an action. The term is commonly applied to artificial devices, such as those employing engines or motors, but also to natural biological macromolecules, such as molecular machines.", + "Learning is the process of acquiring new understanding, knowledge, behaviors, skills, values, attitudes, and preferences. The ability to learn is possessed by humans, non-human animals, and some machines; there is also evidence for some kind of learning in certain plants.", + "Machine learning is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.", + "Paris, capitale de la France, est une grande ville européenne et un centre mondial de l'art, de la mode, de la gastronomie et de la culture. Son paysage urbain du XIXe siècle est traversé par de larges boulevards et la Seine." +] + + def test_rerank(): global server server.start() res = server.make_request("POST", "/rerank", data={ "query": "Machine learning is", - "documents": [ - "A machine is a physical system that uses power to apply forces and control movement to perform an action. The term is commonly applied to artificial devices, such as those employing engines or motors, but also to natural biological macromolecules, such as molecular machines.", - "Learning is the process of acquiring new understanding, knowledge, behaviors, skills, values, attitudes, and preferences. The ability to learn is possessed by humans, non-human animals, and some machines; there is also evidence for some kind of learning in certain plants.", - "Machine learning is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.", - "Paris, capitale de la France, est une grande ville européenne et un centre mondial de l'art, de la mode, de la gastronomie et de la culture. Son paysage urbain du XIXe siècle est traversé par de larges boulevards et la Seine." 
- ] + "documents": TEST_DOCUMENTS, }) assert res.status_code == 200 assert len(res.body["results"]) == 4 @@ -38,6 +41,29 @@ def test_rerank(): assert least_relevant["index"] == 3 +def test_rerank_tei_format(): + global server + server.start() + res = server.make_request("POST", "/rerank", data={ + "query": "Machine learning is", + "texts": TEST_DOCUMENTS, + }) + assert res.status_code == 200 + assert len(res.body) == 4 + + most_relevant = res.body[0] + least_relevant = res.body[0] + for doc in res.body: + if doc["score"] > most_relevant["score"]: + most_relevant = doc + if doc["score"] < least_relevant["score"]: + least_relevant = doc + + assert most_relevant["score"] > least_relevant["score"] + assert most_relevant["index"] == 2 + assert least_relevant["index"] == 3 + + @pytest.mark.parametrize("documents", [ [], None, diff --git a/examples/server/tests/unit/test_tool_call.py b/examples/server/tests/unit/test_tool_call.py index ba3367b4f332d..a91a2f3333ca3 100644 --- a/examples/server/tests/unit/test_tool_call.py +++ b/examples/server/tests/unit/test_tool_call.py @@ -356,12 +356,12 @@ def test_weather(hf_repo: str, template_override: str | Tuple[str, str | None] | (None, 128, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)), (None, 128, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), (None, 128, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), - ("^> 0.56$", 128, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"), + (None, 128, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"), (None, 128, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), # TODO: fix these (wrong results, either didn't respect decimal instruction or got wrong value) - ("^The y-coordinate [\\s\\S]*?\\*\\*0.5\\*\\*", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), - ("[\\s\\S]*?\\*\\*0\\.5\\*\\*", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)), + ("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), + # ("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)), ]) def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str, template_override: str | Tuple[str, str | None] | None): global server @@ -401,7 +401,7 @@ def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str, { "role": "tool", "name": "calculate", - "content": 0.55644242476, + "content": "0.55644242476", "tool_call_id": "call_6789" } ], @@ -444,7 +444,7 @@ def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str, (128, None, "^The sum of 102 and 7 is 109.*", None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), (1024, 'deepseek', "To find the sum of.*", "I need to calculate the sum of 102 and 7.*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), - (1024, 'none', "\n?I need[\\s\\S]*?\n?To find.*", None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), + (1024, 'none', "^I need[\\s\\S]*?\n?To find.*", None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), (1024, 'deepseek', "To find the sum of.*", "First, I [\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)), ]) diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 86de0e6d78977..4db4c783afd87 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -7,14 +7,14 @@ 
// increase max payload length to allow use of larger context size #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576 +// disable Nagle's algorithm +#define CPPHTTPLIB_TCP_NODELAY true #include "httplib.h" // Change JSON_ASSERT from assert() to GGML_ASSERT: #define JSON_ASSERT GGML_ASSERT #include "json.hpp" -#include "minja.hpp" -#include "chat.hpp" -#include "chat-template.hpp" +#include "chat.h" #include #include @@ -347,41 +347,6 @@ static llama_tokens format_infill( return embd_inp; } -// Format given chat. If tmpl is empty, we take the template from model metadata -inline std::string format_chat(const common_chat_template & tmpl, const std::vector & messages) { - std::vector chat; - - for (size_t i = 0; i < messages.size(); ++i) { - const auto & curr_msg = messages[i]; - - std::string role = json_value(curr_msg, "role", std::string("")); - - std::string content; - if (curr_msg.contains("content")) { - if (curr_msg["content"].is_string()) { - content = curr_msg["content"].get(); - } else if (curr_msg["content"].is_array()) { - for (const auto & part : curr_msg["content"]) { - if (part.contains("text")) { - content += "\n" + part["text"].get(); - } - } - } else { - throw std::runtime_error("Invalid 'content' type (ref: https://github.com/ggerganov/llama.cpp/issues/8367)"); - } - } else { - throw std::runtime_error("Missing 'content' (ref: https://github.com/ggerganov/llama.cpp/issues/8367)"); - } - - chat.push_back({role, content, /* tool_calls= */ {}}); - } - - const auto formatted_chat = common_chat_apply_template(tmpl, chat, true, /* use_jinja= */ false); - LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str()); - - return formatted_chat; -} - // // base64 utils (TODO: move to common in the future) // @@ -579,12 +544,9 @@ static json oaicompat_completion_params_parse( const json & body, /* openai api json semantics */ bool use_jinja, common_reasoning_format reasoning_format, - const common_chat_templates & chat_templates) + const struct common_chat_templates * tmpls) { json llama_params; - const auto & tmpl = body.contains("tools") && chat_templates.template_tool_use - ? 
*chat_templates.template_tool_use - : *chat_templates.template_default; auto tools = json_value(body, "tools", json()); auto stream = json_value(body, "stream", false); @@ -610,62 +572,58 @@ static json oaicompat_completion_params_parse( llama_params["stop"] = json_value(body, "stop", json::array()); } + auto json_schema = json_value(body, "json_schema", json()); + auto grammar = json_value(body, "grammar", std::string()); + if (!json_schema.is_null() && !grammar.empty()) { + throw std::runtime_error("Cannot use both json_schema and grammar"); + } + // Handle "response_format" field if (body.contains("response_format")) { json response_format = json_value(body, "response_format", json::object()); std::string response_type = json_value(response_format, "type", std::string()); if (response_type == "json_object") { - llama_params["json_schema"] = json_value(response_format, "schema", json::object()); + json_schema = json_value(response_format, "schema", json::object()); } else if (response_type == "json_schema") { json json_schema = json_value(response_format, "json_schema", json::object()); - llama_params["json_schema"] = json_value(json_schema, "schema", json::object()); + json_schema = json_value(json_schema, "schema", json::object()); } else if (!response_type.empty() && response_type != "text") { throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type); } } + common_chat_templates_inputs inputs; + inputs.messages = common_chat_msgs_parse_oaicompat(body.at("messages")); + inputs.tools = common_chat_tools_parse_oaicompat(tools); + inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(json_value(body, "tool_choice", std::string("auto"))); + inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump(); + inputs.grammar = grammar; + inputs.add_generation_prompt = true; + inputs.use_jinja = use_jinja; + inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false); + inputs.extract_reasoning = reasoning_format != COMMON_REASONING_FORMAT_NONE; + if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) { + throw std::runtime_error("Cannot use custom grammar constraints with tools."); + } + // Apply chat template to the list of messages - if (use_jinja) { - auto tool_choice = json_value(body, "tool_choice", std::string("auto")); - if (tool_choice != "none" && tool_choice != "auto" && tool_choice != "required") { - throw std::runtime_error("Invalid tool_choice: " + tool_choice); - } - if (tool_choice != "none" && llama_params.contains("grammar")) { - throw std::runtime_error("Cannot use custom grammar constraints with tools."); - } - common_chat_inputs inputs; - inputs.extract_reasoning = reasoning_format != COMMON_REASONING_FORMAT_NONE; - inputs.messages = body.at("messages"); - inputs.tools = tools; - inputs.tool_choice = tool_choice; - inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false); - if (inputs.parallel_tool_calls && !tmpl.original_caps().supports_parallel_tool_calls) { - LOG_DBG("Disabling parallel_tool_calls because the template does not support it\n"); - inputs.parallel_tool_calls = false; - } - inputs.stream = stream; - // TODO: support mixing schema w/ tools beyond generic format. 
-        inputs.json_schema = json_value(llama_params, "json_schema", json());
-        auto chat_params = common_chat_params_init(tmpl, inputs);
-
-        llama_params["chat_format"] = static_cast<int>(chat_params.format);
-        llama_params["prompt"] = chat_params.prompt;
-        llama_params["grammar"] = chat_params.grammar;
-        llama_params["grammar_lazy"] = chat_params.grammar_lazy;
-        auto grammar_triggers = json::array();
-        for (const auto & trigger : chat_params.grammar_triggers) {
-            grammar_triggers.push_back({
-                {"word", trigger.word},
-                {"at_start", trigger.at_start},
-            });
-        }
-        llama_params["grammar_triggers"] = grammar_triggers;
-        llama_params["preserved_tokens"] = chat_params.preserved_tokens;
-        for (const auto & stop : chat_params.additional_stops) {
-            llama_params["stop"].push_back(stop);
-        }
-    } else {
-        llama_params["prompt"] = format_chat(tmpl, body.at("messages"));
+    auto chat_params = common_chat_templates_apply(tmpls, inputs);
+
+    llama_params["chat_format"] = static_cast<int>(chat_params.format);
+    llama_params["prompt"] = chat_params.prompt;
+    llama_params["grammar"] = chat_params.grammar;
+    llama_params["grammar_lazy"] = chat_params.grammar_lazy;
+    auto grammar_triggers = json::array();
+    for (const auto & trigger : chat_params.grammar_triggers) {
+        grammar_triggers.push_back({
+            {"word", trigger.word},
+            {"at_start", trigger.at_start},
+        });
+    }
+    llama_params["grammar_triggers"] = grammar_triggers;
+    llama_params["preserved_tokens"] = chat_params.preserved_tokens;
+    for (const auto & stop : chat_params.additional_stops) {
+        llama_params["stop"].push_back(stop);
     }
 
     // Handle "n" field
@@ -737,28 +695,50 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
     return res;
 }
 
-static json format_response_rerank(const json & request, const json & ranks) {
-    json data = json::array();
-    int32_t n_tokens = 0;
-    int i = 0;
-    for (const auto & rank : ranks) {
-        data.push_back(json{
-            {"index", i++},
-            {"relevance_score", json_value(rank, "score", 0.0)},
-        });
+static json format_response_rerank(
+        const json & request,
+        const json & ranks,
+        bool is_tei_format,
+        std::vector<std::string> & texts) {
+    json res;
+    if (is_tei_format) {
+        // TEI response format
+        res = json::array();
+        bool return_text = json_value(request, "return_text", false);
+        for (const auto & rank : ranks) {
+            int index = json_value(rank, "index", 0);
+            json elem = json{
+                {"index", index},
+                {"score", json_value(rank, "score", 0.0)},
+            };
+            if (return_text) {
+                elem["text"] = std::move(texts[index]);
+            }
+            res.push_back(elem);
+        }
+    } else {
+        // Jina response format
+        json results = json::array();
+        int32_t n_tokens = 0;
+        for (const auto & rank : ranks) {
+            results.push_back(json{
+                {"index", json_value(rank, "index", 0)},
+                {"relevance_score", json_value(rank, "score", 0.0)},
+            });
 
-        n_tokens += json_value(rank, "tokens_evaluated", 0);
-    }
+            n_tokens += json_value(rank, "tokens_evaluated", 0);
+        }
 
-    json res = json {
-        {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
-        {"object", "list"},
-        {"usage", json {
-            {"prompt_tokens", n_tokens},
-            {"total_tokens", n_tokens}
-        }},
-        {"results", data}
-    };
+        res = json{
+            {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+            {"object", "list"},
+            {"usage", json{
+                {"prompt_tokens", n_tokens},
+                {"total_tokens", n_tokens}
+            }},
+            {"results", results}
+        };
+    }
 
     return res;
 }
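With `is_tei_format` selecting between the two branches above, the same `/rerank` endpoint now answers both wire formats. A sketch of the request and response shapes it distinguishes, built with `nlohmann::json`; the field names follow the code above, while the query and document strings are placeholders:

```cpp
#include <nlohmann/json.hpp>
#include <iostream>

int main() {
  using json = nlohmann::json;

  // Jina-style: "documents" in; the reply is an object with
  // {"results": [{"index", "relevance_score"}, ...]} plus "model" and "usage".
  json jina_req = {
    {"query", "Machine learning is"},
    {"documents", {"first passage", "second passage"}},
  };

  // TEI-style: "texts" in (optionally "return_text"); the reply is a bare
  // array of {"index", "score"[, "text"]} objects.
  json tei_req = {
    {"query", "Machine learning is"},
    {"texts", {"first passage", "second passage"}},
    {"return_text", true},
  };

  std::cout << jina_req.dump(2) << "\n" << tei_req.dump(2) << std::endl;
  return 0;
}
```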
diff --git a/examples/server/webui/src/components/ChatMessage.tsx b/examples/server/webui/src/components/ChatMessage.tsx
index 68be7c751db54..40ea74711f349 100644
--- a/examples/server/webui/src/components/ChatMessage.tsx
+++ b/examples/server/webui/src/components/ChatMessage.tsx
@@ -159,6 +159,35 @@ export default function ChatMessage({
         )}
+
+        {msg.extra && msg.extra.length > 0 && (
+
+            Extra content
+
+            {msg.extra.map(
+              (extra, i) =>
+                extra.type === 'textFile' ? (
+                    {extra.name}
+                    {extra.content}
+                ) : extra.type === 'context' ? (
+                    {extra.content}
+                ) : null // TODO: support other extra types
+            )}
+
+        )}
diff --git a/examples/server/webui/src/components/ChatScreen.tsx b/examples/server/webui/src/components/ChatScreen.tsx
--- a/examples/server/webui/src/components/ChatScreen.tsx
+++ b/examples/server/webui/src/components/ChatScreen.tsx
+  const inputRef = useRef<HTMLTextAreaElement>(null);
+
+  const { extraContext, clearExtraContext } = useVSCodeContext(
+    inputRef,
+    setInputMsg
+  );
+  // TODO: improve this when we have "upload file" feature
+  const currExtra: Message['extra'] = extraContext ? [extraContext] : undefined;
 
   // keep track of leaf node for rendering
   const [currNodeId, setCurrNodeId] = useState(-1);
@@ -115,10 +124,20 @@ export default function ChatScreen() {
     setCurrNodeId(-1);
     // get the last message node
     const lastMsgNodeId = messages.at(-1)?.msg.id ?? null;
-    if (!(await sendMessage(currConvId, lastMsgNodeId, inputMsg, onChunk))) {
+    if (
+      !(await sendMessage(
+        currConvId,
+        lastMsgNodeId,
+        inputMsg,
+        currExtra,
+        onChunk
+      ))
+    ) {
       // restore the input message if failed
       setInputMsg(lastInpMsg);
     }
+    // OK
+    clearExtraContext();
   };
 
   const handleEditMessage = async (msg: Message, content: string) => {
@@ -129,6 +148,7 @@ export default function ChatScreen() {
       viewingChat.conv.id,
       msg.parent,
       content,
+      msg.extra,
       onChunk
     );
     setCurrNodeId(-1);
@@ -143,6 +163,7 @@ export default function ChatScreen() {
       viewingChat.conv.id,
       msg.parent,
       null,
+      msg.extra,
       onChunk
     );
     setCurrNodeId(-1);
@@ -203,9 +224,11 @@