diff --git a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml deleted file mode 100644 index b85bf5741e5a3..0000000000000 --- a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml +++ /dev/null @@ -1,87 +0,0 @@ -name: Bug (compilation) -description: Something goes wrong when trying to compile llama.cpp. -title: "Compile bug: " -labels: ["bug-unconfirmed", "compilation"] -body: - - type: markdown - attributes: - value: > - Thanks for taking the time to fill out this bug report! - This issue template is intended for bug reports where the compilation of llama.cpp fails. - Before opening an issue, please confirm that the compilation still fails with `-DGGML_CCACHE=OFF`. - If the compilation succeeds with ccache disabled you should be able to permanently fix the issue - by clearing `~/.cache/ccache` (on Linux). - - type: textarea - id: commit - attributes: - label: Git commit - description: Which commit are you trying to compile? - placeholder: | - $git rev-parse HEAD - 84a07a17b1b08cf2b9747c633a2372782848a27f - validations: - required: true - - type: dropdown - id: operating-system - attributes: - label: Operating systems - description: Which operating systems do you know to be affected? - multiple: true - options: - - Linux - - Mac - - Windows - - BSD - - Other? (Please let us know in description) - validations: - required: true - - type: dropdown - id: backends - attributes: - label: GGML backends - description: Which GGML backends do you know to be affected? - options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan] - multiple: true - validations: - required: true - - type: textarea - id: info - attributes: - label: Problem description & steps to reproduce - description: > - Please give us a summary of the problem and tell us how to reproduce it. - If you can narrow down the bug to specific compile flags, that information would be very much appreciated by us. - placeholder: > - I'm trying to compile llama.cpp with CUDA support on a fresh install of Ubuntu and get error XY. - Here are the exact commands that I used: ... - validations: - required: true - - type: textarea - id: first_bad_commit - attributes: - label: First Bad Commit - description: > - If the bug was not present on an earlier version: when did it start appearing? - If possible, please do a git bisect and identify the exact commit that introduced the bug. - validations: - required: false - - type: textarea - id: command - attributes: - label: Compile command - description: > - Please provide the exact command you used to compile llama.cpp. For example: `cmake -B ...`. - This will be automatically formatted into code, so no need for backticks. - render: shell - validations: - required: true - - type: textarea - id: logs - attributes: - label: Relevant log output - description: > - Please copy and paste any relevant log output, including any generated text. - This will be automatically formatted into code, so no need for backticks. - render: shell - validations: - required: true diff --git a/.github/ISSUE_TEMPLATE/011-bug-results.yml b/.github/ISSUE_TEMPLATE/011-bug-results.yml deleted file mode 100644 index 1ccef0793d45e..0000000000000 --- a/.github/ISSUE_TEMPLATE/011-bug-results.yml +++ /dev/null @@ -1,101 +0,0 @@ -name: Bug (model use) -description: Something goes wrong when using a model (in general, not specific to a single llama.cpp module). 
-title: "Eval bug: " -labels: ["bug-unconfirmed", "model evaluation"] -body: - - type: markdown - attributes: - value: > - Thanks for taking the time to fill out this bug report! - This issue template is intended for bug reports where the model evaluation results - (i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation. - If you encountered the issue while using an external UI (e.g. ollama), - please reproduce your issue using one of the examples/binaries in this repository. - The `llama-cli` binary can be used for simple and reproducible model inference. - - type: textarea - id: version - attributes: - label: Name and Version - description: Which version of our software are you running? (use `--version` to get a version string) - placeholder: | - $./llama-cli --version - version: 2999 (42b4109e) - built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu - validations: - required: true - - type: dropdown - id: operating-system - attributes: - label: Operating systems - description: Which operating systems do you know to be affected? - multiple: true - options: - - Linux - - Mac - - Windows - - BSD - - Other? (Please let us know in description) - validations: - required: true - - type: dropdown - id: backends - attributes: - label: GGML backends - description: Which GGML backends do you know to be affected? - options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan] - multiple: true - validations: - required: true - - type: textarea - id: hardware - attributes: - label: Hardware - description: Which CPUs/GPUs are you using? - placeholder: > - e.g. Ryzen 5950X + 2x RTX 4090 - validations: - required: true - - type: textarea - id: model - attributes: - label: Models - description: > - Which model(s) at which quantization were you using when encountering the bug? - If you downloaded a GGUF file off of Huggingface, please provide a link. - placeholder: > - e.g. Meta LLaMA 3.1 Instruct 8b q4_K_M - validations: - required: false - - type: textarea - id: info - attributes: - label: Problem description & steps to reproduce - description: > - Please give us a summary of the problem and tell us how to reproduce it. - If you can narrow down the bug to specific hardware, compile flags, or command line arguments, - that information would be very much appreciated by us. - placeholder: > - e.g. when I run llama-cli with -ngl 99 I get garbled outputs. - When I use -ngl 0 it works correctly. - Here are the exact commands that I used: ... - validations: - required: true - - type: textarea - id: first_bad_commit - attributes: - label: First Bad Commit - description: > - If the bug was not present on an earlier version: when did it start appearing? - If possible, please do a git bisect and identify the exact commit that introduced the bug. - validations: - required: false - - type: textarea - id: logs - attributes: - label: Relevant log output - description: > - Please copy and paste any relevant log output, including the command that you entered and any generated text. - This will be automatically formatted into code, so no need for backticks. - render: shell - validations: - required: true diff --git a/.github/ISSUE_TEMPLATE/019-bug-misc.yml b/.github/ISSUE_TEMPLATE/019-bug-misc.yml deleted file mode 100644 index 1904e31fdc436..0000000000000 --- a/.github/ISSUE_TEMPLATE/019-bug-misc.yml +++ /dev/null @@ -1,91 +0,0 @@ -name: Bug (misc.) -description: Something is not working the way it should (and it's not covered by any of the above cases). 
-title: "Misc. bug: " -labels: ["bug-unconfirmed"] -body: - - type: markdown - attributes: - value: > - Thanks for taking the time to fill out this bug report! - This issue template is intended for miscellaneous bugs that don't fit into any other category. - If you encountered the issue while using an external UI (e.g. ollama), - please reproduce your issue using one of the examples/binaries in this repository. - - type: textarea - id: version - attributes: - label: Name and Version - description: Which version of our software is affected? (You can use `--version` to get a version string.) - placeholder: | - $./llama-cli --version - version: 2999 (42b4109e) - built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu - validations: - required: true - - type: dropdown - id: operating-system - attributes: - label: Operating systems - description: Which operating systems do you know to be affected? - multiple: true - options: - - Linux - - Mac - - Windows - - BSD - - Other? (Please let us know in description) - validations: - required: false - - type: dropdown - id: module - attributes: - label: Which llama.cpp modules do you know to be affected? - multiple: true - options: - - Documentation/Github - - libllama (core library) - - llama-cli - - llama-server - - llama-bench - - llama-quantize - - Python/Bash scripts - - Test code - - Other (Please specify in the next section) - validations: - required: false - - type: textarea - id: command - attributes: - label: Command line - description: > - Please provide the exact commands you entered, if applicable. For example: `llama-server -m ... -c ...`, `llama-cli -m ...`, etc. - This will be automatically formatted into code, so no need for backticks. - render: shell - validations: - required: false - - type: textarea - id: info - attributes: - label: Problem description & steps to reproduce - description: > - Please give us a summary of the problem and tell us how to reproduce it (if applicable). - validations: - required: true - - type: textarea - id: first_bad_commit - attributes: - label: First Bad Commit - description: > - If the bug was not present on an earlier version and it's not trivial to track down: when did it start appearing? - If possible, please do a git bisect and identify the exact commit that introduced the bug. - validations: - required: false - - type: textarea - id: logs - attributes: - label: Relevant log output - description: > - If applicable, please copy and paste any relevant log output, including any generated text. - This will be automatically formatted into code, so no need for backticks. - render: shell - validations: - required: false diff --git a/.github/ISSUE_TEMPLATE/020-enhancement.yml b/.github/ISSUE_TEMPLATE/020-enhancement.yml deleted file mode 100644 index cee1446f5a097..0000000000000 --- a/.github/ISSUE_TEMPLATE/020-enhancement.yml +++ /dev/null @@ -1,51 +0,0 @@ -name: Enhancement -description: Used to request enhancements for llama.cpp. -title: "Feature Request: " -labels: ["enhancement"] -body: - - type: markdown - attributes: - value: | - [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggml-org/llama.cpp/discussions/categories/ideas) - - - type: checkboxes - id: prerequisites - attributes: - label: Prerequisites - description: Please confirm the following before submitting your enhancement request. 
- options: - - label: I am running the latest code. Mention the version if possible as well. - required: true - - label: I carefully followed the [README.md](https://github.com/ggml-org/llama.cpp/blob/master/README.md). - required: true - - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed). - required: true - - label: I reviewed the [Discussions](https://github.com/ggml-org/llama.cpp/discussions), and have a new and useful enhancement to share. - required: true - - - type: textarea - id: feature-description - attributes: - label: Feature Description - description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement. - placeholder: Detailed description of the enhancement - validations: - required: true - - - type: textarea - id: motivation - attributes: - label: Motivation - description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users. - placeholder: Explanation of why this feature is needed and its benefits - validations: - required: true - - - type: textarea - id: possible-implementation - attributes: - label: Possible Implementation - description: If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better. - placeholder: Detailed description of potential implementation - validations: - required: false diff --git a/.github/ISSUE_TEMPLATE/030-research.yml b/.github/ISSUE_TEMPLATE/030-research.yml deleted file mode 100644 index e774550d5908c..0000000000000 --- a/.github/ISSUE_TEMPLATE/030-research.yml +++ /dev/null @@ -1,52 +0,0 @@ -name: Research -description: Track new technical research area. -title: "Research: " -labels: ["research 🔬"] -body: - - type: markdown - attributes: - value: | - Don't forget to check for any [duplicate research issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22) - - - type: checkboxes - id: research-stage - attributes: - label: Research Stage - description: Track general state of this research ticket - options: - - label: Background Research (Let's try to avoid reinventing the wheel) - - label: Hypothesis Formed (How do you think this will work and it's effect?) - - label: Strategy / Implementation Forming - - label: Analysis of results - - label: Debrief / Documentation (So people in the future can learn from us) - - - type: textarea - id: background - attributes: - label: Previous existing literature and research - description: Whats the current state of the art and whats the motivation for this research? - - - type: textarea - id: hypothesis - attributes: - label: Hypothesis - description: How do you think this will work and it's effect? - - - type: textarea - id: implementation - attributes: - label: Implementation - description: Got an approach? e.g. a PR ready to go? - - - type: textarea - id: analysis - attributes: - label: Analysis - description: How does the proposed implementation behave? - - - type: textarea - id: logs - attributes: - label: Relevant log output - description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. 
- render: shell diff --git a/.github/ISSUE_TEMPLATE/040-refactor.yml b/.github/ISSUE_TEMPLATE/040-refactor.yml deleted file mode 100644 index 2fe94e26c6988..0000000000000 --- a/.github/ISSUE_TEMPLATE/040-refactor.yml +++ /dev/null @@ -1,28 +0,0 @@ -name: Refactor (Maintainers) -description: Used to track refactoring opportunities. -title: "Refactor: " -labels: ["refactor"] -body: - - type: markdown - attributes: - value: | - Don't forget to [check for existing refactor issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered. - Also you may want to check [Pull request refactor label as well](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too. - - - type: textarea - id: background-description - attributes: - label: Background Description - description: Please provide a detailed written description of the pain points you are trying to solve. - placeholder: Detailed description behind your motivation to request refactor - validations: - required: true - - - type: textarea - id: possible-approaches - attributes: - label: Possible Refactor Approaches - description: If you have some idea of possible approaches to solve this problem. You may want to make it a todo list. - placeholder: Your idea of possible refactoring opportunity/approaches - validations: - required: false diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml deleted file mode 100644 index 0d246533c9515..0000000000000 --- a/.github/ISSUE_TEMPLATE/config.yml +++ /dev/null @@ -1,11 +0,0 @@ -blank_issues_enabled: true -contact_links: - - name: Got an idea? - url: https://github.com/ggml-org/llama.cpp/discussions/categories/ideas - about: Pop it there. It may then become an enhancement ticket. - - name: Got a question? - url: https://github.com/ggml-org/llama.cpp/discussions/categories/q-a - about: Ask a question there! - - name: Want to contribute? 
- url: https://github.com/ggml-org/llama.cpp/wiki/contribute - about: Head to the contribution guide page of the wiki for areas you can help with diff --git a/.github/actions/get-tag-name/action.yml b/.github/actions/get-tag-name/action.yml deleted file mode 100644 index 7ace23b2a3e76..0000000000000 --- a/.github/actions/get-tag-name/action.yml +++ /dev/null @@ -1,22 +0,0 @@ -name: "Determine tag name" -description: "Determine the tag name to use for a release" -outputs: - name: - description: "The name of the tag" - value: ${{ steps.tag.outputs.name }} - -runs: - using: "composite" - steps: - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi diff --git a/.github/actions/windows-setup-cuda/action.yml b/.github/actions/windows-setup-cuda/action.yml deleted file mode 100644 index 5575caeca31a2..0000000000000 --- a/.github/actions/windows-setup-cuda/action.yml +++ /dev/null @@ -1,67 +0,0 @@ -name: "Windows - Setup CUDA Toolkit" -description: "Setup CUDA Toolkit for Windows" -inputs: - cuda_version: - description: "CUDA toolkit version" - required: true - -runs: - using: "composite" - steps: - - name: Install Cuda Toolkit 11.7 - if: ${{ inputs.cuda_version == '11.7' }} - shell: pwsh - run: | - mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" - choco install unzip -y - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip" - unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" 
"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 - echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 - - - name: Install Cuda Toolkit 12.4 - if: ${{ inputs.cuda_version == '12.4' }} - shell: pwsh - run: | - mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" - choco install unzip -y - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip" - unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU 
Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 - echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 diff --git a/.github/actions/windows-setup-curl/action.yml b/.github/actions/windows-setup-curl/action.yml deleted file mode 100644 index 446f799fac34a..0000000000000 --- a/.github/actions/windows-setup-curl/action.yml +++ /dev/null @@ -1,30 +0,0 @@ -name: 'Windows - Setup CURL' -description: 'Composite action, to be reused in other workflow' -inputs: - curl_version: - description: 'CURL version' - required: false - default: '8.6.0_6' - architecture: - description: 'Architecture of the libcurl to download' - required: false - default: 'win64' -outputs: - curl_path: - description: "Path to the downloaded libcurl" - value: ${{ steps.get_libcurl.outputs.curl_path }} - -runs: - using: "composite" - steps: - - name: libCURL - id: get_libcurl - shell: powershell - env: - CURL_VERSION: ${{ inputs.curl_version }} - ARCHITECTURE: ${{ inputs.architecture }} - run: | - curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-${env:ARCHITECTURE}-mingw.zip" - mkdir $env:RUNNER_TEMP/libcurl - tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl - echo "curl_path=$env:RUNNER_TEMP/libcurl" >> $env:GITHUB_OUTPUT diff --git a/.github/labeler.yml b/.github/labeler.yml deleted file mode 100644 index 3c2f67707b024..0000000000000 --- a/.github/labeler.yml +++ /dev/null @@ -1,95 +0,0 @@ -# https://github.com/actions/labeler -Kompute: - - changed-files: - - any-glob-to-any-file: - - ggml/include/ggml-kompute.h - - ggml/src/ggml-kompute/** - - README-kompute.md -Apple Metal: - - changed-files: - - any-glob-to-any-file: - - ggml/include/ggml-metal.h - - ggml/src/ggml-metal/** - - README-metal.md -SYCL: - - changed-files: - - any-glob-to-any-file: - - ggml/include/ggml-sycl.h - - ggml/src/ggml-sycl/** - - docs/backend/SYCL.md - - examples/sycl/** -Nvidia GPU: - - changed-files: - - any-glob-to-any-file: - - ggml/include/ggml-cuda.h 
- - ggml/src/ggml-cuda/** -Vulkan: - - changed-files: - - any-glob-to-any-file: - - ggml/include/ggml-vulkan.h - - ggml/src/ggml-vulkan/** -documentation: - - changed-files: - - any-glob-to-any-file: - - docs/** - - media/** -testing: - - changed-files: - - any-glob-to-any-file: - - tests/** -build: - - changed-files: - - any-glob-to-any-file: - - cmake/** - - CMakeLists.txt - - CMakePresets.json -examples: - - changed-files: - - any-glob-to-any-file: - - examples/** - - tools/** -devops: - - changed-files: - - any-glob-to-any-file: - - .devops/** - - .github/** - - ci/** -python: - - changed-files: - - any-glob-to-any-file: - - "**/*.py" - - requirements/** - - gguf-py/** - - .flake8 -script: - - changed-files: - - any-glob-to-any-file: - - scripts/** -android: - - changed-files: - - any-glob-to-any-file: - - examples/llama.android/** -server: - - changed-files: - - any-glob-to-any-file: - - tools/server/** -ggml: - - changed-files: - - any-glob-to-any-file: - - ggml/** -nix: - - changed-files: - - any-glob-to-any-file: - - "**/*.nix" - - .github/workflows/nix-*.yml - - .devops/nix/nixpkgs-instances.nix -embedding: - - changed-files: - - any-glob-to-any-file: examples/embedding/ - -Ascend NPU: - - changed-files: - - any-glob-to-any-file: - - ggml/include/ggml-cann.h - - ggml/src/ggml-cann/** - - docs/backend/CANN.md diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md deleted file mode 100644 index d0bdd73c4439c..0000000000000 --- a/.github/pull_request_template.md +++ /dev/null @@ -1 +0,0 @@ -*Make sure to read the [contributing guidelines](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR* diff --git a/.github/workflows/bench.yml.disabled b/.github/workflows/bench.yml.disabled deleted file mode 100644 index f2d7e16e981ac..0000000000000 --- a/.github/workflows/bench.yml.disabled +++ /dev/null @@ -1,304 +0,0 @@ -# TODO: there have been some issues with the workflow, so disabling for now -# https://github.com/ggml-org/llama.cpp/issues/7893 -# -# Benchmark -name: Benchmark - -on: - workflow_dispatch: - inputs: - gpu-series: - description: 'Azure GPU series to run with' - required: true - type: choice - options: - - Standard_NC4as_T4_v3 - - Standard_NC24ads_A100_v4 - - Standard_NC80adis_H100_v5 - sha: - description: 'Commit SHA1 to build' - required: false - type: string - duration: - description: 'Duration of the bench' - type: string - default: 10m - - push: - branches: - - master - paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'tools/server/*.h*', 'tools/server/*.cpp'] - pull_request_target: - types: [opened, synchronize, reopened] - paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'tools/server/*.h*', 'tools/server/*.cpp'] - schedule: - - cron: '04 2 * * *' - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }} - cancel-in-progress: true - -jobs: - bench-server-baseline: - runs-on: Standard_NC4as_T4_v3 - env: - RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it - N_USERS: 8 - DURATION: 10m - - strategy: - matrix: - model: [phi-2] - ftype: [q4_0, q8_0, f16] - include: - - model: phi-2 - ftype: q4_0 - pr_comment_enabled: "true" - - if: | - inputs.gpu-series == 'Standard_NC4as_T4_v3' - || github.event_name == 'pull_request_target' - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - ref: ${{ 
github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - - - name: Install python env - id: pipenv - run: | - cd tools/server/bench - python3 -m venv venv - source venv/bin/activate - pip install -r requirements.txt - - - name: Prometheus - id: install_prometheus - run: | - wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz - tar xzf prometheus*.tar.gz --strip-components=1 - ./prometheus --config.file=tools/server/bench/prometheus.yml & - while ! nc -z localhost 9090; do - sleep 0.1 - done - - - name: Set up Go - uses: actions/setup-go@v5 - with: - go-version: '1.21' - - - name: Install k6 and xk6-sse - id: k6_installation - run: | - cd tools/server/bench - go install go.k6.io/xk6/cmd/xk6@latest - xk6 build master \ - --with github.com/phymbert/xk6-sse - - - name: Build - id: cmake_build - run: | - set -eux - cmake -B build \ - -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DLLAMA_CUBLAS=ON \ - -DCUDAToolkit_ROOT=/usr/local/cuda \ - -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \ - -DCMAKE_CUDA_ARCHITECTURES=75 \ - -DLLAMA_FATAL_WARNINGS=OFF \ - -DLLAMA_ALL_WARNINGS=OFF \ - -DCMAKE_BUILD_TYPE=Release; - cmake --build build --config Release -j $(nproc) --target llama-server - - - name: Download the dataset - id: download_dataset - run: | - cd tools/server/bench - wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json - - - name: Server bench - id: server_bench - env: - HEAD_REF: ${{ github.head_ref || github.ref_name }} - run: | - set -eux - - cd tools/server/bench - source venv/bin/activate - python bench.py \ - --runner-label ${{ env.RUNNER_LABEL }} \ - --name ${{ github.job }} \ - --branch $HEAD_REF \ - --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \ - --scenario script.js \ - --duration ${{ github.event.inputs.duration || env.DURATION }} \ - --hf-repo ggml-org/models \ - --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \ - --model-path-prefix /models \ - --parallel ${{ env.N_USERS }} \ - -ngl 33 \ - --batch-size 2048 \ - --ubatch-size 256 \ - --ctx-size 16384 \ - --n-prompts 1000 \ - --max-prompt-tokens 1024 \ - --max-tokens 2048 - - cat results.github.env >> $GITHUB_ENV - - # Remove dataset as we do not want it in the artefact - rm ShareGPT_V3_unfiltered_cleaned_split.json - - - uses: actions/upload-artifact@v4 - with: - name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }} - compression-level: 9 - path: | - tools/server/bench/*.jpg - tools/server/bench/*.json - tools/server/bench/*.log - - - name: Commit status - uses: Sibz/github-status-action@v1 - with: - authToken: ${{secrets.GITHUB_TOKEN}} - sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }} - context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }} - description: | - ${{ env.BENCH_RESULTS }} - state: 'success' - - - name: Upload benchmark images - uses: devicons/public-upload-to-imgur@v2.2.2 - continue-on-error: true # Important as it looks unstable: 503 - id: imgur_step - with: - client_id: ${{secrets.IMGUR_CLIENT_ID}} - path: | - tools/server/bench/prompt_tokens_seconds.jpg - tools/server/bench/predicted_tokens_seconds.jpg - tools/server/bench/kv_cache_usage_ratio.jpg - tools/server/bench/requests_processing.jpg - - - name: Extract 
mermaid - id: set_mermaid - run: | - set -eux - - cd tools/server/bench - PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid) - echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV - echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV - echo "EOF" >> $GITHUB_ENV - - PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid) - echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV - echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV - echo "EOF" >> $GITHUB_ENV - - KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid) - echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV - echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV - echo "EOF" >> $GITHUB_ENV - - REQUESTS_PROCESSING=$(cat requests_processing.mermaid) - echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV - echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV - echo "EOF" >> $GITHUB_ENV - - - name: Extract image url - id: extract_image_url - continue-on-error: true - run: | - set -eux - - echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV - echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV - echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV - echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV - - - name: Comment PR - uses: mshick/add-pr-comment@v2 - id: comment_pr - if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }} - with: - message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }} - message: | - <p align="center">
- - 📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀 - - </p> - - <details> - - <summary>Expand details for performance related PR only</summary> - - - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }} - - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }} - - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s - - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s - - ${{ env.BENCH_GRAPH_XLABEL }} - - - <p align="center"> - - <img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" /> - - <details> - - <summary>More</summary> - - ```mermaid - ${{ env.PROMPT_TOKENS_SECONDS }} - ``` - - </details> - - <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds" /> - - <details> - <summary>More</summary> - - ```mermaid - ${{ env.PREDICTED_TOKENS_SECONDS }} - ``` - - </details> - - </p> - - <details> - - <summary>Details</summary> - - <p align="center"> - - <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" /> - - <details> - <summary>More</summary> - - ```mermaid - ${{ env.KV_CACHE_USAGE_RATIO }} - ``` - - </details> - - <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing" /> - - <details> - <summary>More</summary> - - ```mermaid - ${{ env.REQUESTS_PROCESSING }} - ``` - - </details> - - </p> - </details> - </details>
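Aside on the `set_mermaid` step above: `$GITHUB_ENV` is parsed line by line, so a multiline value (such as a mermaid chart) has to be written using the `NAME<<DELIMITER` heredoc-style syntax that the step uses, which is why each chart takes three `echo`/`cat` lines. A minimal standalone sketch of the pattern, assuming a hypothetical variable `MY_CHART` and file `chart.mermaid` (neither name is from the workflow):

```bash
#!/usr/bin/env bash
# Export a multiline file as an environment variable for later workflow steps.
# GITHUB_ENV is read line by line; "NAME<<EOF ... EOF" delimits a multiline value.
set -eu

{
  echo "MY_CHART<<EOF"   # open the multiline block
  cat chart.mermaid      # the multiline payload
  echo "EOF"             # close the block
} >> "$GITHUB_ENV"
```

Grouping the three writes with `{ ... } >> "$GITHUB_ENV"` is equivalent to the separate `>>` redirections in the step above; the delimiter just needs to be a line that cannot appear inside the payload itself.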
diff --git a/.github/workflows/build-linux-cross.yml b/.github/workflows/build-linux-cross.yml deleted file mode 100644 index 7cfc82ba4e277..0000000000000 --- a/.github/workflows/build-linux-cross.yml +++ /dev/null @@ -1,346 +0,0 @@ -name: Build on Linux using cross-compiler -on: - workflow_dispatch: - workflow_call: - -jobs: - ubuntu-24-riscv64-cpu-cross: - runs-on: ubuntu-24.04 - - steps: - - uses: actions/checkout@v4 - - name: Setup Riscv - run: | - sudo dpkg --add-architecture riscv64 - - # Add arch-specific repositories for non-amd64 architectures - cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list - deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe - deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe - deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe - deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe - EOF - - sudo apt-get update || true ;# Prevent failure due to missing URLs. - - sudo apt-get install -y --no-install-recommends \ - build-essential \ - gcc-14-riscv64-linux-gnu \ - g++-14-riscv64-linux-gnu - - - name: Build - run: | - cmake -B build -DLLAMA_CURL=OFF \ - -DCMAKE_BUILD_TYPE=Release \ - -DGGML_OPENMP=OFF \ - -DLLAMA_BUILD_EXAMPLES=ON \ - -DLLAMA_BUILD_TOOLS=ON \ - -DLLAMA_BUILD_TESTS=OFF \ - -DCMAKE_SYSTEM_NAME=Linux \ - -DCMAKE_SYSTEM_PROCESSOR=riscv64 \ - -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \ - -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \ - -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ - -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \ - -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ - -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ - -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH - - cmake --build build --config Release -j $(nproc) - - ubuntu-24-riscv64-vulkan-cross: - runs-on: ubuntu-24.04 - - steps: - - uses: actions/checkout@v4 - - name: Setup Riscv - run: | - sudo dpkg --add-architecture riscv64 - - # Add arch-specific repositories for non-amd64 architectures - cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list - deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe - deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe - deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe - deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe - EOF - - sudo apt-get update || true ;# Prevent failure due to missing URLs. 
- - sudo apt-get install -y --no-install-recommends \ - build-essential \ - glslc \ - gcc-14-riscv64-linux-gnu \ - g++-14-riscv64-linux-gnu \ - libvulkan-dev:riscv64 - - - name: Build - run: | - cmake -B build -DLLAMA_CURL=OFF \ - -DCMAKE_BUILD_TYPE=Release \ - -DGGML_VULKAN=ON \ - -DGGML_OPENMP=OFF \ - -DLLAMA_BUILD_EXAMPLES=ON \ - -DLLAMA_BUILD_TOOLS=ON \ - -DLLAMA_BUILD_TESTS=OFF \ - -DCMAKE_SYSTEM_NAME=Linux \ - -DCMAKE_SYSTEM_PROCESSOR=riscv64 \ - -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \ - -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \ - -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ - -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \ - -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ - -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ - -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH - - cmake --build build --config Release -j $(nproc) - - ubuntu-24-arm64-vulkan-cross: - runs-on: ubuntu-24.04 - - steps: - - uses: actions/checkout@v4 - - name: Setup Arm64 - run: | - sudo dpkg --add-architecture arm64 - - # Add arch-specific repositories for non-amd64 architectures - cat << EOF | sudo tee /etc/apt/sources.list.d/arm64-ports.list - deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe - deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe - deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe - deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe - EOF - - sudo apt-get update || true ;# Prevent failure due to missing URLs. - - sudo apt-get install -y --no-install-recommends \ - build-essential \ - glslc \ - crossbuild-essential-arm64 \ - libvulkan-dev:arm64 - - - name: Build - run: | - cmake -B build -DLLAMA_CURL=OFF \ - -DCMAKE_BUILD_TYPE=Release \ - -DGGML_VULKAN=ON \ - -DGGML_OPENMP=OFF \ - -DLLAMA_BUILD_EXAMPLES=ON \ - -DLLAMA_BUILD_TOOLS=ON \ - -DLLAMA_BUILD_TESTS=OFF \ - -DCMAKE_SYSTEM_NAME=Linux \ - -DCMAKE_SYSTEM_PROCESSOR=aarch64 \ - -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc \ - -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ \ - -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ - -DCMAKE_FIND_ROOT_PATH=/usr/lib/aarch64-linux-gnu \ - -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ - -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ - -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH - - cmake --build build --config Release -j $(nproc) - - ubuntu-24-ppc64el-cpu-cross: - runs-on: ubuntu-24.04 - - steps: - - uses: actions/checkout@v4 - - name: Setup PowerPC64le - run: | - sudo dpkg --add-architecture ppc64el - - # Add arch-specific repositories for non-amd64 architectures - cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list - deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe - deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe - deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe - deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe - EOF - - sudo apt-get update || true ;# Prevent failure due to missing URLs. 
- - sudo apt-get install -y --no-install-recommends \ - build-essential \ - gcc-14-powerpc64le-linux-gnu \ - g++-14-powerpc64le-linux-gnu - - - name: Build - run: | - cmake -B build -DLLAMA_CURL=OFF \ - -DCMAKE_BUILD_TYPE=Release \ - -DGGML_OPENMP=OFF \ - -DLLAMA_BUILD_EXAMPLES=ON \ - -DLLAMA_BUILD_TOOLS=ON \ - -DLLAMA_BUILD_TESTS=OFF \ - -DCMAKE_SYSTEM_NAME=Linux \ - -DCMAKE_SYSTEM_PROCESSOR=ppc64 \ - -DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \ - -DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \ - -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ - -DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \ - -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ - -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ - -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH - - cmake --build build --config Release -j $(nproc) - - ubuntu-24-ppc64el-vulkan-cross: - runs-on: ubuntu-24.04 - - steps: - - uses: actions/checkout@v4 - - name: Setup PowerPC64le - run: | - sudo dpkg --add-architecture ppc64el - - # Add arch-specific repositories for non-amd64 architectures - cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list - deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe - deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe - deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe - deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe - EOF - - sudo apt-get update || true ;# Prevent failure due to missing URLs. - - sudo apt-get install -y --no-install-recommends \ - build-essential \ - glslc \ - gcc-14-powerpc64le-linux-gnu \ - g++-14-powerpc64le-linux-gnu \ - libvulkan-dev:ppc64el - - - name: Build - run: | - cmake -B build -DLLAMA_CURL=OFF \ - -DCMAKE_BUILD_TYPE=Release \ - -DGGML_VULKAN=ON \ - -DGGML_OPENMP=OFF \ - -DLLAMA_BUILD_EXAMPLES=ON \ - -DLLAMA_BUILD_TOOLS=ON \ - -DLLAMA_BUILD_TESTS=OFF \ - -DCMAKE_SYSTEM_NAME=Linux \ - -DCMAKE_SYSTEM_PROCESSOR=ppc64 \ - -DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \ - -DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \ - -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ - -DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \ - -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ - -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ - -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH - - cmake --build build --config Release -j $(nproc) - - debian-13-loongarch64-cpu-cross: - runs-on: ubuntu-24.04 - container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671 - - steps: - - uses: actions/checkout@v4 - - name: Setup LoongArch - run: | - rm -f /etc/apt/sources.list.d/* - cat << EOF | tee /etc/apt/sources.list.d/debian-ports.list - deb http://snapshot.debian.org/archive/debian/20250515T202920Z/ trixie main - EOF - ( echo 'quiet "true";'; \ - echo 'APT::Get::Assume-Yes "true";'; \ - echo 'APT::Install-Recommends "false";'; \ - echo 'Acquire::Check-Valid-Until "false";'; \ - echo 'Acquire::Retries "5";'; \ - ) > /etc/apt/apt.conf.d/99snapshot-repos - - apt-get update - apt-get install -y ca-certificates debian-ports-archive-keyring cmake git zip - dpkg --add-architecture loong64 - - # Add arch-specific repositories for non-amd64 architectures - cat << EOF | tee /etc/apt/sources.list.d/loong64-ports.list - deb [arch=loong64] http://snapshot.debian.org/archive/debian-ports/20250515T194251Z/ sid main - EOF - - apt-get update || true ;# Prevent failure due to missing URLs. 
- - apt-get install -y --no-install-recommends \ - build-essential \ - gcc-14-loongarch64-linux-gnu \ - g++-14-loongarch64-linux-gnu - - - name: Build - run: | - cmake -B build -DLLAMA_CURL=OFF \ - -DCMAKE_BUILD_TYPE=Release \ - -DGGML_OPENMP=OFF \ - -DLLAMA_BUILD_EXAMPLES=ON \ - -DLLAMA_BUILD_TOOLS=ON \ - -DLLAMA_BUILD_TESTS=OFF \ - -DCMAKE_SYSTEM_NAME=Linux \ - -DCMAKE_SYSTEM_PROCESSOR=loongarch64 \ - -DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \ - -DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \ - -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ - -DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \ - -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ - -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ - -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH - - cmake --build build --config Release -j $(nproc) - - debian-13-loongarch64-vulkan-cross: - runs-on: ubuntu-24.04 - container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671 - - steps: - - uses: actions/checkout@v4 - - name: Setup LoongArch - run: | - rm -f /etc/apt/sources.list.d/* - cat << EOF | tee /etc/apt/sources.list.d/debian-ports.list - deb http://snapshot.debian.org/archive/debian/20250515T202920Z/ trixie main - EOF - ( echo 'quiet "true";'; \ - echo 'APT::Get::Assume-Yes "true";'; \ - echo 'APT::Install-Recommends "false";'; \ - echo 'Acquire::Check-Valid-Until "false";'; \ - echo 'Acquire::Retries "5";'; \ - ) > /etc/apt/apt.conf.d/99snapshot-repos - - apt-get update - apt-get install -y ca-certificates debian-ports-archive-keyring cmake git zip - dpkg --add-architecture loong64 - - # Add arch-specific repositories for non-amd64 architectures - cat << EOF | tee /etc/apt/sources.list.d/loong64-ports.list - deb [arch=loong64] http://snapshot.debian.org/archive/debian-ports/20250515T194251Z/ sid main - EOF - - apt-get update || true ;# Prevent failure due to missing URLs. 
- - apt-get install -y --no-install-recommends \ - build-essential \ - glslc \ - gcc-14-loongarch64-linux-gnu \ - g++-14-loongarch64-linux-gnu \ - libvulkan-dev:loong64 - - - name: Build - run: | - cmake -B build -DLLAMA_CURL=OFF \ - -DCMAKE_BUILD_TYPE=Release \ - -DGGML_VULKAN=ON \ - -DGGML_OPENMP=OFF \ - -DLLAMA_BUILD_EXAMPLES=ON \ - -DLLAMA_BUILD_TOOLS=ON \ - -DLLAMA_BUILD_TESTS=OFF \ - -DCMAKE_SYSTEM_NAME=Linux \ - -DCMAKE_SYSTEM_PROCESSOR=loongarch64 \ - -DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \ - -DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \ - -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ - -DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \ - -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ - -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ - -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH - - cmake --build build --config Release -j $(nproc) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml deleted file mode 100644 index c4783a6df8882..0000000000000 --- a/.github/workflows/build.yml +++ /dev/null @@ -1,1080 +0,0 @@ -name: CI - -on: - workflow_dispatch: # allows manual triggering - push: - branches: - - master - paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp'] - pull_request: - types: [opened, synchronize, reopened] - paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp'] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -env: - GGML_NLOOP: 3 - GGML_N_THREADS: 1 - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 - -jobs: - macOS-latest-cmake-arm64: - runs-on: macos-14 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: macOS-latest-cmake-arm64 - evict-old-files: 1d - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - brew install curl - - - name: Build - id: cmake_build - run: | - sysctl -a - cmake -B build \ - -DCMAKE_BUILD_RPATH="@loader_path" \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DGGML_RPC=ON - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L 'main|curl' --verbose --timeout 900 - - macOS-latest-cmake-x64: - runs-on: macos-13 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: macOS-latest-cmake-x64 - evict-old-files: 1d - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - brew install curl - - - name: Build - id: cmake_build - run: | - sysctl -a - # Metal is disabled due to intermittent failures with Github runners not having a GPU: - # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 - cmake -B build \ - -DCMAKE_BUILD_RPATH="@loader_path" \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DGGML_METAL=OFF \ - -DGGML_RPC=ON - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose --timeout 900 - - 
ubuntu-cpu-cmake: - strategy: - matrix: - include: - - build: 'x64' - os: ubuntu-22.04 - - build: 'arm64' - os: ubuntu-22.04-arm - - runs-on: ${{ matrix.os }} - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-cpu-cmake - evict-old-files: 1d - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev - - - name: Build - id: cmake_build - run: | - cmake -B build \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DGGML_RPC=ON - cmake --build build --config Release -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L 'main|curl' --verbose --timeout 900 - - - name: Test llama2c conversion - id: llama2c_test - run: | - cd build - echo "Fetch tokenizer" - wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin - echo "Fetch llama2c model" - wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin - ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf - ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256 - - ubuntu-latest-cmake-sanitizer: - runs-on: ubuntu-latest - - continue-on-error: true - - strategy: - matrix: - sanitizer: [ADDRESS, THREAD, UNDEFINED] - build_type: [Debug] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-latest-cmake-sanitizer-${{ matrix.sanitizer }} - evict-old-files: 1d - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev - - - name: Build - id: cmake_build - if: ${{ matrix.sanitizer != 'THREAD' }} - run: | - cmake -B build \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) - - - name: Build (no OpenMP) - id: cmake_build_no_openmp - if: ${{ matrix.sanitizer == 'THREAD' }} - run: | - cmake -B build \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DGGML_OPENMP=OFF - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose --timeout 900 - - ubuntu-latest-llguidance: - runs-on: ubuntu-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev - - - name: Build - id: cmake_build - run: | - mkdir build - cd build - cmake .. \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DLLAMA_LLGUIDANCE=ON - cmake --build . 
--config Release -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose --timeout 900 - - ubuntu-latest-cmake-rpc: - runs-on: ubuntu-latest - - continue-on-error: true - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-latest-cmake-rpc - evict-old-files: 1d - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev - - - name: Build - id: cmake_build - run: | - cmake -B build \ - -DGGML_RPC=ON - cmake --build build --config Release -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose - - ubuntu-22-cmake-vulkan: - runs-on: ubuntu-22.04 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-22-cmake-vulkan - evict-old-files: 1d - - - name: Dependencies - id: depends - run: | - wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add - - sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list - sudo apt-get update -y - sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev - - - name: Build - id: cmake_build - run: | - cmake -B build \ - -DGGML_VULKAN=ON - cmake --build build --config Release -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - export GGML_VK_VISIBLE_DEVICES=0 - # This is using llvmpipe and runs slower than other backends - ctest -L main --verbose --timeout 3600 - - ubuntu-22-cmake-hip: - runs-on: ubuntu-22.04 - container: rocm/dev-ubuntu-22.04:6.0.2 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libcurl4-openssl-dev - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-22-cmake-hip - evict-old-files: 1d - - - name: Build with native CMake HIP support - id: cmake_build - run: | - cmake -B build -S . \ - -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \ - -DGGML_HIP_ROCWMMA_FATTN=ON \ - -DGGML_HIP=ON - cmake --build build --config Release -j $(nproc) - - - name: Build with legacy HIP support - id: cmake_build_legacy_hip - run: | - cmake -B build2 -S . \ - -DCMAKE_C_COMPILER=hipcc \ - -DCMAKE_CXX_COMPILER=hipcc \ - -DGGML_HIP_ROCWMMA_FATTN=ON \ - -DGGML_HIP=ON - cmake --build build2 --config Release -j $(nproc) - - ubuntu-22-cmake-musa: - runs-on: ubuntu-22.04 - container: mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - run: | - apt-get update - apt-get install -y build-essential git cmake libcurl4-openssl-dev - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-22-cmake-musa - evict-old-files: 1d - - - name: Build with native CMake MUSA support - id: cmake_build - run: | - cmake -B build -S . 
\ - -DGGML_MUSA=ON - cmake --build build --config Release -j $(nproc) - - ubuntu-22-cmake-sycl: - runs-on: ubuntu-22.04 - - continue-on-error: true - - steps: - - uses: actions/checkout@v4 - - - name: add oneAPI to apt - shell: bash - run: | - cd /tmp - wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main" - - - name: install oneAPI dpcpp compiler - shell: bash - run: | - sudo apt update - sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev - - - name: install oneAPI MKL library - shell: bash - run: | - sudo apt install intel-oneapi-mkl-devel - - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-22-cmake-sycl - evict-old-files: 1d - - - name: Build - id: cmake_build - run: | - source /opt/intel/oneapi/setvars.sh - cmake -B build \ - -DGGML_SYCL=ON \ - -DCMAKE_C_COMPILER=icx \ - -DCMAKE_CXX_COMPILER=icpx - cmake --build build --config Release -j $(nproc) - - ubuntu-22-cmake-sycl-fp16: - runs-on: ubuntu-22.04 - - continue-on-error: true - - steps: - - uses: actions/checkout@v4 - - - name: add oneAPI to apt - shell: bash - run: | - cd /tmp - wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main" - - - name: install oneAPI dpcpp compiler - shell: bash - run: | - sudo apt update - sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev - - - name: install oneAPI MKL library - shell: bash - run: | - sudo apt install intel-oneapi-mkl-devel - - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-22-cmake-sycl-fp16 - evict-old-files: 1d - - - name: Build - id: cmake_build - run: | - source /opt/intel/oneapi/setvars.sh - cmake -B build \ - -DGGML_SYCL=ON \ - -DCMAKE_C_COMPILER=icx \ - -DCMAKE_CXX_COMPILER=icpx \ - -DGGML_SYCL_F16=ON - cmake --build build --config Release -j $(nproc) - - build-linux-cross: - uses: ./.github/workflows/build-linux-cross.yml - - macOS-latest-cmake-ios: - runs-on: macos-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: macOS-latest-cmake-ios - evict-old-files: 1d - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - name: Build - id: cmake_build - run: | - sysctl -a - cmake -B build -G Xcode \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_BUILD_COMMON=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TOOLS=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_SYSTEM_NAME=iOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO - - macOS-latest-cmake-tvos: - runs-on: macos-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: macOS-latest-cmake-tvos - evict-old-files: 1d - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - 
name: Build - id: cmake_build - run: | - sysctl -a - cmake -B build -G Xcode \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_BUILD_COMMON=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TOOLS=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_SYSTEM_NAME=tvOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO - - macOS-latest-cmake-visionos: - runs-on: macos-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - name: Build - id: cmake_build - run: | - sysctl -a - cmake -B build -G Xcode \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_BUILD_COMMON=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TOOLS=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_SYSTEM_NAME=visionOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=1.0 \ - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO - - macOS-latest-swift: - runs-on: macos-latest - - strategy: - matrix: - destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS'] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: macOS-latest-swift - evict-old-files: 1d - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - name: Build llama.cpp with CMake - id: cmake_build - run: | - sysctl -a - cmake -B build -G Xcode \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_CURL=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TOOLS=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - - - name: xcodebuild for swift package - id: xcodebuild - run: | - ./build-xcframework.sh - - windows-msys2: - runs-on: windows-latest - - strategy: - fail-fast: false - matrix: - include: - - { sys: UCRT64, env: ucrt-x86_64, build: Release } - - { sys: CLANG64, env: clang-x86_64, build: Release } - - steps: - - name: Clone - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: windows-msys2 - variant: ccache - evict-old-files: 1d - - - name: Setup ${{ matrix.sys }} - uses: msys2/setup-msys2@v2 - with: - update: true - msystem: ${{matrix.sys}} - install: >- - base-devel - git - mingw-w64-${{matrix.env}}-toolchain - mingw-w64-${{matrix.env}}-cmake - mingw-w64-${{matrix.env}}-openblas - - - name: Build using CMake - shell: msys2 {0} - run: | - cmake -B build - cmake --build build --config ${{ matrix.build }} -j $(nproc) - - - name: Clean after building using CMake - shell: msys2 {0} - run: | - rm -rf build - - - name: Build using CMake w/ OpenBLAS - shell: msys2 {0} - run: | - cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS - cmake --build build --config ${{ matrix.build }} -j $(nproc) - - windows-latest-cmake: - runs-on: windows-latest - - env: - OPENBLAS_VERSION: 0.3.23 - SDE_VERSION: 9.33.0-2024-01-07 - VULKAN_VERSION: 1.4.309.0 - - strategy: - matrix: - include: - - build: 'cpu-x64 (static)' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake 
-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF' - - build: 'openblas-x64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' - - build: 'vulkan-x64' - defines: '-DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON' - - build: 'llvm-arm64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON' - - build: 'llvm-arm64-opencl-adreno' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON' - # - build: 'kompute-x64' - # defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON' - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: windows-latest-cmake-${{ matrix.build }} - variant: ccache - evict-old-files: 1d - - - name: Clone Kompute submodule - id: clone_kompute - if: ${{ matrix.build == 'kompute-x64' }} - run: | - git submodule update --init ggml/src/ggml-kompute/kompute - - - name: Download OpenBLAS - id: get_openblas - if: ${{ matrix.build == 'openblas-x64' }} - run: | - curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip" - curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE" - mkdir $env:RUNNER_TEMP/openblas - tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas - $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath) - $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim())) - $lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe') - & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll - - - name: Install Vulkan SDK - id: get_vulkan - if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }} - run: | - curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe" - & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install - Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}" - Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin" - - - name: Install Ninja - id: install_ninja - run: | - choco install ninja - - - name: Install OpenCL Headers and Libs - id: install_opencl - if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }} - run: | - git clone 
https://github.com/KhronosGroup/OpenCL-Headers - cd OpenCL-Headers - cmake -B build ` - -DBUILD_TESTING=OFF ` - -DOPENCL_HEADERS_BUILD_TESTING=OFF ` - -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF ` - -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" - cmake --build build --target install - git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader - cd OpenCL-ICD-Loader - cmake -B build-arm64-release ` - -A arm64 ` - -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" ` - -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" - cmake --build build-arm64-release --target install --config release - - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - - - name: Build - id: cmake_build - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} - run: | - cmake -S . -B build ${{ matrix.defines }} ` - -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" - cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} - cp $env:CURL_PATH/bin/libcurl-*.dll build/bin/Release - - - name: Add libopenblas.dll - id: add_libopenblas_dll - if: ${{ matrix.build == 'openblas-x64' }} - run: | - cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll - cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt - - - name: Test - id: cmake_test - if: ${{ matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' }} - run: | - cd build - ctest -L main -C Release --verbose --timeout 900 - - # TODO: disabled for now, consider adding tests for all CPU variants instead - # - name: Test (Intel SDE) - # id: cmake_test_sde - # if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation - # run: | - # curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz" - # # for some weird reason windows tar doesn't like sde tar.xz - # 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz - # 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar - # $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe) - # cd build - # $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1 - # & $sde -future -- ctest -L main -C Release --verbose --timeout 900 - - ubuntu-latest-cmake-cuda: - runs-on: ubuntu-latest - container: nvidia/cuda:12.6.2-devel-ubuntu24.04 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Install dependencies - env: - DEBIAN_FRONTEND: noninteractive - run: | - apt update - apt install -y cmake build-essential ninja-build libgomp1 git libcurl4-openssl-dev - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-latest-cmake-cuda - evict-old-files: 1d - - - name: Build with CMake - run: | - cmake -S . 
-B build -G Ninja \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_CUDA_ARCHITECTURES=89-real \ - -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DGGML_NATIVE=OFF \ - -DGGML_CUDA=ON - cmake --build build - - windows-2022-cmake-cuda: - runs-on: windows-2022 - - strategy: - matrix: - cuda: ['12.4'] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Install ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: windows-cuda-${{ matrix.cuda }} - variant: ccache - evict-old-files: 1d - - - name: Install Cuda Toolkit - uses: ./.github/actions/windows-setup-cuda - with: - cuda_version: ${{ matrix.cuda }} - - - name: Install Ninja - id: install_ninja - run: | - choco install ninja - - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - - - name: Build - id: cmake_build - shell: cmd - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} - run: | - call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 - cmake -S . -B build -G "Ninja Multi-Config" ^ - -DLLAMA_BUILD_SERVER=ON ^ - -DGGML_NATIVE=OFF ^ - -DGGML_BACKEND_DL=ON ^ - -DGGML_CPU_ALL_VARIANTS=ON ^ - -DGGML_CUDA=ON ^ - -DGGML_RPC=ON ^ - -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" - set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1 - cmake --build build --config Release -j %NINJA_JOBS% -t ggml - cmake --build build --config Release - - windows-latest-cmake-sycl: - runs-on: windows-latest - - defaults: - run: - shell: bash - - env: - WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7cd9bba0-7aab-4e30-b3ae-2221006a4a05/intel-oneapi-base-toolkit-2025.1.1.34_offline.exe - WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel - ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI" - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: windows-latest-cmake-sycl - variant: ccache - evict-old-files: 1d - - - name: Install - run: | - scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL - - # TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args - - - name: Build - id: cmake_build - run: examples/sycl/win-build-sycl.bat - - windows-latest-cmake-hip: - if: ${{ github.event.inputs.create_release != 'true' }} - runs-on: windows-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Clone rocWMMA repository - id: clone_rocwmma - run: | - git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1 - - - name: Install - id: depends - run: | - $ErrorActionPreference = "Stop" - write-host "Downloading AMD HIP SDK Installer" - Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" - write-host "Installing AMD HIP SDK" - Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait - write-host "Completed AMD HIP SDK installation" - - - name: Verify ROCm - id: verify - run: | - & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version - - - name: Install ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ${{ github.job }} - evict-old-files: 1d - - - name: libCURL - id: get_libcurl - uses: 
./.github/actions/windows-setup-curl - - - name: Build - id: cmake_build - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} - run: | - $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) - $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" - cmake -G "Unix Makefiles" -B build -S . ` - -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" ` - -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" ` - -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" ` - -DCMAKE_BUILD_TYPE=Release ` - -DGGML_HIP=ON ` - -DGGML_HIP_ROCWMMA_FATTN=ON ` - -DGGML_RPC=ON ` - -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" - cmake --build build -j ${env:NUMBER_OF_PROCESSORS} - - ios-xcode-build: - runs-on: macos-latest - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Build - id: cmake_build - run: | - sysctl -a - cmake -B build -G Xcode \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_CURL=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TOOLS=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_SYSTEM_NAME=iOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO - - - name: xcodebuild for swift package - id: xcodebuild - run: | - ./build-xcframework.sh - - - name: Build Xcode project - run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build - - android-build: - runs-on: ubuntu-latest - - steps: - - name: Clone - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: android-build - evict-old-files: 1d - - - name: Set up JDK - uses: actions/setup-java@v3 - with: - java-version: 17 - distribution: zulu - - - name: Setup Android SDK - uses: android-actions/setup-android@v3 - with: - log-accepted-android-sdk-licenses: false - - - name: Build - run: | - cd examples/llama.android - ./gradlew build --no-daemon - - openEuler-latest-cmake-cann: - if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }} - defaults: - run: - shell: bash -el {0} - strategy: - matrix: - arch: [x86, aarch64] - cann: - - '8.1.RC1.alpha001-910b-openeuler22.03-py3.10' - device: - - 'ascend910b3' - build: - - 'Release' - runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }} - container: ascendai/cann:${{ matrix.cann }} - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Dependencies - run: | - yum update -y - yum install -y git gcc gcc-c++ make cmake libcurl-devel - - - name: Build - run: | - export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH} - - cmake -S . 
-B build \ - -DCMAKE_BUILD_TYPE=${{ matrix.build }} \ - -DGGML_CANN=on \ - -DSOC_TYPE=${{ matrix.device }} - cmake --build build -j $(nproc) diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml deleted file mode 100644 index 276a217d45005..0000000000000 --- a/.github/workflows/close-issue.yml +++ /dev/null @@ -1,28 +0,0 @@ -name: Close inactive issues -on: - schedule: - - cron: "42 0 * * *" - -# Fine-grained permission -# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token -permissions: - issues: write - -jobs: - close-issues: - runs-on: ubuntu-latest - permissions: - issues: write - pull-requests: write - steps: - - uses: actions/stale@v5 - with: - exempt-issue-labels: "refactor,help wanted,good first issue,research,bug,roadmap" - days-before-issue-stale: 30 - days-before-issue-close: 14 - stale-issue-label: "stale" - close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale." - days-before-pr-stale: -1 - days-before-pr-close: -1 - operations-per-run: 10000 - repo-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml deleted file mode 100644 index 2067927be56ca..0000000000000 --- a/.github/workflows/docker.yml +++ /dev/null @@ -1,178 +0,0 @@ -# This workflow uses actions that are not certified by GitHub. -# They are provided by a third-party and are governed by -# separate terms of service, privacy policy, and support -# documentation. - -# GitHub recommends pinning actions to a commit SHA. -# To get a newer version, you will need to update the SHA. -# You can also reference a tag or branch, but the action may change without warning. 
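The comment block above recommends pinning third-party actions to a commit SHA. A minimal sketch of how to look up the SHA behind a tag, assuming only a local git client (the action and tag here are placeholder examples):

    # list tag refs; annotated tags also show a peeled ^{} line naming the commit
    git ls-remote --tags https://github.com/docker/build-push-action | grep v6
    # then pin in the workflow, keeping the tag as a trailing comment:
    #   uses: docker/build-push-action@<commit-sha>  # v6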
- -name: Publish Docker image - -on: - workflow_dispatch: # allows manual triggering - schedule: - # Rebuild daily rather than on every push because it is expensive - - cron: '12 4 * * *' - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -# Fine-grained permission -# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token -permissions: - packages: write - -jobs: - push_to_registry: - name: Push Docker image to the GitHub Container Registry - - runs-on: ubuntu-22.04 - env: - COMMIT_SHA: ${{ github.sha }} - strategy: - fail-fast: false - matrix: - config: - # Multi-stage build - # Note: the arm64 images are failing, which prevents the amd64 images from being built - # https://github.com/ggml-org/llama.cpp/issues/11888 - #- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false } - - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false } - - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false } - - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true } - - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true } - - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false } - # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete - #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true } - steps: - - name: Check out the repo - uses: actions/checkout@v4 - with: - fetch-depth: 0 # preserve git history, so we can determine the build number - - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - with: - image: tonistiigi/binfmt:qemu-v7.0.0-28 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Log in to the GitHub Container Registry - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case - REPO_NAME="${{ github.event.repository.name }}" - - # determine tag name postfix (build number, commit hash) - if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then - TAG_POSTFIX="-b${BUILD_NUMBER}" - else - SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-') - TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}" - fi - # list all tags possible - if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then - TYPE="" - else - TYPE="-${{ matrix.config.tag }}" - fi - PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:" - FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}" - LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}" - SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}" - echo 
"full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT - echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT - echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT - echo "full_output_tags=$FULLTAGS" # print out for debugging - echo "light_output_tags=$LIGHTTAGS" # print out for debugging - echo "server_output_tags=$SERVERTAGS" # print out for debugging - env: - GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }} - GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}' - - - name: Free Disk Space (Ubuntu) - if: ${{ matrix.config.free_disk_space == true }} - uses: ggml-org/free-disk-space@v1.3.1 - with: - # this might remove tools that are actually needed, - # if set to "true" but frees about 6 GB - tool-cache: false - - # all of these default to true, but feel free to set to - # "false" if necessary for your workflow - android: true - dotnet: true - haskell: true - large-packages: true - docker-images: true - swap-storage: true - - - name: Build and push Full Docker image (tagged + versioned) - if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }} - uses: docker/build-push-action@v6 - with: - context: . - push: true - platforms: ${{ matrix.config.platforms }} - # tag list is generated from step above - tags: ${{ steps.tag.outputs.full_output_tags }} - file: ${{ matrix.config.dockerfile }} - target: full - provenance: false - # using github experimental cache - cache-from: type=gha - cache-to: type=gha,mode=max - # return to this if the experimental github cache is having issues - #cache-to: type=local,dest=/tmp/.buildx-cache - #cache-from: type=local,src=/tmp/.buildx-cache - - - name: Build and push Light Docker image (tagged + versioned) - if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }} - uses: docker/build-push-action@v6 - with: - context: . - push: true - platforms: ${{ matrix.config.platforms }} - # tag list is generated from step above - tags: ${{ steps.tag.outputs.light_output_tags }} - file: ${{ matrix.config.dockerfile }} - target: light - provenance: false - # using github experimental cache - cache-from: type=gha - cache-to: type=gha,mode=max - # return to this if the experimental github cache is having issues - #cache-to: type=local,dest=/tmp/.buildx-cache - #cache-from: type=local,src=/tmp/.buildx-cache - - - name: Build and push Server Docker image (tagged + versioned) - if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }} - uses: docker/build-push-action@v6 - with: - context: . 
- push: true - platforms: ${{ matrix.config.platforms }} - # tag list is generated from step above - tags: ${{ steps.tag.outputs.server_output_tags }} - file: ${{ matrix.config.dockerfile }} - target: server - provenance: false - # using github experimental cache - cache-from: type=gha - cache-to: type=gha,mode=max - # return to this if the experimental github cache is having issues - #cache-to: type=local,dest=/tmp/.buildx-cache - #cache-from: type=local,src=/tmp/.buildx-cache diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml deleted file mode 100644 index f02b7c2194bcf..0000000000000 --- a/.github/workflows/editorconfig.yml +++ /dev/null @@ -1,29 +0,0 @@ -name: EditorConfig Checker - -on: - workflow_dispatch: # allows manual triggering - inputs: - create_release: - description: 'Create new release' - required: true - type: boolean - push: - branches: - - master - pull_request: - branches: - - master - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -jobs: - editorconfig: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: editorconfig-checker/action-editorconfig-checker@v2 - with: - version: v3.0.3 - - run: editorconfig-checker diff --git a/.github/workflows/gguf-publish.yml b/.github/workflows/gguf-publish.yml deleted file mode 100644 index 3ca4d30581074..0000000000000 --- a/.github/workflows/gguf-publish.yml +++ /dev/null @@ -1,44 +0,0 @@ -# This workflow will upload a Python Package using Twine when a GGUF release is created -# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries - -# See `gguf-py/README.md` for how to make a release. - -# This workflow uses actions that are not certified by GitHub. -# They are provided by a third-party and are governed by -# separate terms of service, privacy policy, and support -# documentation. 
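For context on the trigger below: the deploy job fires on pushed tags matching gguf-v*, so a gguf-py release is cut roughly as follows (a sketch; the version number is hypothetical and the authoritative steps live in gguf-py/README.md):

    git tag gguf-v0.10.0
    git push origin gguf-v0.10.0   # the tag push starts the deploy job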
- -name: Upload Python Package - -on: - workflow_dispatch: - push: - # Pattern matched against refs/tags - tags: - - 'gguf-v*' # Push events to every version tag - - -jobs: - deploy: - - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.9.x' - - name: Install dependencies - run: | - cd gguf-py - python -m pip install poetry - poetry install - - - name: Build package - run: cd gguf-py && poetry build - - name: Publish package - uses: pypa/gh-action-pypi-publish@release/v1 - with: - password: ${{ secrets.PYPI_API_TOKEN }} - packages-dir: gguf-py/dist diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml deleted file mode 100644 index 0b0f300aa402a..0000000000000 --- a/.github/workflows/labeler.yml +++ /dev/null @@ -1,17 +0,0 @@ -name: "Pull Request Labeler" -on: -- pull_request_target - -jobs: - labeler: - permissions: - contents: read - pull-requests: write - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - repository: "ggml-org/llama.cpp" - - uses: actions/labeler@v5 - with: - configuration-path: '.github/labeler.yml' diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml deleted file mode 100644 index 46e80aecd0a0c..0000000000000 --- a/.github/workflows/python-check-requirements.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: Python check requirements.txt - -on: - push: - paths: - - '.github/workflows/python-check-requirements.yml' - - 'scripts/check-requirements.sh' - - 'convert*.py' - - '**/requirements*.txt' - pull_request: - paths: - - '.github/workflows/python-check-requirements.yml' - - 'scripts/check-requirements.sh' - - 'convert*.py' - - '**/requirements*.txt' - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -jobs: - python-check-requirements: - runs-on: ubuntu-latest - name: check-requirements - steps: - - name: Check out source repository - uses: actions/checkout@v4 - - name: Set up Python environment - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - name: Run check-requirements.sh script - run: bash scripts/check-requirements.sh diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml deleted file mode 100644 index ddfdf73b8fce2..0000000000000 --- a/.github/workflows/python-lint.yml +++ /dev/null @@ -1,30 +0,0 @@ -name: flake8 Lint - -on: - push: - branches: - - master - paths: ['.github/workflows/python-lint.yml', '**/*.py'] - pull_request: - types: [opened, synchronize, reopened] - paths: ['.github/workflows/python-lint.yml', '**/*.py'] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -jobs: - flake8-lint: - runs-on: ubuntu-latest - name: Lint - steps: - - name: Check out source repository - uses: actions/checkout@v4 - - name: Set up Python environment - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - name: flake8 Lint - uses: py-actions/flake8@v2 - with: - plugins: "flake8-no-print" diff --git a/.github/workflows/python-type-check.yml b/.github/workflows/python-type-check.yml deleted file mode 100644 index 373bb601020b2..0000000000000 --- a/.github/workflows/python-type-check.yml +++ /dev/null @@ -1,40 +0,0 @@ -name: Python Type-Check - -on: - push: - paths: - - '.github/workflows/python-type-check.yml' - - 'pyrightconfig.json' - - '**.py' - - 
'**/requirements*.txt' - pull_request: - paths: - - '.github/workflows/python-type-check.yml' - - 'pyrightconfig.json' - - '**.py' - - '**/requirements*.txt' - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -jobs: - python-type-check: - runs-on: ubuntu-latest - name: pyright type-check - steps: - - name: Check out source repository - uses: actions/checkout@v4 - - name: Set up Python environment - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - name: Install Python dependencies - # TODO: use a venv - run: pip install -r requirements/requirements-all.txt - - name: Type-check with Pyright - uses: jakebailey/pyright-action@v2 - with: - version: 1.1.382 - level: warning - warnings: true diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml deleted file mode 100644 index 9874736cbd8de..0000000000000 --- a/.github/workflows/release.yml +++ /dev/null @@ -1,749 +0,0 @@ -name: Release - -on: - workflow_dispatch: # allows manual triggering - inputs: - create_release: - description: 'Create new release' - required: true - type: boolean - push: - branches: - - master - paths: ['.github/workflows/release.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp'] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -env: - BRANCH_NAME: ${{ github.head_ref || github.ref_name }} - CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON" - -jobs: - macOS-arm64: - runs-on: macos-14 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: macOS-latest-cmake-arm64 - evict-old-files: 1d - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - brew install curl - - - name: Build - id: cmake_build - run: | - sysctl -a - cmake -B build \ - -DCMAKE_BUILD_RPATH="@loader_path" \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DGGML_RPC=ON \ - ${{ env.CMAKE_ARGS }} - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - - - name: Determine tag name - id: tag - uses: ./.github/actions/get-tag-name - - - name: Pack artifacts - id: pack_artifacts - run: | - cp LICENSE ./build/bin/ - zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/* - - - name: Upload artifacts - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip - name: llama-bin-macos-arm64.zip - - macOS-x64: - runs-on: macos-13 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: macOS-latest-cmake-x64 - evict-old-files: 1d - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - brew install curl - - - name: Build - id: cmake_build - run: | - sysctl -a - # Metal is disabled due to intermittent failures with Github runners not having a GPU: - # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 - cmake -B build \ - -DCMAKE_BUILD_RPATH="@loader_path" \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DGGML_METAL=OFF \ - -DGGML_RPC=ON - cmake --build 
build --config Release -j $(sysctl -n hw.logicalcpu) - - - name: Determine tag name - id: tag - uses: ./.github/actions/get-tag-name - - - name: Pack artifacts - id: pack_artifacts - run: | - cp LICENSE ./build/bin/ - zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/* - - - name: Upload artifacts - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip - name: llama-bin-macos-x64.zip - - ubuntu-22-cpu: - strategy: - matrix: - include: - - build: 'x64' - os: ubuntu-22.04 - # GGML_BACKEND_DL and GGML_CPU_ALL_VARIANTS are not currently supported on arm - # - build: 'arm64' - # os: ubuntu-22.04-arm - - runs-on: ${{ matrix.os }} - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-cpu-cmake - evict-old-files: 1d - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev - - - name: Build - id: cmake_build - run: | - cmake -B build \ - -DGGML_BACKEND_DL=ON \ - -DGGML_NATIVE=OFF \ - -DGGML_CPU_ALL_VARIANTS=ON \ - -DLLAMA_FATAL_WARNINGS=ON \ - ${{ env.CMAKE_ARGS }} - cmake --build build --config Release -j $(nproc) - - - name: Determine tag name - id: tag - uses: ./.github/actions/get-tag-name - - - name: Pack artifacts - id: pack_artifacts - run: | - cp LICENSE ./build/bin/ - zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/* - - - name: Upload artifacts - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip - name: llama-bin-ubuntu-${{ matrix.build }}.zip - - ubuntu-22-vulkan: - runs-on: ubuntu-22.04 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-22-cmake-vulkan - evict-old-files: 1d - - - name: Dependencies - id: depends - run: | - wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add - - sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list - sudo apt-get update -y - sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev - - - name: Build - id: cmake_build - run: | - cmake -B build \ - -DGGML_BACKEND_DL=ON \ - -DGGML_NATIVE=OFF \ - -DGGML_CPU_ALL_VARIANTS=ON \ - -DGGML_VULKAN=ON \ - ${{ env.CMAKE_ARGS }} - cmake --build build --config Release -j $(nproc) - - - name: Determine tag name - id: tag - uses: ./.github/actions/get-tag-name - - - name: Pack artifacts - id: pack_artifacts - run: | - cp LICENSE ./build/bin/ - zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/* - - - name: Upload artifacts - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip - name: llama-bin-ubuntu-vulkan-x64.zip - - windows-cpu: - runs-on: windows-latest - - strategy: - matrix: - include: - - arch: 'x64' - - arch: 'arm64' - - steps: - - name: Clone - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: windows-latest-cmake-cpu-${{ matrix.arch }} - variant: ccache - evict-old-files: 1d - - - name: Install Ninja - run: | - choco install ninja - - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - 
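# Aside on the ubuntu-22-cpu job above: with -DGGML_BACKEND_DL=ON and
# -DGGML_CPU_ALL_VARIANTS=ON the build emits one dynamically loadable
# libggml-cpu-*.so per supported x86-64 microarchitecture and ggml picks the
# best match at runtime. A quick way to inspect a packed artifact
# (the file name is illustrative):
#   unzip -l llama-b5000-bin-ubuntu-x64.zip | grep ggml-cpu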
with: - architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }} - - - name: Build - shell: cmd - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} - run: | - call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch }} - cmake -S . -B build -G "Ninja Multi-Config" ^ - -D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^ - -DGGML_NATIVE=OFF ^ - -DGGML_BACKEND_DL=ON ^ - -DGGML_CPU_ALL_VARIANTS=${{ matrix.arch == 'x64' && 'ON' || 'OFF' }} ^ - -DGGML_OPENMP=ON ^ - -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^ - ${{ env.CMAKE_ARGS }} - cmake --build build --config Release - - - name: Pack artifacts - id: pack_artifacts - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} - run: | - Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\ - Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.42.34433\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\ - 7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\* - - - name: Upload artifacts - uses: actions/upload-artifact@v4 - with: - path: llama-bin-win-cpu-${{ matrix.arch }}.zip - name: llama-bin-win-cpu-${{ matrix.arch }}.zip - - windows: - runs-on: windows-latest - - env: - OPENBLAS_VERSION: 0.3.23 - VULKAN_VERSION: 1.4.309.0 - - strategy: - matrix: - include: - - backend: 'vulkan' - arch: 'x64' - defines: '-DGGML_VULKAN=ON' - target: 'ggml-vulkan' - - backend: 'opencl-adreno' - arch: 'arm64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON' - target: 'ggml-opencl' - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: windows-latest-cmake-${{ matrix.backend }}-${{ matrix.arch }} - variant: ccache - evict-old-files: 1d - - - name: Install Vulkan SDK - id: get_vulkan - if: ${{ matrix.backend == 'vulkan' }} - run: | - curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe" - & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install - Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}" - Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin" - - - name: Install Ninja - id: install_ninja - run: | - choco install ninja - - - name: Install OpenCL Headers and Libs - id: install_opencl - if: ${{ matrix.backend == 'opencl-adreno' && matrix.arch == 'arm64' }} - run: | - git clone https://github.com/KhronosGroup/OpenCL-Headers - cd OpenCL-Headers - cmake -B build ` - -DBUILD_TESTING=OFF ` - -DOPENCL_HEADERS_BUILD_TESTING=OFF ` - -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF ` - -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" - cmake --build build --target install - git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader - cd OpenCL-ICD-Loader - cmake -B build-arm64-release ` - -A arm64 ` - -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" ` - -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" - cmake --build build-arm64-release --target install --config release - - - name: 
Build - id: cmake_build - run: | - cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_CURL=OFF - cmake --build build --config Release --target ${{ matrix.target }} - - - name: Pack artifacts - id: pack_artifacts - run: | - 7z a llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll - - - name: Upload artifacts - uses: actions/upload-artifact@v4 - with: - path: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip - name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip - - windows-cuda: - runs-on: windows-2022 - - strategy: - matrix: - cuda: ['12.4'] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Install ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: windows-cuda-${{ matrix.cuda }} - variant: ccache - evict-old-files: 1d - - - name: Install Cuda Toolkit - uses: ./.github/actions/windows-setup-cuda - with: - cuda_version: ${{ matrix.cuda }} - - - name: Install Ninja - id: install_ninja - run: | - choco install ninja - - - name: Build - id: cmake_build - shell: cmd - run: | - call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 - cmake -S . -B build -G "Ninja Multi-Config" ^ - -DGGML_BACKEND_DL=ON ^ - -DGGML_NATIVE=OFF ^ - -DGGML_CPU=OFF ^ - -DGGML_CUDA=ON ^ - -DLLAMA_CURL=OFF - set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1 - cmake --build build --config Release -j %NINJA_JOBS% --target ggml-cuda - - - name: Pack artifacts - id: pack_artifacts - run: | - 7z a llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll - - - name: Upload artifacts - uses: actions/upload-artifact@v4 - with: - path: llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip - name: llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip - - - name: Copy and pack Cuda runtime - run: | - echo "Cuda install location: ${{ env.CUDA_PATH }}" - $dst='.\build\bin\cudart\' - robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll - robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll - 7z a cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip $dst\* - - - name: Upload Cuda runtime - uses: actions/upload-artifact@v4 - with: - path: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip - name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip - - windows-sycl: - runs-on: windows-latest - - defaults: - run: - shell: bash - - env: - WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7cd9bba0-7aab-4e30-b3ae-2221006a4a05/intel-oneapi-base-toolkit-2025.1.1.34_offline.exe - WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel - ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI" - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: windows-latest-cmake-sycl - variant: ccache - evict-old-files: 1d - - - name: Install - run: | - scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL - - - name: Build - id: cmake_build - shell: cmd - run: | - call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force - cmake -G "Ninja" -B build ^ - -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^ - -DCMAKE_BUILD_TYPE=Release ^ - -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^ - -DGGML_CPU=OFF -DGGML_SYCL=ON ^ - -DLLAMA_CURL=OFF - cmake --build build --target 
ggml-sycl -j - - - name: Build the release package - id: pack_artifacts - run: | - echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin" - - cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin - - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin - - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin - - cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin - - echo "cp oneAPI running time dll files to ./build/bin done" - 7z a llama-bin-win-sycl-x64.zip ./build/bin/* - - - name: Upload the release package - uses: actions/upload-artifact@v4 - with: - path: llama-bin-win-sycl-x64.zip - name: llama-bin-win-sycl-x64.zip - - windows-hip: - runs-on: windows-latest - - strategy: - matrix: - include: - - name: "radeon" - gpu_targets: "gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032" - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Clone rocWMMA repository - id: clone_rocwmma - run: | - git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: windows-latest-cmake-hip-${{ matrix.name }}-x64 - evict-old-files: 1d - - - name: Install - id: depends - run: | - $ErrorActionPreference = "Stop" - write-host "Downloading AMD HIP SDK Installer" - Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" - write-host "Installing AMD HIP SDK" - Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait - write-host "Completed AMD HIP SDK installation" - - - name: Verify ROCm - id: verify - run: | - & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version - - - name: Build - id: cmake_build - run: | - $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) - $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" - cmake -G "Unix Makefiles" -B build -S . 
` - -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" ` - -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" ` - -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/ -Wno-ignored-attributes -Wno-nested-anon-types" ` - -DCMAKE_BUILD_TYPE=Release ` - -DGGML_BACKEND_DL=ON ` - -DGGML_NATIVE=OFF ` - -DGGML_CPU=OFF ` - -DAMDGPU_TARGETS="${{ matrix.gpu_targets }}" ` - -DGGML_HIP_ROCWMMA_FATTN=ON ` - -DGGML_HIP=ON ` - -DLLAMA_CURL=OFF - cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS} - md "build\bin\rocblas\library\" - cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\" - cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\" - cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\" - - - name: Pack artifacts - id: pack_artifacts - run: | - 7z a llama-bin-win-hip-${{ matrix.name }}-x64.zip .\build\bin\* - - - name: Upload artifacts - uses: actions/upload-artifact@v4 - with: - path: llama-bin-win-hip-${{ matrix.name }}-x64.zip - name: llama-bin-win-hip-${{ matrix.name }}-x64.zip - - ios-xcode-build: - runs-on: macos-latest - - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Build - id: cmake_build - run: | - sysctl -a - cmake -B build -G Xcode \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_CURL=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TOOLS=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_SYSTEM_NAME=iOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO - - - name: xcodebuild for swift package - id: xcodebuild - run: | - ./build-xcframework.sh - - - name: Build Xcode project - run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build - - - name: Determine tag name - id: tag - uses: ./.github/actions/get-tag-name - - - name: Pack artifacts - id: pack_artifacts - run: | - zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework - - - name: Upload artifacts - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-xcframework.zip - name: llama-${{ steps.tag.outputs.name }}-xcframework - - release: - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - - # Fine-grained permission - # https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token - permissions: - contents: write # for creating release - - runs-on: ubuntu-latest - - needs: - - windows - - windows-cpu - - windows-cuda - - windows-sycl - - windows-hip - - ubuntu-22-cpu - - ubuntu-22-vulkan - - macOS-arm64 - - macOS-x64 - - ios-xcode-build - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Determine tag name - id: tag - uses: ./.github/actions/get-tag-name - - - name: Download artifacts - id: download-artifact - uses: actions/download-artifact@v4 - with: - path: ./artifact - merge-multiple: true - - - name: Move artifacts - id: move_artifacts - run: | - mkdir -p release - - echo "Adding CPU backend files to existing zips..." 
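# Effect of the loop below, shown on hypothetical artifact names: the contents
# of llama-bin-win-cpu-x64.zip (the CPU backend DLLs) are appended to every
# other llama-bin-win-*-x64.zip so that each backend zip is self-contained;
# every zip is then renamed to llama-<tag>-bin-win-*.zip and moved to release/.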
- for arch in x64 arm64; do - cpu_zip="artifact/llama-bin-win-cpu-${arch}.zip" - temp_dir=$(mktemp -d) - echo "Extracting CPU backend for $arch..." - unzip "$cpu_zip" -d "$temp_dir" - - echo "Adding CPU files to $arch zips..." - for target_zip in artifact/llama-bin-win-*-${arch}.zip; do - if [[ "$target_zip" == "$cpu_zip" ]]; then - continue - fi - echo "Adding CPU backend to $(basename "$target_zip")" - realpath_target_zip=$(realpath "$target_zip") - (cd "$temp_dir" && zip -r "$realpath_target_zip" .) - done - - rm -rf "$temp_dir" - done - - echo "Renaming and moving zips to release..." - for zip_file in artifact/llama-bin-win-*.zip; do - base_name=$(basename "$zip_file" .zip) - zip_name="llama-${{ steps.tag.outputs.name }}-${base_name#llama-}.zip" - echo "Moving $zip_file to release/$zip_name" - mv "$zip_file" "release/$zip_name" - done - - echo "Moving other artifacts..." - mv -v artifact/*.zip release - - - name: Create release - id: create_release - uses: ggml-org/action-create-release@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - tag_name: ${{ steps.tag.outputs.name }} - - - name: Upload release - id: upload_release - uses: actions/github-script@v3 - with: - github-token: ${{secrets.GITHUB_TOKEN}} - script: | - const path = require('path'); - const fs = require('fs'); - const release_id = '${{ steps.create_release.outputs.id }}'; - for (let file of await fs.readdirSync('./release')) { - if (path.extname(file) === '.zip') { - console.log('uploadReleaseAsset', file); - await github.repos.uploadReleaseAsset({ - owner: context.repo.owner, - repo: context.repo.repo, - release_id: release_id, - name: file, - data: await fs.readFileSync(`./release/${file}`) - }); - } - } diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml deleted file mode 100644 index f6da488576937..0000000000000 --- a/.github/workflows/server.yml +++ /dev/null @@ -1,237 +0,0 @@ -# Server build and tests -name: Server - -on: - workflow_dispatch: # allows manual triggering - inputs: - sha: - description: 'Commit SHA1 to build' - required: false - type: string - slow_tests: - description: 'Run slow tests' - required: true - type: boolean - push: - branches: - - master - paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*'] - pull_request: - types: [opened, synchronize, reopened] - paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*'] - -env: - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 - LLAMA_LOG_VERBOSITY: 10 - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - server: - runs-on: ubuntu-latest - - strategy: - matrix: - sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken - build_type: [RelWithDebInfo] - include: - - build_type: Release - sanitizer: "" - fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken - - steps: - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get -y install \ - build-essential \ - xxd \ - git \ - cmake \ - curl \ - wget \ - language-pack-en \ - libcurl4-openssl-dev - - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name 
}} - - - name: Python setup - id: setup_python - uses: actions/setup-python@v5 - with: - python-version: '3.11' - - - name: Tests dependencies - id: test_dependencies - run: | - pip install -r tools/server/tests/requirements.txt - - # Setup nodejs (to be used for verifying bundled index.html) - - uses: actions/setup-node@v4 - with: - node-version: '22.11.0' - - - name: WebUI - Install dependencies - id: webui_lint - run: | - cd tools/server/webui - npm ci - - - name: WebUI - Check code format - id: webui_format - run: | - git config --global --add safe.directory $(realpath .) - cd tools/server/webui - git status - - npm run format - git status - modified_files="$(git status -s)" - echo "Modified files: ${modified_files}" - if [ -n "${modified_files}" ]; then - echo "Files do not follow coding style. To fix: npm run format" - echo "${modified_files}" - exit 1 - fi - - - name: Verify bundled index.html - id: verify_server_index_html - run: | - git config --global --add safe.directory $(realpath .) - cd tools/server/webui - git status - - npm run build - git status - modified_files="$(git status -s)" - echo "Modified files: ${modified_files}" - if [ -n "${modified_files}" ]; then - echo "Repository is dirty or server/webui is not built as expected" - echo "Hint: You may need to follow Web UI build guide in server/README.md" - echo "${modified_files}" - exit 1 - fi - - - name: Build (no OpenMP) - id: cmake_build_no_openmp - if: ${{ matrix.sanitizer == 'THREAD' }} - run: | - cmake -B build \ - -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ - -DGGML_OPENMP=OFF ; - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server - - - name: Build (sanitizers) - id: cmake_build_sanitizers - if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }} - run: | - cmake -B build \ - -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ; - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server - - - name: Build (sanitizers) - id: cmake_build - if: ${{ matrix.sanitizer == '' }} - run: | - cmake -B build \ - -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ; - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server - - - name: Tests - id: server_integration_tests - if: ${{ matrix.sanitizer == '' }} - env: - GITHUB_ACTIONS: "true" - run: | - cd tools/server/tests - ./tests.sh - - - name: Tests (sanitizers) - id: server_integration_tests_sanitizers - if: ${{ matrix.sanitizer != '' }} - run: | - cd tools/server/tests - LLAMA_SANITIZE=1 ./tests.sh - - - name: Slow tests - id: server_integration_tests_slow - if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} - run: | - cd tools/server/tests - SLOW_TESTS=1 ./tests.sh - - - server-windows: - runs-on: windows-2022 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - - - name: Build - id: cmake_build - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} - run: | - cmake -B build 
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" - cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server - - - name: Python setup - id: setup_python - uses: actions/setup-python@v5 - with: - python-version: '3.11' - - - name: Tests dependencies - id: test_dependencies - run: | - pip install -r tools/server/tests/requirements.txt - - - name: Copy Libcurl - id: prepare_libcurl - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} - run: | - cp $env:CURL_PATH/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll - - - name: Tests - id: server_integration_tests - if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }} - run: | - cd tools/server/tests - $env:PYTHONIOENCODING = ":replace" - pytest -v -x -m "not slow" - - - name: Slow tests - id: server_integration_tests_slow - if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} - run: | - cd tools/server/tests - $env:SLOW_TESTS = "1" - pytest -v -x diff --git a/.github/workflows/winget.yml b/.github/workflows/winget.yml deleted file mode 100644 index 5c286155951e5..0000000000000 --- a/.github/workflows/winget.yml +++ /dev/null @@ -1,42 +0,0 @@ -name: Update Winget Package - -on: - workflow_dispatch: # allows manual triggering - schedule: - - cron: '28 5 * * *' # Update every day at 5:28 UTC - -jobs: - update: - name: Update Winget Package - runs-on: ubuntu-latest - - steps: - - name: Install cargo binstall - uses: cargo-bins/cargo-binstall@268643a6b5ea099f5718ee5cd3ff7dc89a5eb49b - - - name: Install komac - run: | - cargo binstall komac@2.11.2 -y - - - name: Find latest release - id: find_latest_release - uses: actions/github-script@v6 - with: - script: | - const { data: releases } = await github.rest.repos.listReleases({ - owner: context.repo.owner, - repo: context.repo.repo, - }); - console.log("Latest release:", releases[0].tag_name); - return releases[0].tag_name; - - - name: Update manifest - env: - VERSION: ${{ steps.find_latest_release.outputs.result }} - run: | - echo "Updating manifest..." 
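- # komac rewrites the winget manifest for the new version and opens the submission PR (--submit) using the token below.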
- komac update --version ${{ env.VERSION }} \ - --urls "https://github.com/ggml-org/llama.cpp/releases/download/${{ env.VERSION }}/llama-${{ env.VERSION }}-bin-win-vulkan-x64.zip" \ - --token ${{ secrets.WINGET_GITHUB_TOKEN }} \ - --submit \ - ggml.llamacpp diff --git a/CMakePresets.json b/CMakePresets.json index e9844701304fc..0228fc71edfc6 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -30,6 +30,8 @@ { "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } }, { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } }, { "name": "vulkan", "hidden": true, "cacheVariables": { "GGML_VULKAN": "ON" } }, + { "name": "remoting_frontend", "hidden": true, "cacheVariables": { "GGML_REMOTING_FRONTEND": "ON" } }, + { "name": "remoting_backend", "hidden": true, "cacheVariables": { "GGML_REMOTING_BACKEND": "ON" } }, { "name": "x64-windows-llvm", "hidden": true, diff --git a/Makefile b/Makefile index ac442aec095d6..128c684c5edd5 100644 --- a/Makefile +++ b/Makefile @@ -716,6 +716,16 @@ ggml/src/ggml-cuda/ggml-cuda.o: \ $(NVCC_COMPILE) endif # GGML_CUDA +ifdef GGML_REMOTING_FRONTEND + MK_CPPFLAGS += -DGGML_USE_REMOTINGFRONTEND + OBJ_GGML_EXT += ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.o +endif + +ifdef GGML_REMOTING_BACKEND + MK_CPPFLAGS += -DGGML_USE_REMOTINGBACKEND + OBJ_GGML_EXT += ggml/src/ggml-remotingbackend/ggml-remoting-backend.o +endif + ifdef GGML_VULKAN MK_CPPFLAGS += -DGGML_USE_VULKAN MK_LDFLAGS += $(shell pkg-config --libs vulkan) @@ -755,6 +765,12 @@ _ggml_vk_shader_deps = $(echo $(_ggml_vk_input_dir)/*.comp) ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source) $(CXX) $(CXXFLAGS) $(shell pkg-config --cflags vulkan) -c $< -o $@ +ggml/src/ggml-remotingfrontend/frontend.o: ggml/src/ggml-remotingfrontend/frontend.cpp + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ggml/src/ggml-remotingbackend/backend.o: ggml/src/ggml-remotingbackend/backend.cpp + $(CXX) $(CXXFLAGS) -c $< -o $@ + $(_ggml_vk_header): $(_ggml_vk_source) $(_ggml_vk_source): $(_ggml_vk_shader_deps) vulkan-shaders-gen diff --git a/build-xcframework.sh b/build-xcframework.sh deleted file mode 100755 index a08419a801b47..0000000000000 --- a/build-xcframework.sh +++ /dev/null @@ -1,541 +0,0 @@ -#!/bin/bash -# -# Options -IOS_MIN_OS_VERSION=16.4 -MACOS_MIN_OS_VERSION=13.3 -VISIONOS_MIN_OS_VERSION=1.0 -TVOS_MIN_OS_VERSION=16.4 - -BUILD_SHARED_LIBS=OFF -LLAMA_BUILD_EXAMPLES=OFF -LLAMA_BUILD_TOOLS=OFF -LLAMA_BUILD_TESTS=OFF -LLAMA_BUILD_SERVER=OFF -GGML_METAL=ON -GGML_METAL_EMBED_LIBRARY=ON -GGML_BLAS_DEFAULT=ON -GGML_METAL_USE_BF16=ON -GGML_OPENMP=OFF - -COMMON_C_FLAGS="-Wno-macro-redefined -Wno-shorten-64-to-32 -Wno-unused-command-line-argument -g" -COMMON_CXX_FLAGS="-Wno-macro-redefined -Wno-shorten-64-to-32 -Wno-unused-command-line-argument -g" - -# Common options for all builds -COMMON_CMAKE_ARGS=( - -DCMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_REQUIRED=NO - -DCMAKE_XCODE_ATTRIBUTE_CODE_SIGN_IDENTITY="" - -DCMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_ALLOWED=NO - -DCMAKE_XCODE_ATTRIBUTE_DEBUG_INFORMATION_FORMAT="dwarf-with-dsym" - -DCMAKE_XCODE_ATTRIBUTE_GCC_GENERATE_DEBUGGING_SYMBOLS=YES - -DCMAKE_XCODE_ATTRIBUTE_COPY_PHASE_STRIP=NO - -DCMAKE_XCODE_ATTRIBUTE_STRIP_INSTALLED_PRODUCT=NO - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} - -DLLAMA_BUILD_EXAMPLES=${LLAMA_BUILD_EXAMPLES} - -DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS} - -DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS} - 
-DLLAMA_BUILD_SERVER=${LLAMA_BUILD_SERVER} - -DGGML_METAL_EMBED_LIBRARY=${GGML_METAL_EMBED_LIBRARY} - -DGGML_BLAS_DEFAULT=${GGML_BLAS_DEFAULT} - -DGGML_METAL=${GGML_METAL} - -DGGML_METAL_USE_BF16=${GGML_METAL_USE_BF16} - -DGGML_NATIVE=OFF - -DGGML_OPENMP=${GGML_OPENMP} -) - -XCODE_VERSION=$(xcodebuild -version 2>/dev/null | head -n1 | awk '{ print $2 }') -MAJOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f1) -MINOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f2) -echo "Detected Xcode version: $XCODE_VERSION" - -check_required_tool() { - local tool=$1 - local install_message=$2 - - if ! command -v $tool &> /dev/null; then - echo "Error: $tool is required but not found." - echo "$install_message" - exit 1 - fi -} -echo "Checking for required tools..." -check_required_tool "cmake" "Please install CMake 3.28.0 or later (brew install cmake)" -check_required_tool "xcodebuild" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)" -check_required_tool "libtool" "Please install libtool which should be available with Xcode Command Line Tools (CLT). Make sure Xcode CLT is installed (xcode-select --install)" -check_required_tool "dsymutil" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)" - -set -e - -## Clean up previous builds -rm -rf build-apple -rm -rf build-ios-sim -rm -rf build-ios-device -rm -rf build-macos -rm -rf build-visionos -rm -rf build-visionos-sim -rm -rf build-tvos-sim -rm -rf build-tvos-device - -# Setup the xcframework build directory structure -setup_framework_structure() { - local build_dir=$1 - local min_os_version=$2 - local platform=$3 # "ios", "macos", "visionos", or "tvos" - local framework_name="llama" - - echo "Creating ${platform}-style framework structure for ${build_dir}" - - if [[ "$platform" == "macos" ]]; then - # macOS versioned structure uses versioned directories - mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Headers - mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Modules - mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Resources - - # Create symbolic links - ln -sf A ${build_dir}/framework/${framework_name}.framework/Versions/Current - ln -sf Versions/Current/Headers ${build_dir}/framework/${framework_name}.framework/Headers - ln -sf Versions/Current/Modules ${build_dir}/framework/${framework_name}.framework/Modules - ln -sf Versions/Current/Resources ${build_dir}/framework/${framework_name}.framework/Resources - ln -sf Versions/Current/${framework_name} ${build_dir}/framework/${framework_name}.framework/${framework_name} - - # Set header and module paths - local header_path=${build_dir}/framework/${framework_name}.framework/Versions/A/Headers/ - local module_path=${build_dir}/framework/${framework_name}.framework/Versions/A/Modules/ - else - # iOS/VisionOS/tvOS use a flat structure - mkdir -p ${build_dir}/framework/${framework_name}.framework/Headers - mkdir -p ${build_dir}/framework/${framework_name}.framework/Modules - - # Remove any existing structure to ensure clean build - rm -rf ${build_dir}/framework/${framework_name}.framework/Versions - - # Set header and module paths - local header_path=${build_dir}/framework/${framework_name}.framework/Headers/ - local module_path=${build_dir}/framework/${framework_name}.framework/Modules/ - fi - - # Copy all required headers (common for all platforms) - cp include/llama.h ${header_path} - cp ggml/include/ggml.h ${header_path} - cp ggml/include/ggml-opt.h ${header_path} - cp ggml/include/ggml-alloc.h 
${header_path} - cp ggml/include/ggml-backend.h ${header_path} - cp ggml/include/ggml-metal.h ${header_path} - cp ggml/include/ggml-cpu.h ${header_path} - cp ggml/include/ggml-blas.h ${header_path} - cp ggml/include/gguf.h ${header_path} - - # Create module map (common for all platforms) - cat > ${module_path}module.modulemap << EOF -framework module llama { - header "llama.h" - header "ggml.h" - header "ggml-alloc.h" - header "ggml-backend.h" - header "ggml-metal.h" - header "ggml-cpu.h" - header "ggml-blas.h" - header "gguf.h" - - link "c++" - link framework "Accelerate" - link framework "Metal" - link framework "Foundation" - - export * -} -EOF - - # Platform-specific settings for Info.plist - local platform_name="" - local sdk_name="" - local supported_platform="" - - case "$platform" in - "ios") - platform_name="iphoneos" - sdk_name="iphoneos${min_os_version}" - supported_platform="iPhoneOS" - local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist" - local device_family=' UIDeviceFamily - - 1 - 2 - ' - ;; - "macos") - platform_name="macosx" - sdk_name="macosx${min_os_version}" - supported_platform="MacOSX" - local plist_path="${build_dir}/framework/${framework_name}.framework/Versions/A/Resources/Info.plist" - local device_family="" - ;; - "visionos") - platform_name="xros" - sdk_name="xros${min_os_version}" - supported_platform="XRPlatform" - local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist" - local device_family="" - ;; - "tvos") - platform_name="appletvos" - sdk_name="appletvos${min_os_version}" - supported_platform="AppleTVOS" - local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist" - local device_family=' UIDeviceFamily - - 3 - ' - ;; - esac - - # Create Info.plist - cat > ${plist_path} << EOF - - - - - CFBundleDevelopmentRegion - en - CFBundleExecutable - llama - CFBundleIdentifier - org.ggml.llama - CFBundleInfoDictionaryVersion - 6.0 - CFBundleName - llama - CFBundlePackageType - FMWK - CFBundleShortVersionString - 1.0 - CFBundleVersion - 1 - MinimumOSVersion - ${min_os_version} - CFBundleSupportedPlatforms - - ${supported_platform} - ${device_family} - DTPlatformName - ${platform_name} - DTSDKName - ${sdk_name} - - -EOF -} - -# Create dynamic libraries from static libraries. 
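-# Usage: combine_static_libraries <build_dir> <release_dir> <platform> <is_simulator>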
-combine_static_libraries() { - local build_dir="$1" - local release_dir="$2" - local platform="$3" # "ios", "macos", "visionos", or "tvos" - local is_simulator="$4" - local base_dir="$(pwd)" - local framework_name="llama" - - # Determine output path based on platform - local output_lib="" - if [[ "$platform" == "macos" ]]; then - # macOS uses versioned structure - output_lib="${build_dir}/framework/${framework_name}.framework/Versions/A/${framework_name}" - else - # iOS, visionOS, and tvOS use a directory flat structure - output_lib="${build_dir}/framework/${framework_name}.framework/${framework_name}" - fi - - local libs=( - "${base_dir}/${build_dir}/src/${release_dir}/libllama.a" - "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml.a" - "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-base.a" - "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-cpu.a" - "${base_dir}/${build_dir}/ggml/src/ggml-metal/${release_dir}/libggml-metal.a" - "${base_dir}/${build_dir}/ggml/src/ggml-blas/${release_dir}/libggml-blas.a" - ) - - # Create temporary directory for processing - local temp_dir="${base_dir}/${build_dir}/temp" - mkdir -p "${temp_dir}" - - # Since we have multiple architectures libtool will find object files that do not - # match the target architecture. We suppress these warnings. - libtool -static -o "${temp_dir}/combined.a" "${libs[@]}" 2> /dev/null - - # Determine SDK, architectures, and install_name based on platform and simulator flag. - local sdk="" - local archs="" - local min_version_flag="" - local install_name="" - - case "$platform" in - "ios") - if [[ "$is_simulator" == "true" ]]; then - sdk="iphonesimulator" - archs="arm64 x86_64" - min_version_flag="-mios-simulator-version-min=${IOS_MIN_OS_VERSION}" - else - sdk="iphoneos" - archs="arm64" - min_version_flag="-mios-version-min=${IOS_MIN_OS_VERSION}" - fi - install_name="@rpath/llama.framework/llama" - ;; - "macos") - sdk="macosx" - archs="arm64 x86_64" - min_version_flag="-mmacosx-version-min=${MACOS_MIN_OS_VERSION}" - install_name="@rpath/llama.framework/Versions/Current/llama" - ;; - "visionos") - if [[ "$is_simulator" == "true" ]]; then - sdk="xrsimulator" - archs="arm64 x86_64" - min_version_flag="-mtargetos=xros${VISIONOS_MIN_OS_VERSION}-simulator" - else - sdk="xros" - archs="arm64" - min_version_flag="-mtargetos=xros${VISIONOS_MIN_OS_VERSION}" - fi - # Use flat structure for visionOS, same as iOS - install_name="@rpath/llama.framework/llama" - ;; - "tvos") - if [[ "$is_simulator" == "true" ]]; then - sdk="appletvsimulator" - archs="arm64 x86_64" - min_version_flag="-mtvos-simulator-version-min=${TVOS_MIN_OS_VERSION}" - else - sdk="appletvos" - archs="arm64" - min_version_flag="-mtvos-version-min=${TVOS_MIN_OS_VERSION}" - fi - install_name="@rpath/llama.framework/llama" - ;; - esac - - # Build architecture flags - local arch_flags="" - for arch in $archs; do - arch_flags+=" -arch $arch" - done - - # Create dynamic library - echo "Creating dynamic library for ${platform}." 
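- # -Wl,-force_load keeps every object file from the combined archive, even those with no referenced symbols.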
- xcrun -sdk $sdk clang++ -dynamiclib \ - -isysroot $(xcrun --sdk $sdk --show-sdk-path) \ - $arch_flags \ - $min_version_flag \ - -Wl,-force_load,"${temp_dir}/combined.a" \ - -framework Foundation -framework Metal -framework Accelerate \ - -install_name "$install_name" \ - -o "${base_dir}/${output_lib}" - - # Platform-specific post-processing for device builds - if [[ "$is_simulator" == "false" ]]; then - if command -v xcrun vtool &>/dev/null; then - case "$platform" in - "ios") - echo "Marking binary as a framework binary for iOS..." - xcrun vtool -set-build-version ios ${IOS_MIN_OS_VERSION} ${IOS_MIN_OS_VERSION} -replace \ - -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}" - ;; - "visionos") - echo "Marking binary as a framework binary for visionOS..." - if [[ "$MAJOR_VERSION" -gt 16 ]] || [[ "$MAJOR_VERSION" -eq 16 && "$MINOR_VERSION" -gt 2 ]]; then - echo "Xcode version greater than 16.2, using visionOS." - VISION_OS_BUILD_VERSION="visionos" - else - echo "Xcode version less than or equal to 16.2, using xros." - VISION_OS_BUILD_VERSION="xros" - fi - xcrun vtool -set-build-version ${VISION_OS_BUILD_VERSION} ${VISIONOS_MIN_OS_VERSION} ${VISIONOS_MIN_OS_VERSION} -replace \ - -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}" - ;; - "tvos") - echo "Marking binary as a framework binary for tvOS..." - xcrun vtool -set-build-version tvos ${TVOS_MIN_OS_VERSION} ${TVOS_MIN_OS_VERSION} -replace \ - -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}" - ;; - esac - else - echo "Warning: vtool not found. Binary may not pass App Store validation." - fi - fi - - echo "Creating properly formatted dSYM..." - # Create a separate directory for dSYMs for all platforms - mkdir -p "${base_dir}/${build_dir}/dSYMs" - - # iOS and visionOS style dSYM (flat structure) - if [[ "$platform" == "ios" || "$platform" == "visionos" || "$platform" == "tvos" ]]; then - # Generate dSYM in the dSYMs directory - xcrun dsymutil "${base_dir}/${output_lib}" -o "${base_dir}/${build_dir}/dSYMs/llama.dSYM" - - # Create a copy of the binary that will be stripped - cp "${base_dir}/${output_lib}" "${temp_dir}/binary_to_strip" - - # Strip debug symbols from the copy - xcrun strip -S "${temp_dir}/binary_to_strip" -o "${temp_dir}/stripped_lib" - - # Replace the original with the stripped version - mv "${temp_dir}/stripped_lib" "${base_dir}/${output_lib}" - else - # macOS style dSYM - # First strip debug info to a separate file - xcrun strip -S "${base_dir}/${output_lib}" -o "${temp_dir}/stripped_lib" - - # Generate dSYM in the dSYMs directory - xcrun dsymutil "${base_dir}/${output_lib}" -o "${base_dir}/${build_dir}/dSYMs/llama.dSYM" - - # Replace original binary with stripped version - mv "${temp_dir}/stripped_lib" "${base_dir}/${output_lib}" - fi - - # Remove any automatically generated dSYM files in the framework structure as they will - # otherwise case Invalid Bundle Structure validation errors. - if [ -d "${base_dir}/${output_lib}.dSYM" ]; then - echo "Removing generated dSYM file in framework structure: ${base_dir}/${output_lib}.dSYM" - rm -rf "${base_dir}/${output_lib}.dSYM" - fi - - # Clean up - rm -rf "${temp_dir}" -} - -echo "Building for iOS simulator..." 
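-# One build tree per platform/SDK pair; the trees are merged into a single llama.xcframework at the end of the script.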
-cmake -B build-ios-sim -G Xcode \ - "${COMMON_CMAKE_ARGS[@]}" \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=${IOS_MIN_OS_VERSION} \ - -DIOS=ON \ - -DCMAKE_SYSTEM_NAME=iOS \ - -DCMAKE_OSX_SYSROOT=iphonesimulator \ - -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \ - -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphonesimulator \ - -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ - -S . -cmake --build build-ios-sim --config Release -- -quiet - -echo "Building for iOS devices..." -cmake -B build-ios-device -G Xcode \ - "${COMMON_CMAKE_ARGS[@]}" \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=${IOS_MIN_OS_VERSION} \ - -DCMAKE_OSX_SYSROOT=iphoneos \ - -DCMAKE_OSX_ARCHITECTURES="arm64" \ - -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphoneos \ - -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ - -S . -cmake --build build-ios-device --config Release -- -quiet - -echo "Building for macOS..." -cmake -B build-macos -G Xcode \ - "${COMMON_CMAKE_ARGS[@]}" \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=${MACOS_MIN_OS_VERSION} \ - -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \ - -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ - -S . -cmake --build build-macos --config Release -- -quiet - -echo "Building for visionOS..." -cmake -B build-visionos -G Xcode \ - "${COMMON_CMAKE_ARGS[@]}" \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=${VISIONOS_MIN_OS_VERSION} \ - -DCMAKE_OSX_ARCHITECTURES="arm64" \ - -DCMAKE_SYSTEM_NAME=visionOS \ - -DCMAKE_OSX_SYSROOT=xros \ - -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \ - -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ - -S . -cmake --build build-visionos --config Release -- -quiet - -echo "Building for visionOS simulator..." -cmake -B build-visionos-sim -G Xcode \ - "${COMMON_CMAKE_ARGS[@]}" \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=${VISIONOS_MIN_OS_VERSION} \ - -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \ - -DCMAKE_SYSTEM_NAME=visionOS \ - -DCMAKE_OSX_SYSROOT=xrsimulator \ - -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \ - -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ - -S . -cmake --build build-visionos-sim --config Release -- -quiet - -# Add tvOS builds (might need the same u_int definitions as watchOS and visionOS) -echo "Building for tvOS simulator..." -cmake -B build-tvos-sim -G Xcode \ - "${COMMON_CMAKE_ARGS[@]}" \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=${TVOS_MIN_OS_VERSION} \ - -DCMAKE_SYSTEM_NAME=tvOS \ - -DCMAKE_OSX_SYSROOT=appletvsimulator \ - -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \ - -DGGML_METAL=ON \ - -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvsimulator \ - -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ - -S . -cmake --build build-tvos-sim --config Release -- -quiet - -echo "Building for tvOS devices..." -cmake -B build-tvos-device -G Xcode \ - "${COMMON_CMAKE_ARGS[@]}" \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=${TVOS_MIN_OS_VERSION} \ - -DCMAKE_SYSTEM_NAME=tvOS \ - -DCMAKE_OSX_SYSROOT=appletvos \ - -DCMAKE_OSX_ARCHITECTURES="arm64" \ - -DGGML_METAL=ON \ - -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvos \ - -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ - -S . 
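-# the trailing "-- -quiet" is forwarded to xcodebuild to keep the build log short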
-cmake --build build-tvos-device --config Release -- -quiet - -# Setup frameworks and copy binaries and headers -echo "Setting up framework structures..." -setup_framework_structure "build-ios-sim" ${IOS_MIN_OS_VERSION} "ios" -setup_framework_structure "build-ios-device" ${IOS_MIN_OS_VERSION} "ios" -setup_framework_structure "build-macos" ${MACOS_MIN_OS_VERSION} "macos" -setup_framework_structure "build-visionos" ${VISIONOS_MIN_OS_VERSION} "visionos" -setup_framework_structure "build-visionos-sim" ${VISIONOS_MIN_OS_VERSION} "visionos" -setup_framework_structure "build-tvos-sim" ${TVOS_MIN_OS_VERSION} "tvos" -setup_framework_structure "build-tvos-device" ${TVOS_MIN_OS_VERSION} "tvos" - -# Create dynamic libraries from static libraries -echo "Creating dynamic libraries from static libraries..." -combine_static_libraries "build-ios-sim" "Release-iphonesimulator" "ios" "true" -combine_static_libraries "build-ios-device" "Release-iphoneos" "ios" "false" -combine_static_libraries "build-macos" "Release" "macos" "false" -combine_static_libraries "build-visionos" "Release-xros" "visionos" "false" -combine_static_libraries "build-visionos-sim" "Release-xrsimulator" "visionos" "true" -combine_static_libraries "build-tvos-sim" "Release-appletvsimulator" "tvos" "true" -combine_static_libraries "build-tvos-device" "Release-appletvos" "tvos" "false" - -# Create XCFramework with correct debug symbols paths -echo "Creating XCFramework..." -xcodebuild -create-xcframework \ - -framework $(pwd)/build-ios-sim/framework/llama.framework \ - -debug-symbols $(pwd)/build-ios-sim/dSYMs/llama.dSYM \ - -framework $(pwd)/build-ios-device/framework/llama.framework \ - -debug-symbols $(pwd)/build-ios-device/dSYMs/llama.dSYM \ - -framework $(pwd)/build-macos/framework/llama.framework \ - -debug-symbols $(pwd)/build-macos/dSYMS/llama.dSYM \ - -framework $(pwd)/build-visionos/framework/llama.framework \ - -debug-symbols $(pwd)/build-visionos/dSYMs/llama.dSYM \ - -framework $(pwd)/build-visionos-sim/framework/llama.framework \ - -debug-symbols $(pwd)/build-visionos-sim/dSYMs/llama.dSYM \ - -framework $(pwd)/build-tvos-device/framework/llama.framework \ - -debug-symbols $(pwd)/build-tvos-device/dSYMs/llama.dSYM \ - -framework $(pwd)/build-tvos-sim/framework/llama.framework \ - -debug-symbols $(pwd)/build-tvos-sim/dSYMs/llama.dSYM \ - -output $(pwd)/build-apple/llama.xcframework diff --git a/build.backend.sh b/build.backend.sh new file mode 100755 index 0000000000000..2904c4a15c73f --- /dev/null +++ b/build.backend.sh @@ -0,0 +1,36 @@ +# force isatty-->true, so that $0 |& head -50 has colors ... +rm -f READY_backend FAILED_backend + +echo "int isatty(int fd) { return 1; }" | gcc -O2 -fpic -shared -ldl -o /tmp/isatty.so -xc - +export LD_PRELOAD=/tmp/isatty.so + +if [[ "${PERF_MODE:-}" ]]; then + FLAVOR="-prod" +else + FLAVOR="" +fi + +export SDKROOT=$(xcrun --sdk macosx --show-sdk-path) + +if [[ "$FLAVOR" == "-prod" ]]; then + cat < +#undef GGML_LOG_DEBUG +#define GGML_LOG_DEBUG(...) + #undef MIN #undef MAX #define MIN(a, b) ((a) < (b) ? 
(a) : (b)) @@ -1076,8 +1079,6 @@ @implementation GGMLMetalClass GGML_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ return NULL; \ } \ - } else { \ - GGML_LOG_WARN("%s: skipping %-40s (not supported)\n", __func__, "kernel_"#name); \ } const bool has_simdgroup_mm = ctx_dev->has_simdgroup_mm; @@ -5139,9 +5140,53 @@ static bool ggml_metal_encode_node( return true; } +long long timer_start; +long long timer_total; +long long timer_count; + +static inline void start_timer(void) { + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); // CLOCK_MONOTONIC would be preferable for elapsed time + timer_start = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec; +} + +static inline void stop_timer(void) { + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); // CLOCK_MONOTONIC would be preferable for elapsed time + long long timer_end = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec; + + timer_total += (timer_end - timer_start); + timer_count += 1; +} + +static void show_timer(void) { + double ms = timer_total / 1.0e6; + double itl = ms/timer_count; + double speed = 1/itl * 1000; + + printf("METAL compute_graph: [%9.0f] ms for %lld invocations | ITL %.2f ms | throughput = %.2f t/s\n", ms, timer_count, itl, speed); + + timer_start = 0; + timer_total = 1; // non-zero, so that the signal/atexit hooks are not registered twice + timer_count = 0; +} + +static void show_timer_signal(int sig) { + GGML_UNUSED(sig); + show_timer(); +} + static enum ggml_status ggml_metal_graph_compute( ggml_backend_t backend, struct ggml_cgraph * gf) { + + if (timer_total == 0) { + signal(SIGUSR1, show_timer_signal); // kill -USR1 $(cat /tmp/krunkit.pid) + atexit(show_timer); + } + + start_timer(); + struct ggml_backend_metal_context * ctx = backend->context; struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; @@ -5269,6 +5314,8 @@ static enum ggml_status ggml_metal_graph_compute( } } + stop_timer(); + return GGML_STATUS_SUCCESS; } diff --git a/ggml/src/ggml-remotingbackend/CMakeLists.txt b/ggml/src/ggml-remotingbackend/CMakeLists.txt new file mode 100644 index 0000000000000..7e374d395f68c --- /dev/null +++ b/ggml/src/ggml-remotingbackend/CMakeLists.txt @@ -0,0 +1,20 @@ +cmake_minimum_required(VERSION 3.19) +cmake_policy(SET CMP0114 NEW) + +message(STATUS "Enable API Remoting backend") + +ggml_add_backend_library(ggml-remotingbackend + backend.cpp + backend-dispatched.cpp + backend-dispatched-backend.cpp + backend-dispatched-device.cpp + backend-dispatched-buffer.cpp + backend-dispatched-buffer-type.cpp + backend-utils.cpp + shared/api_remoting.h + shared/apir_backend.h + shared/venus_cs.h + venus_cs_ggml-rpc-back.cpp + ) + +target_compile_options(ggml-remotingbackend PRIVATE -std=c++20) diff --git a/ggml/src/ggml-remotingbackend/backend-convert.h b/ggml/src/ggml-remotingbackend/backend-convert.h new file mode 100644 index 0000000000000..b45c2784160ac --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-convert.h @@ -0,0 +1,15 @@ +#include "shared/apir_backend.h" + +#define BUFFER_TO_HOST_HANDLE(name) ggml_buffer_to_apir_handle(name) + +static inline apir_buffer_host_handle_t +ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) { + // in the backend, the buffer handle is the buffer pointer + return (apir_buffer_host_handle_t) buffer; +} + +static inline apir_buffer_type_host_handle_t +ggml_buffer_type_to_apir_handle(ggml_backend_buffer_type_t buft) { + // in the backend, the buffer-type handle is the buffer-type pointer + return (apir_buffer_type_host_handle_t) buft; +} diff --git
a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp new file mode 100644 index 0000000000000..f15f39c7f92d8 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp @@ -0,0 +1,57 @@ +#include +#include "backend-internal.h" +#include "backend-dispatched.h" + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" + +#include "shared/apir_backend.h" + +struct timer_data graph_compute_timer = {0, 0, 0, "compute_timer"}; + +uint32_t +backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(enc); + + start_timer(&graph_compute_timer); + + uint32_t shmem_res_id; + vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); + + const void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id); + if (!shmem_data) { + FATAL("Couldn't get the shmem addr from virgl :/"); + } + size_t cgraph_size; + vn_decode_size_t(dec, &cgraph_size); + + struct vn_cs_decoder secondary_dec = vn_cs_new_decoder((const char *) shmem_data, cgraph_size); + + ggml_cgraph *cgraph = vn_decode_ggml_cgraph(&secondary_dec, cgraph_size); + + ggml_status status; +#if APIR_BACKEND_CHECK_SUPPORTS_OP == 1 + for (int idx = 0; idx < cgraph->n_nodes; idx++) { + ggml_tensor *op = ggml_graph_node(cgraph, idx); + if (dev->iface.supports_op(dev, op)) { + continue; + } + ERROR("Graph node %d (%s) not supported by the backend :/", idx, ggml_op_desc(op)); + + status = GGML_STATUS_ABORTED; + vn_encode_ggml_status(enc, &status); + + stop_timer(&graph_compute_timer); + return 0; + } +#endif + status = bck->iface.graph_compute(bck, cgraph); + + vn_encode_ggml_status(enc, &status); + + stop_timer(&graph_compute_timer); + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp new file mode 100644 index 0000000000000..f925d1e066fc0 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp @@ -0,0 +1,81 @@ +#include +#include "backend-internal.h" +#include "backend-dispatched.h" + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" + +uint32_t +backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + ggml_backend_buffer_type_t buft; + buft = vn_decode_ggml_buffer_type(dec); + + const char *string = buft->iface.get_name(buft); + + const size_t string_size = strlen(string) + 1; + vn_encode_array_size(enc, string_size); + vn_encode_char_array(enc, string, string_size); + + return 0; +} + +uint32_t +backend_buffer_type_get_alignment(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + ggml_backend_buffer_type_t buft; + buft = vn_decode_ggml_buffer_type(dec); + + size_t value = buft->iface.get_alignment(buft); + vn_encode_size_t(enc, &value); + + return 0; +} + +uint32_t +backend_buffer_type_get_max_size(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + ggml_backend_buffer_type_t buft; + buft = vn_decode_ggml_buffer_type(dec); + + size_t value = buft->iface.get_max_size(buft); + vn_encode_size_t(enc, &value); + + return 0; +} + +uint32_t +backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + ggml_backend_buffer_type_t buft; + buft = 
vn_decode_ggml_buffer_type(dec); + + bool is_host = buft->iface.is_host(buft); + vn_encode_bool_t(enc, &is_host); + + return 0; +} + +uint32_t +backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + + ggml_backend_buffer_type_t buft; + buft = vn_decode_ggml_buffer_type(dec); + + size_t size; + vn_decode_size_t(dec, &size); + + ggml_backend_buffer_t buffer; + + buffer = buft->iface.alloc_buffer(buft, size); + + vn_encode_ggml_buffer(enc, buffer); + + if (buffer) { + track_backend_buffer(buffer); + } + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp new file mode 100644 index 0000000000000..fc1ccaef6748d --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp @@ -0,0 +1,143 @@ +#include +#include "backend-internal.h" +#include "backend-dispatched.h" + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" + +struct timer_data get_tensor_timer = {0, 0, 0, "get_tensor"}; +struct timer_data set_tensor_timer = {0, 0, 0, "set_tensor"}; + +uint32_t +backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + ggml_backend_buffer_t buffer; + buffer = vn_decode_ggml_buffer(dec); + + uintptr_t base = (uintptr_t) buffer->iface.get_base(buffer); + vn_encode_uintptr_t(enc, &base); + + return 0; +} + +uint32_t +backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(enc); + + start_timer(&set_tensor_timer); + + ggml_backend_buffer_t buffer; + buffer = vn_decode_ggml_buffer(dec); + + ggml_tensor *tensor; + // safe to remove the const qualifier here + tensor = (ggml_tensor *) (uintptr_t) vn_decode_ggml_tensor(dec); + + uint32_t shmem_res_id; + vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); + + size_t offset; + vn_decode_size_t(dec, &offset); + + size_t size; + vn_decode_size_t(dec, &size); + + void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id); + + if (!shmem_data) { + FATAL("Couldn't get the shmem addr from virgl :/"); + } + +#if 0 + INFO("Calling (%p)->set_tensor(tensor=%p, data=%p, offset=%lu, size=%lu", + buffer, tensor, shmem_data, offset, size); +#endif +#if 0 + void **addr = (void **)(uintptr_t) shmem_data; + for (int i = 0; i <= 10; i++) { + INFO("%s: %p | %llx", __func__, addr, *addr); + addr++; + } + INFO("\n"); +#endif + + buffer->iface.set_tensor(buffer, tensor, shmem_data, offset, size); + + stop_timer(&set_tensor_timer); + + return 0; +} + +uint32_t +backend_buffer_get_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(enc); + + start_timer(&get_tensor_timer); + + ggml_backend_buffer_t buffer; + buffer = vn_decode_ggml_buffer(dec); + + + const ggml_tensor *tensor; + // safe to remove the const qualifier here + tensor = vn_decode_ggml_tensor(dec); + + uint32_t shmem_res_id; + vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); + + size_t offset; + vn_decode_size_t(dec, &offset); + + size_t size; + vn_decode_size_t(dec, &size); + + void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id); + if (!shmem_data) { + FATAL("Couldn't get the shmem addr from virgl :/"); + } + + UNUSED(buffer); + UNUSED(tensor); + buffer->iface.get_tensor(buffer, tensor, shmem_data, offset, size); + + 
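+ // the requested bytes were written straight into the guest-visible shared page, so there is nothing to encode in the reply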
stop_timer(&get_tensor_timer); + + return 0; +} + +uint32_t +backend_buffer_clear(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(enc); + + ggml_backend_buffer_t buffer; + buffer = vn_decode_ggml_buffer(dec); + + uint8_t value; + vn_decode_uint8_t(dec, &value); + + buffer->iface.clear(buffer, value); + + return 0; +} + +uint32_t +backend_buffer_free_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(enc); + + ggml_backend_buffer_t buffer; + buffer = vn_decode_ggml_buffer(dec); + + if (!untrack_backend_buffer(buffer)) { + WARNING("%s: unknown buffer %p", __func__, (void *) buffer); + return 1; + } + + buffer->iface.free_buffer(buffer); + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp new file mode 100644 index 0000000000000..473e9d2db7089 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp @@ -0,0 +1,142 @@ +#include +#include "backend-internal.h" +#include "backend-dispatched.h" + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" + +uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(dec); + + int32_t dev_count = reg->iface.get_device_count(reg); + vn_encode_int32_t(enc, &dev_count); + + return 0; +} + +uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(dec); + + const char *string = dev->iface.get_name(dev); + + const size_t string_size = strlen(string) + 1; + vn_encode_array_size(enc, string_size); + vn_encode_char_array(enc, string, string_size); + + return 0; +} + +uint32_t +backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(dec); + + const char *string = dev->iface.get_description(dev); + + const size_t string_size = strlen(string) + 1; + vn_encode_array_size(enc, string_size); + vn_encode_char_array(enc, string, string_size); + + return 0; +} + +uint32_t +backend_device_get_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(dec); + + uint32_t type = dev->iface.get_type(dev); + vn_encode_uint32_t(enc, &type); + + return 0; +} + +uint32_t +backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(dec); + + size_t free, total; + dev->iface.get_memory(dev, &free, &total); + + vn_encode_size_t(enc, &free); + vn_encode_size_t(enc, &total); + + return 0; +} + +uint32_t +backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + + const ggml_tensor *op = vn_decode_ggml_tensor_inplace(dec); + + bool supports_op = dev->iface.supports_op(dev, op); + + vn_encode_bool_t(enc, &supports_op); + + return 0; +} + +uint32_t +backend_device_get_buffer_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(dec); + + ggml_backend_buffer_type_t bufft = dev->iface.get_buffer_type(dev); + + vn_encode_ggml_buffer_type(enc, bufft); + + return 0; +} + +uint32_t +backend_device_get_props(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct
virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(dec); + + struct ggml_backend_dev_props props; + dev->iface.get_props(dev, &props); + + vn_encode_bool_t(enc, &props.caps.async); + vn_encode_bool_t(enc, &props.caps.host_buffer); + vn_encode_bool_t(enc, &props.caps.buffer_from_host_ptr); + vn_encode_bool_t(enc, &props.caps.events); + + return 0; +} + +uint32_t +backend_device_buffer_from_ptr(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(dec); + + uint32_t shmem_res_id; + vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); + + void *shmem_ptr = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id); + if (!shmem_ptr) { + FATAL("Couldn't get the shmem addr from virgl :/"); + } + + size_t size; + vn_decode_size_t(dec, &size); + size_t max_tensor_size; + vn_decode_size_t(dec, &max_tensor_size); + + ggml_backend_buffer_t buffer; + buffer = dev->iface.buffer_from_host_ptr(dev, shmem_ptr, size, max_tensor_size); + + vn_encode_ggml_buffer(enc, buffer); + vn_encode_ggml_buffer_type(enc, buffer->buft); + + if (buffer) { + track_backend_buffer(buffer); + } + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp new file mode 100644 index 0000000000000..d90424a3d714f --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp @@ -0,0 +1,47 @@ +#include +#include "backend-internal.h" +#include "backend-dispatched.h" + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" + +#include "ggml-metal.h" + +ggml_backend_reg_t reg = NULL; +ggml_backend_dev_t dev = NULL; +ggml_backend_t bck = NULL; + +long long timer_start = 0; +long long timer_total = 0; +long long timer_count = 0; + +uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_backend_init_fct_p) { + if (reg != NULL) { + FATAL("%s: already initialized :/", __func__); + } + ggml_backend_reg_t (* ggml_backend_reg_fct)(void) = (ggml_backend_reg_t (*)()) ggml_backend_reg_fct_p; + + reg = ggml_backend_reg_fct(); + if (reg == NULL) { + FATAL("%s: backend registration failed :/", __func__); + } + + if (reg->iface.get_device_count(reg)) { + dev = reg->iface.get_device(reg, 0); + } + + ggml_backend_t (* ggml_backend_fct)(int) = (ggml_backend_t (*)(int)) ggml_backend_init_fct_p; + + bck = ggml_backend_fct(0); + if (!bck) { + ERROR("%s: backend initialization failed :/", __func__); + return APIR_BACKEND_INITIALIZE_BACKEND_FAILED; + } + + size_t free, total; + dev->iface.get_memory(dev, &free, &total); + WARNING("%s: free memory: %ld MB\n", __func__, (size_t) free/1024/1024); + + return APIR_BACKEND_INITIALIZE_SUCCESSS; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h new file mode 100644 index 0000000000000..3c164b532ac95 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -0,0 +1,111 @@ +#pragma once + +#include +#include + +#include + +#include "backend-utils.h" +#include "backend-convert.h" +#include "shared/apir_backend.h" +#include "shared/venus_cs.h" +#include "shared/venus_cs_ggml.h" + +uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_backend_init_fct_p); + +typedef uint32_t (*backend_dispatch_t)(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); + +/* *** */ + +uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); + 
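+/* Each dispatched handler below follows the same three-step pattern:
+ * decode its arguments from the guest's Venus stream, run the matching
+ * ggml iface call on the host, then encode the reply. Illustrative recap
+ * (mirrors backend_buffer_type_get_alignment):
+ *
+ *   ggml_backend_buffer_type_t buft = vn_decode_ggml_buffer_type(dec); // 1. decode
+ *   size_t value = buft->iface.get_alignment(buft);                    // 2. execute
+ *   vn_encode_size_t(enc, &value);                                     // 3. encode
+ */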
+/* device */ +uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_get_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_get_buffer_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_get_props(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_buffer_from_ptr(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); + +/* buffer-type */ +uint32_t backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_type_get_alignment(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_type_get_max_size(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); + +/* buffer */ +uint32_t backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_get_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_clear(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_free_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); + +/* backend */ +uint32_t backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); + +static inline const char *backend_dispatch_command_name(ApirBackendCommandType type) +{ + switch (type) { + /* device */ + case APIR_COMMAND_TYPE_DEVICE_GET_COUNT: return "backend_get_device_count"; + case APIR_COMMAND_TYPE_DEVICE_GET_NAME: return "backend_get_device_name"; + case APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION: return "backend_get_device_description"; + case APIR_COMMAND_TYPE_DEVICE_GET_TYPE: return "backend_device_get_type"; + case APIR_COMMAND_TYPE_DEVICE_GET_MEMORY: return "backend_get_device_memory"; + case APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP: return "backend_device_supports_op"; + case APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE: return "backend_get_buffer_type"; + case APIR_COMMAND_TYPE_DEVICE_GET_PROPS: return "backend_get_props"; + case APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR: return "backend_buffer_from_ptr"; + + /* buffer-type */ + case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME: return "backend_buffer_type_get_name"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT: return "backend_buffer_type_get_alignment"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE: return 
"backend_buffer_type_get_max_size"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST: return "backend_buffer_type_is_host"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER: return "backend_buffer_type_alloc_buffer"; + + /* buffer */ + case APIR_COMMAND_TYPE_BUFFER_GET_BASE: return "backend_buffer_get_base"; + case APIR_COMMAND_TYPE_BUFFER_SET_TENSOR: return "backend_buffer_set_tensor"; + case APIR_COMMAND_TYPE_BUFFER_GET_TENSOR: return "backend_buffer_get_tensor"; + case APIR_COMMAND_TYPE_BUFFER_CLEAR: return "backend_buffer_clear"; + case APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER: return "backend_buffer_free_buffer"; + + /* backend */ + case APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE: return "backend_graph_compute"; + default: return "unknown"; + } +} + +static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATCH_TABLE_COUNT] = { + /* device */ + [APIR_COMMAND_TYPE_DEVICE_GET_COUNT] = backend_reg_get_device_count, + [APIR_COMMAND_TYPE_DEVICE_GET_NAME] = backend_device_get_name, + [APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION] = backend_device_get_description, + [APIR_COMMAND_TYPE_DEVICE_GET_TYPE] = backend_device_get_type, + [APIR_COMMAND_TYPE_DEVICE_GET_MEMORY] = backend_device_get_memory, + [APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP] = backend_device_supports_op, + [APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE] = backend_device_get_buffer_type, + [APIR_COMMAND_TYPE_DEVICE_GET_PROPS] = backend_device_get_props, + [APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR] = backend_device_buffer_from_ptr, + + /* buffer-type */ + [APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME] = backend_buffer_type_get_name, + [APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT] = backend_buffer_type_get_alignment, + [APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE] = backend_buffer_type_get_max_size, + [APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST] = backend_buffer_type_is_host, + [APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER] = backend_buffer_type_alloc_buffer, + + /* buffer */ + [APIR_COMMAND_TYPE_BUFFER_GET_BASE] = backend_buffer_get_base, + [APIR_COMMAND_TYPE_BUFFER_SET_TENSOR] = backend_buffer_set_tensor, + [APIR_COMMAND_TYPE_BUFFER_GET_TENSOR] = backend_buffer_get_tensor, + [APIR_COMMAND_TYPE_BUFFER_CLEAR] = backend_buffer_clear, + [APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER] = backend_buffer_free_buffer, + + /* backend */ + [APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE] = backend_graph_compute, +}; diff --git a/ggml/src/ggml-remotingbackend/backend-internal.h b/ggml/src/ggml-remotingbackend/backend-internal.h new file mode 100644 index 0000000000000..41bc42dbc0e36 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-internal.h @@ -0,0 +1,29 @@ +#include +#include +#include + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" + +extern ggml_backend_reg_t reg; +extern ggml_backend_dev_t dev; +extern ggml_backend_t bck; + +#define NOT_IMPLEMENTED \ + do { \ + static bool first = true; \ + if (first) { \ + printf("\nWARN: ###\nWARN: ### reached unimplemented function %s\nWARN: ###\n\n", __func__); \ + first = false; \ + } \ + } while(0) + +extern "C" { + uint32_t apir_backend_initialize(); + void apir_backend_deinit(void); + uint32_t apir_backend_dispatcher(uint32_t cmd_type, struct virgl_apir_context *ctx, + char *dec_cur, const char *dec_end, + char *enc_cur, const char *enc_end, + char **enc_cur_after); +} diff --git a/ggml/src/ggml-remotingbackend/backend-utils.cpp b/ggml/src/ggml-remotingbackend/backend-utils.cpp new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git 
a/ggml/src/ggml-remotingbackend/backend-utils.h b/ggml/src/ggml-remotingbackend/backend-utils.h new file mode 100644 index 0000000000000..b032061a96947 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-utils.h @@ -0,0 +1,53 @@ +#pragma once + +#include +#include +#include + +#include + +#define UNUSED GGML_UNUSED + +inline void +INFO(const char *format, ...) { + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); +} + +inline void +WARNING(const char *format, ...) { + fprintf(stderr, "WARNING: "); + + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); +} + +inline void +ERROR(const char *format, ...) { + fprintf(stderr, "ERROR: "); + + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); +} + +inline void +FATAL(const char *format, ...) { + fprintf(stderr, "FATAL: "); + + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); + if (format) + assert(false); +} diff --git a/ggml/src/ggml-remotingbackend/backend.cpp b/ggml/src/ggml-remotingbackend/backend.cpp new file mode 100644 index 0000000000000..95dee556cff3f --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend.cpp @@ -0,0 +1,134 @@ +#include +#include + +#include + +#include "backend-utils.h" +#include "backend-internal.h" +#include "backend-dispatched.h" + +#include "shared/apir_backend.h" +#include "shared/venus_cs.h" + +#define GGML_BACKEND_LIBRARY_PATH_ENV "APIR_LLAMA_CPP_GGML_LIBRARY_PATH" +#define GGML_BACKEND_LIBRARY_REG_ENV "APIR_LLAMA_CPP_GGML_LIBRARY_REG" +#define GGML_BACKEND_LIBRARY_INIT_ENV "APIR_LLAMA_CPP_GGML_LIBRARY_INIT" + + +static void *backend_library_handle = NULL; + +extern "C" { + void apir_backend_deinit(void) { + auto buffers = get_track_backend_buffers(); + for (const auto& buffer: buffers) { + untrack_backend_buffer(buffer); + buffer->iface.free_buffer(buffer); + } + + size_t free, total; + dev->iface.get_memory(dev, &free, &total); + WARNING("%s: free memory: %ld MB\n", __func__, (size_t) free/1024/1024); + + show_timer(&graph_compute_timer); + show_timer(&set_tensor_timer); + show_timer(&get_tensor_timer); + /* *** */ + + if (backend_library_handle) { + INFO("%s: The GGML backend library was loaded. 
Unloading it.", __func__); + dlclose(backend_library_handle); + } + + INFO("%s: bye-bye", __func__); + } + + uint32_t apir_backend_initialize() { + const char* dlsym_error; + + const char* library_name = getenv(GGML_BACKEND_LIBRARY_PATH_ENV); + const char* library_reg = getenv(GGML_BACKEND_LIBRARY_REG_ENV); + const char* library_init = getenv(GGML_BACKEND_LIBRARY_INIT_ENV); + + INFO("%s: loading %s (%s|%s)", __func__, library_name, library_reg, library_init); + + if (!library_name) { + ERROR("Cannot open library: env var '%s' not defined\n", GGML_BACKEND_LIBRARY_PATH_ENV); + + return APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY; + } + + backend_library_handle = dlopen(library_name, RTLD_LAZY); + + if (!backend_library_handle) { + ERROR("Cannot open library: %s\n", dlerror()); + + return APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY; + } + + if (!library_reg) { + ERROR("Cannot register library: env var '%s' not defined\n", GGML_BACKEND_LIBRARY_REG_ENV); + + return APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY; + } + + void *ggml_backend_reg_fct = dlsym(backend_library_handle, library_reg); + dlsym_error = dlerror(); + if (dlsym_error) { + ERROR("Cannot load symbol: %s\n", dlsym_error); + + return APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS; + } + + if (!library_init) { + ERROR("Cannot initialize library: env var '%s' not defined\n", GGML_BACKEND_LIBRARY_INIT_ENV); + + return APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY; + } + + void *ggml_backend_init_fct = dlsym(backend_library_handle, library_init); + dlsym_error = dlerror(); + if (dlsym_error) { + ERROR("Cannot load symbol: %s\n", dlsym_error); + + return APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS; + } + + return backend_dispatch_initialize(ggml_backend_reg_fct, ggml_backend_init_fct); + } + + uint32_t apir_backend_dispatcher(uint32_t cmd_type, struct virgl_apir_context *ctx, + char *dec_cur, const char *dec_end, + char *enc_cur, const char *enc_end, + char **enc_cur_after) { + struct vn_cs_encoder _enc = { + .cur = enc_cur, + .end = enc_end, + }; + struct vn_cs_encoder *enc = &_enc; + + struct vn_cs_decoder _dec = { + .cur = dec_cur, + .end = dec_end, + }; + struct vn_cs_decoder *dec = &_dec; + + + if (cmd_type >= APIR_BACKEND_DISPATCH_TABLE_COUNT) { + ERROR("Received an invalid dispatch index (%d >= %d)\n", + cmd_type, APIR_BACKEND_DISPATCH_TABLE_COUNT); + return APIR_BACKEND_FORWARD_INDEX_INVALID; + } + +#if 0 + static long long count = 0; + INFO("[%lld] Calling %s", count, backend_dispatch_command_name((ApirBackendCommandType) cmd_type)); + count += 1; +#endif + backend_dispatch_t forward_fct = apir_backend_dispatch_table[cmd_type]; + uint32_t ret = forward_fct(enc, dec, ctx); + + *enc_cur_after = enc->cur; + + return ret; + } +} diff --git a/ggml/src/ggml-remotingbackend/shared/api_remoting.h b/ggml/src/ggml-remotingbackend/shared/api_remoting.h new file mode 100644 index 0000000000000..6e594a8ae4ab8 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/shared/api_remoting.h @@ -0,0 +1,12 @@ +#define VIRGL_APIR_COMMAND_TYPE_LoadLibrary 255 +#define VIRGL_APIR_COMMAND_TYPE_Forward 256 + + +static inline const char *api_remoting_command_name(int32_t type) +{ + switch (type) { + case VIRGL_APIR_COMMAND_TYPE_LoadLibrary: return "LoadLibrary"; + case VIRGL_APIR_COMMAND_TYPE_Forward: return "Forward"; + default: return "unknown"; + } +} diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h new file mode 100644 index 0000000000000..80e5961ff04b5 --- /dev/null +++
diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h
new file mode 100644
index 0000000000000..80e5961ff04b5
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h
@@ -0,0 +1,108 @@
+#pragma once
+
+#include <stdint.h>
+#include <time.h>
+
+#define APIR_BACKEND_INITIALIZE_SUCCESSS 0
+#define APIR_BACKEND_INITIALIZE_CANNOT_OPEN_BACKEND_LIBRARY 1
+#define APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY 2
+#define APIR_BACKEND_INITIALIZE_MISSING_BACKEND_SYMBOLS 3
+#define APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS 4
+#define APIR_BACKEND_INITIALIZE_BACKEND_FAILED 5
+
+#define APIR_BACKEND_FORWARD_INDEX_INVALID 6
+
+// 1 is fast, 0 avoids micro-benchmark crashes
+#define APIR_DEVICE_SUPPORTS_OP_ALWAYS_TRUE 1
+
+// 0 is fast, 1 avoids crashing the backend if an unsupported tensor is received
+#define APIR_BACKEND_CHECK_SUPPORTS_OP 0
+
+typedef uintptr_t apir_buffer_type_host_handle_t;
+typedef uintptr_t apir_buffer_host_handle_t;
+
+typedef struct {
+  apir_buffer_host_handle_t host_handle;
+
+  struct vn_renderer_shmem *shmem;
+  apir_buffer_type_host_handle_t buft_host_handle;
+} apir_buffer_context_t;
+
+struct vn_dispatch_context;
+struct virgl_apir_context;
+
+typedef enum ApirBackendCommandType {
+  /* device */
+  APIR_COMMAND_TYPE_DEVICE_GET_COUNT = 0,
+  APIR_COMMAND_TYPE_DEVICE_GET_NAME = 1,
+  APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION = 2,
+  APIR_COMMAND_TYPE_DEVICE_GET_TYPE = 3,
+  APIR_COMMAND_TYPE_DEVICE_GET_MEMORY = 4,
+  APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP = 5,
+  APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE = 6,
+  APIR_COMMAND_TYPE_DEVICE_GET_PROPS = 7,
+  APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR = 8,
+
+  /* buffer-type */
+  APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME = 9,
+  APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT = 10,
+  APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE = 11,
+  APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST = 12,
+  APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER = 13,
+
+  /* buffer */
+  APIR_COMMAND_TYPE_BUFFER_GET_BASE = 14,
+  APIR_COMMAND_TYPE_BUFFER_SET_TENSOR = 15,
+  APIR_COMMAND_TYPE_BUFFER_GET_TENSOR = 16,
+  APIR_COMMAND_TYPE_BUFFER_CLEAR = 17,
+  APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER = 18,
+
+  /* backend */
+  APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE = 19,
+
+  // last command_type index + 1
+  APIR_BACKEND_DISPATCH_TABLE_COUNT = 20,
+} ApirBackendCommandType;
+
+
+struct virgl_apir_callbacks {
+  void *(*get_shmem_ptr)(struct vn_dispatch_context *ctx, uint32_t res_id);
+};
+
+struct virgl_apir_context {
+  struct vn_dispatch_context *virgl_ctx;
+
+  struct virgl_apir_callbacks iface;
+};
+
+struct timer_data {
+  long long start;
+  long long total;
+  long long count;
+  const char *name;
+};
+
+extern struct timer_data graph_compute_timer;
+extern struct timer_data get_tensor_timer;
+extern struct timer_data set_tensor_timer;
+
+static inline void start_timer(struct timer_data *timer) {
+  struct timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts); // monotonic clock, suitable for measuring elapsed time
+  timer->start = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
+}
+
+static inline void stop_timer(struct timer_data *timer) {
+  struct timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts); // monotonic clock, suitable for measuring elapsed time
+  long long timer_end = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
+
+  timer->total += (timer_end - timer->start);
+  timer->count += 1;
+}
+
+static inline void show_timer(struct timer_data *timer) {
+  double ms = timer->total / 1e6;
+  double itl = ms / timer->count;
+  double speed = 1 / itl * 1000;
+
+  INFO("%14s [%9.0f] ms for %4lld invocations | ITL %2.2f ms | throughput = %4.2f t/s",
+       timer->name, ms, timer->count, itl, speed);
+}
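A minimal usage sketch for the timer helpers above (hypothetical call site, not part of the patch): bracket the hot path with start_timer()/stop_timer(), then report the aggregate at shutdown with show_timer():

    static struct timer_data my_timer = {0, 0, 0, "my_op"};

    void my_op(void) {
        start_timer(&my_timer);
        // ... forwarded work ...
        stop_timer(&my_timer);
    }

    // at exit: prints total ms, invocation count, per-call latency (ITL) and throughput
    // show_timer(&my_timer);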
diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h
new file mode 100644
index 0000000000000..e67c99a46b5b6
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h
@@ -0,0 +1,554 @@
+#pragma once
+
+#include <cassert>
+#include <cstring>
+
+// needs UNUSED to be defined
+// needs FATAL to be defined
+
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+
+struct vn_cs_encoder {
+  char* cur;
+  const char *start;
+  const char* end;
+};
+
+struct vn_cs_decoder {
+  const char* cur;
+  const char* end;
+};
+
+/*
+ * new encoder and decoder
+ */
+
+static struct vn_cs_decoder
+vn_cs_new_decoder(const char *ptr, size_t size) {
+  struct vn_cs_decoder dec = {
+    .cur = ptr,
+    .end = ptr + size,
+  };
+
+  return dec;
+}
+
+static struct vn_cs_encoder
+vn_cs_new_encoder(char *ptr, size_t size) {
+  struct vn_cs_encoder enc = {
+    .cur = ptr,
+    .start = ptr,
+    .end = ptr + size,
+  };
+
+  return enc;
+}
+
+/*
+ * decoder peek
+ */
+
+static inline bool
+vn_cs_decoder_peek_internal(const struct vn_cs_decoder *dec,
+                            size_t size,
+                            void *val,
+                            size_t val_size)
+{
+  assert(val_size <= size);
+
+  if (unlikely(size > (size_t) (dec->end - dec->cur))) {
+    FATAL("READING TOO MUCH FROM THE DECODER :/");
+    //vn_cs_decoder_set_fatal(dec);
+    memset(val, 0, val_size);
+    return false;
+  }
+
+  /* we should not rely on the compiler to optimize away memcpy... */
+  memcpy(val, dec->cur, val_size);
+  return true;
+}
+
+static inline void
+vn_cs_decoder_peek(const struct vn_cs_decoder *dec,
+                   size_t size,
+                   void *val,
+                   size_t val_size)
+{
+  vn_cs_decoder_peek_internal(dec, size, val, val_size);
+}
+
+static inline const void *
+vn_cs_decoder_use_inplace(struct vn_cs_decoder *dec,
+                          size_t size)
+{
+  if (unlikely(size > (size_t) (dec->end - dec->cur))) {
+    FATAL("READING TOO MUCH FROM THE DECODER :/");
+  }
+  const void *addr = dec->cur;
+  dec->cur += size;
+
+  return addr;
+}
+
+/*
+ * read/write
+ */
+
+static inline void
+vn_cs_decoder_read(struct vn_cs_decoder *dec,
+                   size_t size,
+                   void *val,
+                   size_t val_size)
+{
+  if (vn_cs_decoder_peek_internal(dec, size, val, val_size))
+    dec->cur += size;
+}
+
+static inline char *
+vn_cs_encoder_write(struct vn_cs_encoder *enc,
+                    size_t size,
+                    const void *val,
+                    size_t val_size)
+{
+  assert(val_size <= size);
+  assert(size <= ((size_t) (enc->end - enc->cur)));
+
+  char *write_addr = enc->cur;
+  /* we should not rely on the compiler to optimize away memcpy... 
*/ + memcpy(write_addr, val, val_size); + enc->cur += size; + + return write_addr; +} + +/* + * encode/decode + */ + +static inline void +vn_decode(struct vn_cs_decoder *dec, size_t size, void *data, size_t data_size) +{ + assert(size % 4 == 0); + vn_cs_decoder_read(dec, size, data, data_size); +} + +static inline void +vn_encode(struct vn_cs_encoder *enc, size_t size, const void *data, size_t data_size) +{ + assert(size % 4 == 0); + /* TODO check if the generated code is optimal */ + vn_cs_encoder_write(enc, size, data, data_size); +} + +/* + * typed encode/decode + */ + +/* uint8_t */ + +static inline void +vn_encode_uint8_t(struct vn_cs_encoder *enc, const uint8_t *val) +{ + vn_encode(enc, sizeof(int), val, sizeof(*val)); +} + +static inline void +vn_decode_uint8_t(struct vn_cs_decoder *dec, uint8_t *val) +{ + vn_decode(dec, sizeof(int), val, sizeof(*val)); +} + +/* uint64_t */ + +static inline size_t +vn_sizeof_uint64_t(const uint64_t *val) +{ + assert(sizeof(*val) == 8); +#ifdef NDEBUG + UNUSED(val); +#endif + return 8; +} + +static inline void +vn_encode_uint64_t(struct vn_cs_encoder *enc, const uint64_t *val) +{ + vn_encode(enc, 8, val, sizeof(*val)); +} + +static inline void +vn_decode_uint64_t(struct vn_cs_decoder *dec, uint64_t *val) +{ + vn_decode(dec, 8, val, sizeof(*val)); +} + +static inline size_t +vn_sizeof_uint64_t_array(const uint64_t *val, uint32_t count) +{ + assert(sizeof(*val) == 8); + const size_t size = sizeof(*val) * count; + assert(size >= count); + return size; +} + +static inline void +vn_encode_uint64_t_array(struct vn_cs_encoder *enc, const uint64_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_encode(enc, size, val, size); +} + +static inline void +vn_decode_uint64_t_array(struct vn_cs_decoder *dec, uint64_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_decode(dec, size, val, size); +} + +static inline const uint64_t * +vn_decode_uint64_t_array_inplace(struct vn_cs_decoder *dec, uint32_t count) +{ + return (uint64_t *)(uintptr_t) vn_cs_decoder_use_inplace(dec, count * sizeof(uint64_t)); +} + +/* int32_t */ + +static inline size_t +vn_sizeof_int32_t(const int32_t *val) +{ + assert(sizeof(*val) == 4); +#ifdef NDEBUG + UNUSED(val); +#endif + return 4; +} + +static inline void +vn_encode_int32_t(struct vn_cs_encoder *enc, const int32_t *val) +{ + vn_encode(enc, 4, val, sizeof(*val)); +} + +static inline void +vn_decode_int32_t(struct vn_cs_decoder *dec, int32_t *val) +{ + vn_decode(dec, 4, val, sizeof(*val)); +} + +static inline size_t +vn_sizeof_int32_t_array(const int32_t *val, uint32_t count) +{ + assert(sizeof(*val) == 4); + const size_t size = sizeof(*val) * count; + assert(size >= count); + return size; +} + +static inline void +vn_encode_int32_t_array(struct vn_cs_encoder *enc, const int32_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_encode(enc, size, val, size); +} + +static inline void +vn_decode_int32_t_array(struct vn_cs_decoder *dec, int32_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_decode(dec, size, val, size); +} + +/* array size (uint64_t) */ + +static inline size_t +vn_sizeof_array_size(uint64_t size) +{ + return vn_sizeof_uint64_t(&size); +} + +static inline void +vn_encode_array_size(struct vn_cs_encoder *enc, uint64_t size) +{ + vn_encode_uint64_t(enc, &size); +} + +static inline uint64_t +vn_decode_array_size(struct 
vn_cs_decoder *dec, uint64_t expected_size) +{ + uint64_t size; + vn_decode_uint64_t(dec, &size); + if (size != expected_size) { + FATAL("ENCODER IS FULL :/"); + //vn_cs_decoder_set_fatal(dec); + size = 0; + } + return size; +} + +static inline uint64_t +vn_decode_array_size_unchecked(struct vn_cs_decoder *dec) +{ + uint64_t size; + vn_decode_uint64_t(dec, &size); + return size; +} + +static inline uint64_t +vn_peek_array_size(struct vn_cs_decoder *dec) +{ + uint64_t size; + vn_cs_decoder_peek(dec, sizeof(size), &size, sizeof(size)); + return size; +} + +/* non-array pointer */ + +static inline size_t +vn_sizeof_simple_pointer(const void *val) +{ + return vn_sizeof_array_size(val ? 1 : 0); +} + +static inline bool +vn_encode_simple_pointer(struct vn_cs_encoder *enc, const void *val) +{ + vn_encode_array_size(enc, val ? 1 : 0); + return val; +} + +static inline bool +vn_decode_simple_pointer(struct vn_cs_decoder *dec) +{ + return vn_decode_array_size_unchecked(dec); +} + +/* uint32_t */ + +static inline size_t +vn_sizeof_uint32_t(const uint32_t *val) +{ + assert(sizeof(*val) == 4); +#ifdef NDEBUG + UNUSED(val); +#endif + return 4; +} + +static inline void +vn_encode_uint32_t(struct vn_cs_encoder *enc, const uint32_t *val) +{ + vn_encode(enc, 4, val, sizeof(*val)); +} + +static inline void +vn_decode_uint32_t(struct vn_cs_decoder *dec, uint32_t *val) +{ + vn_decode(dec, 4, val, sizeof(*val)); +} + +static inline size_t +vn_sizeof_uint32_t_array(const uint32_t *val, uint32_t count) +{ + assert(sizeof(*val) == 4); + const size_t size = sizeof(*val) * count; + assert(size >= count); + return size; +} + +static inline void +vn_encode_uint32_t_array(struct vn_cs_encoder *enc, const uint32_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_encode(enc, size, val, size); +} + +static inline void +vn_decode_uint32_t_array(struct vn_cs_decoder *dec, uint32_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_decode(dec, size, val, size); +} + +/* size_t */ + +static inline size_t +vn_sizeof_size_t(const size_t *val) +{ + return sizeof(*val); +} + +static inline void +vn_encode_size_t(struct vn_cs_encoder *enc, const size_t *val) +{ + const uint64_t tmp = *val; + vn_encode_uint64_t(enc, &tmp); +} + +static inline void +vn_decode_size_t(struct vn_cs_decoder *dec, size_t *val) +{ + uint64_t tmp; + vn_decode_uint64_t(dec, &tmp); + *val = tmp; +} + +static inline size_t +vn_sizeof_size_t_array(const size_t *val, uint32_t count) +{ + return vn_sizeof_size_t(val) * count; +} + +static inline void +vn_encode_size_t_array(struct vn_cs_encoder *enc, const size_t *val, uint32_t count) +{ + if (sizeof(size_t) == sizeof(uint64_t)) { + vn_encode_uint64_t_array(enc, (const uint64_t *)val, count); + } else { + for (uint32_t i = 0; i < count; i++) + vn_encode_size_t(enc, &val[i]); + } +} + +static inline void +vn_decode_size_t_array(struct vn_cs_decoder *dec, size_t *val, uint32_t count) +{ + if (sizeof(size_t) == sizeof(uint64_t)) { + vn_decode_uint64_t_array(dec, (uint64_t *)val, count); + } else { + for (uint32_t i = 0; i < count; i++) + vn_decode_size_t(dec, &val[i]); + } +} + +/* opaque blob */ + +static inline size_t +vn_sizeof_blob_array(const void *val, size_t size) +{ + UNUSED(val); + return (size + 3) & ~3; +} + +static inline void +vn_encode_blob_array(struct vn_cs_encoder *enc, const void *val, size_t size) +{ + vn_encode(enc, (size + 3) & ~3, val, size); +} + +static inline void +vn_decode_blob_array(struct 
vn_cs_decoder *dec, void *val, size_t size)
+{
+  vn_decode(dec, (size + 3) & ~3, val, size);
+}
+
+/* string */
+
+static inline size_t
+vn_sizeof_char_array(const char *val, size_t size)
+{
+  return vn_sizeof_blob_array(val, size);
+}
+
+static inline void
+vn_encode_char_array(struct vn_cs_encoder *enc, const char *val, size_t size)
+{
+  assert(size && strlen(val) < size);
+  vn_encode_blob_array(enc, val, size);
+}
+
+static inline void
+vn_decode_char_array(struct vn_cs_decoder *dec, char *val, size_t size)
+{
+  vn_decode_blob_array(dec, val, size);
+  if (size)
+    val[size - 1] = '\0';
+  else {
+    //vn_cs_decoder_set_fatal(dec);
+    FATAL("Couldn't decode the blob array");
+  }
+}
+
+/* (temp) buffer allocation */
+
+static inline void *
+vkr_cs_decoder_alloc_array(struct vkr_cs_decoder *dec, size_t size, size_t count)
+{
+  UNUSED(dec);
+  size_t alloc_size;
+  if (unlikely(__builtin_mul_overflow(size, count, &alloc_size))) {
+    FATAL("overflow in array allocation of %zu * %zu bytes", size, count);
+    return NULL;
+  }
+
+  return malloc(alloc_size);
+}
+
+static inline void *
+vn_cs_decoder_alloc_array(struct vn_cs_decoder *dec, size_t size, size_t count)
+{
+  struct vkr_cs_decoder *d = (struct vkr_cs_decoder *)dec;
+  return vkr_cs_decoder_alloc_array(d, size, count);
+}
+
+/* bool */
+
+static inline void
+vn_encode_bool_t(struct vn_cs_encoder *enc, const bool *val)
+{
+  vn_encode(enc, sizeof(int), val, sizeof(bool));
+}
+
+static inline void
+vn_decode_bool_t(struct vn_cs_decoder *dec, bool *val)
+{
+  vn_decode(dec, sizeof(int), val, sizeof(bool));
+}
+
+/* apir_buffer_type_host_handle_t */
+
+static inline void
+vn_encode_apir_buffer_type_host_handle_t(struct vn_cs_encoder *enc, const apir_buffer_type_host_handle_t *val)
+{
+  vn_encode(enc, sizeof(apir_buffer_type_host_handle_t), val, sizeof(apir_buffer_type_host_handle_t));
+}
+
+static inline void
+vn_decode_apir_buffer_type_host_handle_t(struct vn_cs_decoder *dec, apir_buffer_type_host_handle_t *val)
+{
+  vn_decode(dec, sizeof(apir_buffer_type_host_handle_t), val, sizeof(apir_buffer_type_host_handle_t));
+}
+
+/* apir_buffer_host_handle_t */
+
+static inline void
+vn_encode_apir_buffer_host_handle_t(struct vn_cs_encoder *enc, const apir_buffer_host_handle_t *val)
+{
+  vn_encode(enc, sizeof(apir_buffer_host_handle_t), val, sizeof(apir_buffer_host_handle_t));
+}
+
+static inline void
+vn_decode_apir_buffer_host_handle_t(struct vn_cs_decoder *dec, apir_buffer_host_handle_t *val)
+{
+  vn_decode(dec, sizeof(apir_buffer_host_handle_t), val, sizeof(apir_buffer_host_handle_t));
+}
+
+/* uintptr_t */
+
+static inline void
+vn_encode_uintptr_t(struct vn_cs_encoder *enc, const uintptr_t *val)
+{
+  vn_encode(enc, sizeof(*val), val, sizeof(*val));
+}
+
+static inline void
+vn_decode_uintptr_t(struct vn_cs_decoder *dec, uintptr_t *val)
+{
+  vn_decode(dec, sizeof(*val), val, sizeof(*val));
+}
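A round-trip sketch of the venus_cs helpers above (hypothetical buffer size, not part of the patch). The encoder and decoder simply walk a caller-provided byte range, so encode and decode calls must be paired in the same order on both sides:

    char buf[64];
    struct vn_cs_encoder enc = vn_cs_new_encoder(buf, sizeof(buf));
    uint32_t count = 2;
    uint64_t handle = 0xdeadbeef;
    vn_encode_uint32_t(&enc, &count);
    vn_encode_uint64_t(&enc, &handle);

    struct vn_cs_decoder dec = vn_cs_new_decoder(buf, sizeof(buf));
    uint32_t count_out;
    uint64_t handle_out;
    vn_decode_uint32_t(&dec, &count_out);  // must mirror the encode order
    vn_decode_uint64_t(&dec, &handle_out);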
diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.cpp b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.cpp
new file mode 100644
index 0000000000000..196cd70958745
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.cpp
@@ -0,0 +1,167 @@
+#include <cstdio>
+#include <cstring>
+#include <unordered_set>
+#include <vector>
+
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+#include "venus_cs_ggml-rpc.h"
+
+std::unordered_set<ggml_backend_buffer_t> backend_buffers;
+
+void
+track_backend_buffer(ggml_backend_buffer_t buffer) {
+  backend_buffers.insert(buffer);
+}
+
+rpc_tensor
+serialize_tensor(const ggml_tensor * tensor) {
+  rpc_tensor result;
+  result.id = reinterpret_cast<uint64_t>(tensor);
+  result.type = tensor->type;
+  if (tensor->buffer) {
+    ggml_backend_buffer_t buffer = tensor->buffer;
+
+    result.buffer = BUFFER_TO_HANDLE(buffer);
+  } else {
+    result.buffer = 0;
+  }
+  for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) {
+    result.ne[i] = tensor->ne[i];
+    result.nb[i] = tensor->nb[i];
+  }
+  result.op = tensor->op;
+  for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
+    result.op_params[i] = tensor->op_params[i];
+  }
+  result.flags = tensor->flags;
+  for (uint32_t i = 0; i < GGML_MAX_SRC; i++) {
+    result.src[i] = reinterpret_cast<uint64_t>(tensor->src[i]);
+  }
+  result.view_src = reinterpret_cast<uint64_t>(tensor->view_src);
+  result.view_offs = tensor->view_offs;
+  result.data = reinterpret_cast<uint64_t>(tensor->data);
+  snprintf(result.name, GGML_MAX_NAME, "%s", tensor->name);
+  return result;
+}
+
+ggml_tensor *
+deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor) {
+  ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type,
+                                            tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
+  for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) {
+    result->nb[i] = tensor->nb[i];
+  }
+  result->buffer = reinterpret_cast<ggml_backend_buffer_t>(tensor->buffer);
+  if (result->buffer && backend_buffers.find(result->buffer) == backend_buffers.end()) {
+    printf("WARNING: BUFFER NOT FOUND | %p\n", (void *)result->buffer);
+    result->buffer = nullptr;
+  }
+
+  if (result->buffer) {
+    // require that the tensor data does not go beyond the buffer end
+    uint64_t tensor_size = (uint64_t) ggml_nbytes(result);
+    uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer);
+    uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer);
+    GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow
+    GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size);
+  }
+
+  result->op = (ggml_op) tensor->op;
+  for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
+    result->op_params[i] = tensor->op_params[i];
+  }
+  result->flags = tensor->flags;
+  result->data = reinterpret_cast<void *>(tensor->data);
+  ggml_set_name(result, tensor->name);
+  return result;
+}
+
+void
+add_tensor(ggml_tensor * tensor, std::vector<rpc_tensor> & tensors, std::unordered_set<ggml_tensor*> & visited) {
+  if (tensor == nullptr) {
+    return;
+  }
+  if (visited.find(tensor) != visited.end()) {
+    return;
+  }
+  visited.insert(tensor);
+  for (int i = 0; i < GGML_MAX_SRC; i++) {
+    add_tensor(tensor->src[i], tensors, visited);
+  }
+  add_tensor(tensor->view_src, tensors, visited);
+  tensors.push_back(serialize_tensor(tensor));
+}
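To illustrate the traversal (hypothetical graph built elsewhere with the usual ggml API): for a single node out = ggml_add(ctx, a, b), add_tensor() recurses into the sources first, so the collected array lists sources before their consumer, and the visited set deduplicates tensors shared by several nodes:

    std::vector<rpc_tensor> tensors;
    std::unordered_set<ggml_tensor*> visited;
    add_tensor(out, tensors, visited);
    // tensors now holds the serialized copies of a, b, out (in that order)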
+void
+serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & output) {
+  uint32_t n_nodes = cgraph->n_nodes;
+  std::vector<rpc_tensor> tensors;
+  std::unordered_set<ggml_tensor*> visited;
+  for (uint32_t i = 0; i < n_nodes; i++) {
+    add_tensor(cgraph->nodes[i], tensors, visited);
+  }
+  // serialization format:
+  // | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t)) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) |
+  uint32_t n_tensors = tensors.size();
+  int output_size = sizeof(uint32_t) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t) + n_tensors * sizeof(rpc_tensor);
+  output.resize(output_size, 0);
+  memcpy(output.data(), &n_nodes, sizeof(n_nodes));
+  for (uint32_t i = 0; i < n_nodes; i++) {
+    memcpy(output.data() + sizeof(n_nodes) + i * sizeof(uint64_t), &cgraph->nodes[i], sizeof(uint64_t));
+  }
+  uint32_t * out_ntensors = (uint32_t *)(output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t));
+  *out_ntensors = n_tensors;
+  rpc_tensor * out_tensors = (rpc_tensor *)(output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t));
+  memcpy(out_tensors, tensors.data(), n_tensors * sizeof(rpc_tensor));
+}
+
+ggml_tensor *
+create_node(uint64_t id,
+            struct ggml_context * ctx,
+            const std::unordered_map<uint64_t, const rpc_tensor*> & tensor_ptrs,
+            std::unordered_map<uint64_t, ggml_tensor*> & tensor_map) {
+  if (id == 0) {
+    return nullptr;
+  }
+  if (tensor_map.find(id) != tensor_map.end()) {
+    return tensor_map[id];
+  }
+  const rpc_tensor * tensor = tensor_ptrs.at(id);
+  struct ggml_tensor * result = deserialize_tensor(ctx, tensor);
+  if (result == nullptr) {
+    return nullptr;
+  }
+  tensor_map[id] = result;
+  for (int i = 0; i < GGML_MAX_SRC; i++) {
+    result->src[i] = create_node(tensor->src[i], ctx, tensor_ptrs, tensor_map);
+  }
+  result->view_src = create_node(tensor->view_src, ctx, tensor_ptrs, tensor_map);
+  result->view_offs = tensor->view_offs;
+  return result;
+}
+
+ggml_cgraph *
+deserialize_graph(uint32_t n_nodes, uint32_t n_tensors, const rpc_tensor * tensors, const uint64_t * nodes) {
+  size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);
+  struct ggml_init_params params = {
+    /*.mem_size   =*/ buf_size,
+    /*.mem_buffer =*/ NULL,
+    /*.no_alloc   =*/ true,
+  };
+  struct ggml_context * ctx = ggml_init(params);
+  struct ggml_cgraph * graph = ggml_new_graph_custom(ctx, n_nodes, false);
+  graph->n_nodes = n_nodes;
+  std::unordered_map<uint64_t, const rpc_tensor*> tensor_ptrs;
+  for (uint32_t i = 0; i < n_tensors; i++) {
+    tensor_ptrs[tensors[i].id] = &tensors[i];
+  }
+  std::unordered_map<uint64_t, ggml_tensor*> tensor_map;
+  for (uint32_t i = 0; i < n_nodes; i++) {
+    int64_t id;
+    memcpy(&id, &nodes[i], sizeof(id));
+    graph->nodes[i] = create_node(id, ctx, tensor_ptrs, tensor_map);
+  }
+
+  return graph;
+}
diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.h
new file mode 100644
index 0000000000000..96402287af7fc
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.h
@@ -0,0 +1,45 @@
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+// ggml_tensor is serialized into rpc_tensor
+struct rpc_tensor {
+  uint64_t id;
+  uint32_t type;
+  uint64_t buffer;
+  uint32_t ne[GGML_MAX_DIMS];
+  uint32_t nb[GGML_MAX_DIMS];
+  uint32_t op;
+  int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
+  int32_t flags;
+  uint64_t src[GGML_MAX_SRC];
+  uint64_t view_src;
+  uint64_t view_offs;
+  uint64_t data;
+  char name[GGML_MAX_NAME];
+
+  char padding[4];
+};
+
+/* frontend */
+
+rpc_tensor serialize_tensor(const ggml_tensor * tensor);
+
+void serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & output);
+
+/* backend */
+
+void track_backend_buffer(ggml_backend_buffer_t buffer);
+bool untrack_backend_buffer(ggml_backend_buffer_t buffer);
+std::unordered_set<ggml_backend_buffer_t> get_track_backend_buffers();
+
+void add_tensor(ggml_tensor * tensor, std::vector<rpc_tensor> & tensors, std::unordered_set<ggml_tensor*> & visited);
+
+ggml_tensor *deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor);
+
+ggml_tensor *create_node(uint64_t id,
+                         struct ggml_context * ctx,
+                         const std::unordered_map<uint64_t, const rpc_tensor*> & tensor_ptrs,
+                         std::unordered_map<uint64_t, ggml_tensor*> & tensor_map);
+
+ggml_cgraph *deserialize_graph(uint32_t n_nodes, uint32_t n_tensors, const rpc_tensor * tensors, const uint64_t * nodes);
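A worked size computation under the serialization format documented above (illustrative numbers): a graph with one node whose traversal collects three tensors produces

    // | n_nodes | node ids        | n_tensors | rpc_tensor records     |
    // | 4 bytes | 1 * 8 = 8 bytes | 4 bytes   | 3 * sizeof(rpc_tensor) |
    size_t payload_size = 4 + 1 * 8 + 4 + 3 * sizeof(rpc_tensor);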
diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h
new file mode 100644
index 0000000000000..71c9b3f3ed820
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h
@@ -0,0 +1,236 @@
+// needs the ggml-backend-impl.h definition
+// needs venus_cs.h definition
+
+#include "venus_cs_ggml-rpc.h"
+
+// needs
+// ggml_buffer_to_apir_host_handle(ggml_backend_buffer_t buffer);
+
+static inline void
+vn_encode_ggml_buffer_handle(struct vn_cs_encoder *enc, const apir_buffer_host_handle_t *handle);
+
+static inline ggml_backend_buffer_t
+vn_decode_ggml_buffer(struct vn_cs_decoder *dec);
+
+/* rpc_tensor */
+
+static inline void
+vn_encode_rpc_tensor(struct vn_cs_encoder *enc, const rpc_tensor *rpc_tensor) {
+  size_t rpc_tensor_size = sizeof(*rpc_tensor);
+  vn_encode(enc, rpc_tensor_size, rpc_tensor, rpc_tensor_size);
+}
+
+static inline rpc_tensor *
+vn_decode_rpc_tensor_inplace(struct vn_cs_decoder *dec) {
+  size_t rpc_tensor_size = sizeof(rpc_tensor);
+
+  return (rpc_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, rpc_tensor_size);
+}
+
+static inline rpc_tensor *
+vn_decode_rpc_tensor_array_inplace(struct vn_cs_decoder *dec, uint32_t n_tensors) {
+  size_t rpc_tensor_size = sizeof(rpc_tensor) * n_tensors;
+
+  return (rpc_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, rpc_tensor_size);
+}
+
+/* ggml_tensor */
+
+static inline void
+vn_encode_ggml_tensor(struct vn_cs_encoder *enc, const ggml_tensor *tensor) {
+  rpc_tensor serialized = serialize_tensor(tensor);
+
+  vn_encode_rpc_tensor(enc, &serialized);
+}
+
+static inline const ggml_tensor *
+vn_decode_ggml_tensor(struct vn_cs_decoder *dec) {
+  const rpc_tensor *rpc_tensor = vn_decode_rpc_tensor_inplace(dec);
+  struct ggml_init_params params {
+    /*.mem_size   =*/ ggml_tensor_overhead(),
+    /*.mem_buffer =*/ NULL,
+    /*.no_alloc   =*/ true,
+  };
+  struct ggml_context * ctx = ggml_init(params);
+
+  const ggml_tensor *tensor = deserialize_tensor(ctx, rpc_tensor);
+
+  return tensor;
+}
+
+/* *** ggml_backend_buffer_type_t *** */
+
+// ggml_backend_buffer_type_t is a POINTER (to a struct).
+// Only the host pointer is shared between the host and guest.
+// The guest stores it in `buft->context`.
+// The host simply writes the pointer address into the buffer variable.
+
+
+static inline void
+vn_encode_ggml_buffer_type(struct vn_cs_encoder *enc, ggml_backend_buffer_type_t buft) {
+  apir_buffer_type_host_handle_t handle = ggml_buffer_type_to_apir_handle(buft);
+  vn_cs_encoder_write(enc, sizeof(handle), &handle, sizeof(handle));
+}
+
+static inline ggml_backend_buffer_type_t
+vn_decode_ggml_buffer_type(struct vn_cs_decoder *dec) {
+  apir_buffer_type_host_handle_t handle;
+
+  vn_cs_decoder_read(dec, sizeof(handle), &handle, sizeof(handle));
+
+  return (ggml_backend_buffer_type_t) handle;
+}
+
+static inline apir_buffer_type_host_handle_t
+vn_decode_apir_buffer_type_host_handle(struct vn_cs_decoder *dec) {
+  apir_buffer_type_host_handle_t handle;
+
+  vn_cs_decoder_read(dec, sizeof(handle), &handle, sizeof(handle));
+
+  return handle;
+}
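The pointer-as-handle convention can be spelled out with a short sketch (assuming the guest stores the handle in buft->context, as described above): no lookup table is involved, the handle is the host's own pointer transported as a uintptr_t-sized integer:

    // guest side: ggml_buffer_type_to_apir_handle() reads the handle out of buft->context
    apir_buffer_type_host_handle_t handle = ggml_buffer_type_to_apir_handle(buft);

    // host side: casting the integer back yields the host's ggml_backend_buffer_type_t
    ggml_backend_buffer_type_t host_buft = (ggml_backend_buffer_type_t) handle;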
+
+/* *** ggml_backend_buffer_t *** */
+
+// ggml_backend_buffer_t is a POINTER.
+// same logic as for ggml_backend_buffer_type_t
+
+static inline void
+vn_encode_ggml_buffer(struct vn_cs_encoder *enc, const ggml_backend_buffer_t buffer) {
+  apir_buffer_host_handle_t handle = BUFFER_TO_HOST_HANDLE(buffer);
+  vn_cs_encoder_write(enc, sizeof(handle), &handle, sizeof(handle));
+}
+
+static inline ggml_backend_buffer_t
+vn_decode_ggml_buffer(struct vn_cs_decoder *dec) {
+  ggml_backend_buffer_t buffer;
+  size_t buffer_ptr_size = sizeof(buffer);
+
+  vn_cs_decoder_read(dec, buffer_ptr_size, &buffer, buffer_ptr_size);
+
+  return buffer;
+}
+
+/* enum ggml_status */
+
+static inline void
+vn_encode_ggml_status(struct vn_cs_encoder *enc, const enum ggml_status *status) {
+  vn_cs_encoder_write(enc, sizeof(*status), status, sizeof(*status));
+}
+
+static inline void
+vn_decode_ggml_status(struct vn_cs_decoder *dec, enum ggml_status *status) {
+  vn_cs_decoder_read(dec, sizeof(*status), status, sizeof(*status));
+}
+
+/* vn_renderer_shmem */
+
+static inline void
+vn_encode_virtgpu_shmem_res_id(struct vn_cs_encoder *enc, uint32_t shmem_res_id) {
+  vn_encode_uint32_t(enc, &shmem_res_id);
+}
+
+static inline void
+vn_decode_virtgpu_shmem_res_id(struct vn_cs_decoder *dec, uint32_t *shmem_res_id) {
+  vn_decode_uint32_t(dec, shmem_res_id);
+}
+
+/* ggml_cgraph */
+
+static inline size_t
+vn_serialize_ggml_cgraph(ggml_cgraph *cgraph, std::vector<uint8_t> & cgraph_data) {
+  serialize_graph(cgraph, cgraph_data);
+
+  return cgraph_data.size();
+}
+
+static inline void
+vn_encode_cgraph_data(struct vn_cs_encoder *enc, std::vector<uint8_t> & cgraph_data) {
+  size_t cgraph_size = cgraph_data.size();
+
+  vn_encode(enc, cgraph_size, cgraph_data.data(), cgraph_size);
+}
+
+static inline ggml_cgraph *
+vn_decode_ggml_cgraph(struct vn_cs_decoder *dec, size_t cgraph_size) {
+  UNUSED(cgraph_size);
+
+  uint32_t n_nodes;
+  vn_decode_uint32_t(dec, &n_nodes);
+  const uint64_t * nodes = vn_decode_uint64_t_array_inplace(dec, n_nodes);
+
+  uint32_t n_tensors;
+  vn_decode_uint32_t(dec, &n_tensors);
+  const rpc_tensor *tensors = vn_decode_rpc_tensor_array_inplace(dec, n_tensors);
+
+  return deserialize_graph(n_nodes, n_tensors, tensors, nodes);
+}
+
+static inline void
+vn_encode_ggml_buffer_handle(struct vn_cs_encoder *enc, const apir_buffer_host_handle_t *handle) {
+  vn_cs_encoder_write(enc, sizeof(*handle), handle, sizeof(*handle));
+}
+
+static inline void
+vn_encode_ggml_tensor_inline(struct vn_cs_encoder *enc, const ggml_tensor *tensor) {
+  size_t tensor_size = sizeof(*tensor);
+
+  if (tensor->extra) {
+    FATAL("Cannot pass tensors with extra");
+  }
+
+  if (tensor->src[0] && tensor->buffer) {
+    static int first = 1;
+    if (first) {
+      // not sure if the buffer needs to be updated inside the src tensors or not
+      WARNING("Cannot pass tensors with src and buffer");
+      first = 0;
+    }
+  }
+
+  vn_cs_encoder_write(enc, tensor_size, tensor, tensor_size);
+
+  // tensor->data is a pointer inside the device buffer. No need to touch it.
+  // tensor->buffer is a pointer to a buffer. Encode the buffer handle in sequence.
+  // (could also make a copy of the tensor, and update locally.)
+
+  if (tensor->buffer) {
+    apir_buffer_host_handle_t buffer_handle = ggml_buffer_to_apir_handle(tensor->buffer);
+    vn_encode_ggml_buffer_handle(enc, &buffer_handle);
+  }
+
+  if (tensor->view_src) {
+    vn_cs_encoder_write(enc, tensor_size, tensor->view_src, tensor_size);
+  }
+
+  for (int i = 0; tensor->src[i]; i++) {
+    const ggml_tensor *tensor_src = tensor->src[i];
+    vn_cs_encoder_write(enc, tensor_size, tensor_src, tensor_size);
+  }
+}
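For reference, the wire layout produced by vn_encode_ggml_tensor_inline() above is, in order (a summary of the code, not a separate specification):

    // | sizeof(ggml_tensor): the tensor itself                         |
    // | sizeof(apir_buffer_host_handle_t): only if tensor->buffer      |
    // | sizeof(ggml_tensor): only if tensor->view_src                  |
    // | sizeof(ggml_tensor) per src, until the first NULL src pointer  |

The decoder below consumes the same sequence in place and patches the buffer, view_src, and src pointers as it goes.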
+
+static inline const ggml_tensor *
+vn_decode_ggml_tensor_inplace(struct vn_cs_decoder *dec) {
+
+  // it is safe to remove the `const` qualifier here, we *do* want to
+  // modify the shared memory data to fix the `src` pointers.
+  ggml_tensor *tensor = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor));
+
+  // tensor->data is a pointer inside the device buffer. No need to touch it.
+  // tensor->buffer is a pointer to a buffer. Decode the buffer handle encoded in sequence.
+  if (tensor->buffer) {
+    tensor->buffer = vn_decode_ggml_buffer(dec);
+  }
+
+  if (tensor->view_src) {
+    ggml_tensor *tensor_view_src = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor));
+    tensor->view_src = tensor_view_src;
+  }
+
+  for (int i = 0; tensor->src[i]; i++) {
+    ggml_tensor *tensor_src = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor));
+    tensor->src[i] = tensor_src; // overwrite op->src[i] pointer with the actual location of the src tensor
+  }
+
+  return tensor;
+}
diff --git a/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp b/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp
new file mode 100644
index 0000000000000..30ae511aa95e8
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp
@@ -0,0 +1,118 @@
+#include <cstdio>
+#include <cstring>
+#include <unordered_set>
+#include <vector>
+
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+#include "shared/venus_cs_ggml-rpc.h"
+
+std::unordered_set<ggml_backend_buffer_t> backend_buffers;
+
+void
+track_backend_buffer(ggml_backend_buffer_t buffer) {
+  backend_buffers.insert(buffer);
+}
+
+bool
+untrack_backend_buffer(ggml_backend_buffer_t buffer) {
+  auto it = backend_buffers.find(buffer);
+  if (it == backend_buffers.end()) {
+    return false;
+  }
+
+  backend_buffers.erase(it);
+  return true;
+}
+
+std::unordered_set<ggml_backend_buffer_t>
+get_track_backend_buffers() {
+  return backend_buffers;
+}
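The backend-side deserialize_tensor() below rebases tensor->data, which the frontend serialized as an offset into its buffer; a worked example with made-up addresses:

    uint64_t offset       = 0x1000;            // rpc_tensor.data as sent by the guest
    uint64_t buffer_start = 0x7f0000000000;    // host base address of the backend buffer
    uint64_t tensor_data  = offset + buffer_start;  // 0x7f0000001000, then bounds-checked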
+ggml_tensor *
+deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor) {
+  ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type,
+                                            tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
+  for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) {
+    result->nb[i] = tensor->nb[i];
+  }
+  result->buffer = reinterpret_cast<ggml_backend_buffer_t>(tensor->buffer);
+  if (result->buffer && backend_buffers.find(result->buffer) == backend_buffers.end()) {
+    printf("WARNING: BUFFER NOT FOUND | %p\n", (void *)result->buffer);
+    result->buffer = nullptr;
+  }
+
+  uint64_t tensor_data = tensor->data;
+  if (result->buffer) {
+    // require that the tensor data does not go beyond the buffer end
+    uint64_t tensor_size = (uint64_t) ggml_nbytes(result);
+    uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer);
+    uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer);
+
+    // tensor->data is serialized as an offset to the buffer base address
+    tensor_data += buffer_start;
+
+    GGML_ASSERT(tensor_data + tensor_size >= tensor_data); // check for overflow
+    GGML_ASSERT(tensor_data >= buffer_start && tensor_data + tensor_size <= buffer_start + buffer_size);
+  }
+
+  result->op = (ggml_op) tensor->op;
+  for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
+    result->op_params[i] = tensor->op_params[i];
+  }
+  result->flags = tensor->flags;
+  result->data = reinterpret_cast<void *>(tensor_data);
+  ggml_set_name(result, tensor->name);
+  return result;
+}
+
+ggml_tensor *
+create_node(uint64_t id,
+            struct ggml_context * ctx,
+            const std::unordered_map<uint64_t, const rpc_tensor*> & tensor_ptrs,
+            std::unordered_map<uint64_t, ggml_tensor*> & tensor_map) {
+  if (id == 0) {
+    return nullptr;
+  }
+  if (tensor_map.find(id) != tensor_map.end()) {
+    return tensor_map[id];
+  }
+  const rpc_tensor * tensor = tensor_ptrs.at(id);
+  struct ggml_tensor * result = deserialize_tensor(ctx, tensor);
+  if (result == nullptr) {
+    return nullptr;
+  }
+  tensor_map[id] = result;
+  for (int i = 0; i < GGML_MAX_SRC; i++) {
+    result->src[i] = create_node(tensor->src[i], ctx, tensor_ptrs, tensor_map);
+  }
+  result->view_src = create_node(tensor->view_src, ctx, tensor_ptrs, tensor_map);
+  result->view_offs = tensor->view_offs;
+  return result;
+}
+
+ggml_cgraph *
+deserialize_graph(uint32_t n_nodes, uint32_t n_tensors, const rpc_tensor * tensors, const uint64_t * nodes) {
+  size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);
+  struct ggml_init_params params = {
+    /*.mem_size   =*/ buf_size,
+    /*.mem_buffer =*/ NULL,
+    /*.no_alloc   =*/ true,
+  };
+  struct ggml_context * ctx = ggml_init(params);
+  struct ggml_cgraph * graph = ggml_new_graph_custom(ctx, n_nodes, false);
+  graph->n_nodes = n_nodes;
+  std::unordered_map<uint64_t, const rpc_tensor*> tensor_ptrs;
+  for (uint32_t i = 0; i < n_tensors; i++) {
+    tensor_ptrs[tensors[i].id] = &tensors[i];
+  }
+  std::unordered_map<uint64_t, ggml_tensor*> tensor_map;
+  for (uint32_t i = 0; i < n_nodes; i++) {
+    int64_t id;
+    memcpy(&id, &nodes[i], sizeof(id));
+    graph->nodes[i] = create_node(id, ctx, tensor_ptrs, tensor_map);
+  }
+
+  return graph;
+}
diff --git a/ggml/src/ggml-remotingfrontend/CMakeLists.txt b/ggml/src/ggml-remotingfrontend/CMakeLists.txt
new file mode 100644
index 0000000000000..f3f3dea652cf9
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/CMakeLists.txt
@@ -0,0 +1,32 @@
+cmake_minimum_required(VERSION 3.19)
+cmake_policy(SET CMP0114 NEW)
+
+message(STATUS "Enable API Remoting frontend")
+
+ggml_add_backend_library(ggml-remotingfrontend
+                         ggml-backend-buffer.cpp
+                         ggml-backend.cpp
+                         ggml-backend-device.cpp
+                         ggml-backend-reg.cpp
+                         ggml-backend-buffer-type.cpp
+                         ggml-backend-host-buffer-type.cpp
+                         virtgpu.cpp
+                         virtgpu-shm.cpp
+                         virtgpu-utils.cpp
+                         virtgpu-forward-device.cpp
+                         virtgpu-forward-buffer-type.cpp
+                         virtgpu-forward-buffer.cpp
+                         virtgpu-forward-backend.cpp
+                         virtgpu-forward-impl.h
+                         ../../include/ggml-remoting-frontend.h
+                         venus_cs_ggml-rpc-front.cpp
+                         )
+
+# dnf install -y libdrm-devel
+target_link_libraries(ggml-remotingfrontend PUBLIC drm)
+target_include_directories(ggml-remotingfrontend PUBLIC /usr/include/libdrm/)
+target_include_directories(ggml-remotingfrontend PUBLIC ./include)
+
+target_include_directories(ggml-remotingfrontend PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
+
+target_compile_options(ggml-remotingfrontend PRIVATE -std=c++20)
diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp
new file mode 100644
index 0000000000000..b655b8018f80d
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp
@@ -0,0 +1,98 @@
+#include "ggml-remoting.h"
+
+#define BUFT_TO_GPU(name) \
+  ((struct ggml_backend_remoting_device_context *) (name)->device->context)->gpu
+ +static ggml_backend_buffer_t +ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + IMPLEMENTED_ONCE; + struct virtgpu *gpu = BUFT_TO_GPU(buft); + + struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) malloc(sizeof(*context)); + if (!context) { + FATAL("Couldn't allocate the buffer context ..."); + } + + context->gpu = gpu; + + const int USE_FROM_PTR = true; + + if (USE_FROM_PTR) { + context->apir_context = apir_device_buffer_from_ptr(gpu, size, size); + context->base = context->apir_context.shmem->mmap_ptr; + context->is_from_ptr = true; + } else { + context->apir_context = apir_buffer_type_alloc_buffer(gpu, buft, size); + context->is_from_ptr = false; + context->base = NULL; + } + context->is_host_buffer = false; + + ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) context, size); + + return buffer; +} + +static const char * +ggml_backend_remoting_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + IMPLEMENTED_ONCE; + + struct virtgpu *gpu = BUFT_TO_GPU(buft); + + return apir_buffer_type_get_name(gpu, buft); +} + +static size_t +ggml_backend_remoting_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + IMPLEMENTED_ONCE; + struct virtgpu *gpu = BUFT_TO_GPU(buft); + + static size_t align = 0; + + if (align == 0) { + align = apir_buffer_type_get_alignment(gpu, buft); + } + + return align; +} + +static size_t +ggml_backend_remoting_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + IMPLEMENTED_ONCE; + struct virtgpu *gpu = BUFT_TO_GPU(buft); + + static size_t max_size = 0; + if (max_size == 0) { + max_size = apir_buffer_type_get_max_size(gpu, buft); + } + + return max_size; +} + +static bool +ggml_backend_remoting_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + IMPLEMENTED; + struct virtgpu *gpu = BUFT_TO_GPU(buft); + + return apir_buffer_type_is_host(gpu, buft); +} + +const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = { + /* .get_name = */ ggml_backend_remoting_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_remoting_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_remoting_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_remoting_buffer_type_get_max_size, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .is_host = */ NULL, +}; + +const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_from_ptr_type_interface = { + /* .get_name = */ ggml_backend_remoting_buffer_type_get_name, + /* .alloc_buffer = */ NULL, + /* .get_alignment = */ ggml_backend_remoting_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_remoting_buffer_type_get_max_size, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .is_host = */ NULL, +}; + +/****************************************************************************************/ diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp new file mode 100644 index 0000000000000..e720efcf47c69 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -0,0 +1,167 @@ +#include "ggml-remoting.h" + +#define BUFFER_TO_GPU(name) \ + ((struct ggml_backend_remoting_buffer_context *) (name)->context)->gpu + +struct timer_data get_tensor_timer = {0, 0, 0, "get_tensor"}; +struct timer_data set_tensor_timer = {0, 0, 0, "set_tensor"}; + +struct timer_data get_tensor_from_ptr_timer = {0, 0, 0, 
"get_tensor_from_ptr"}; +struct timer_data set_tensor_from_ptr_timer = {0, 0, 0, "set_tensor_from_ptr"}; + +static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) { + IMPLEMENTED_ONCE; + + struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) buffer->context; + if (context->base) { + return context->base; + } + + context->base = apir_buffer_get_base(BUFFER_TO_GPU(buffer), + BUFFER_TO_APIR_CONTEXT(buffer)); + + return context->base; +} + +static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { + NOT_IMPLEMENTED; + + STOP_HERE; + + UNUSED(buffer); + UNUSED(tensor); + UNUSED(value); + UNUSED(offset); + UNUSED(size); +} + +static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + IMPLEMENTED_ONCE; + + start_timer(&set_tensor_timer); + + struct virtgpu *gpu = BUFFER_TO_GPU(buffer); +#if 0 + INFO("%s: data=%p, offset=%lu, size=%lu\n", __func__, data, offset, size); +#endif +#if 0 + void **addr = (void **)(uintptr_t)data; + for (int i = 0; i <= 10; i++) { + INFO("%s: %p | %llx", __func__, addr, *addr); + addr++; + } + INFO("\n"); +#endif + struct ggml_backend_remoting_buffer_context *context = BUFFER_TO_GGML_CONTEXT(buffer); + if (context->is_from_ptr) { + memcpy((char *)tensor->data + offset, data, size); + } else { + apir_buffer_set_tensor(gpu, BUFFER_TO_APIR_CONTEXT(buffer), tensor, data, offset, size); + } + + stop_timer(&set_tensor_timer); + + return; +} + +static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + IMPLEMENTED_ONCE; + + start_timer(&get_tensor_timer); + + struct virtgpu *gpu = BUFFER_TO_GPU(buffer); + struct ggml_backend_remoting_buffer_context *context = BUFFER_TO_GGML_CONTEXT(buffer); + if (context->is_from_ptr) { + memcpy(data, (const char *)tensor->data + offset, size); + } else { + apir_buffer_get_tensor(gpu, BUFFER_TO_APIR_CONTEXT(buffer), tensor, data, offset, size); + } + + stop_timer(&get_tensor_timer); +} + +static void ggml_backend_remoting_buffer_set_tensor_from_ptr(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + IMPLEMENTED_ONCE; + + start_timer(&set_tensor_from_ptr_timer); + + UNUSED(buffer); + + memcpy((char *)tensor->data + offset, data, size); + + stop_timer(&set_tensor_from_ptr_timer); + + return; +} + +static void ggml_backend_remoting_buffer_get_tensor_from_ptr(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + IMPLEMENTED_ONCE; + + UNUSED(buffer); + + start_timer(&get_tensor_from_ptr_timer); + + memcpy(data, (const char *)tensor->data + offset, size); + + stop_timer(&get_tensor_from_ptr_timer); +} + +static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { + NOT_IMPLEMENTED; + + STOP_HERE; + + return true; + + UNUSED(buffer); + UNUSED(src); + UNUSED(dst); +} + +static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + IMPLEMENTED_ONCE; + + struct virtgpu *gpu = BUFFER_TO_GPU(buffer); + + apir_buffer_clear(gpu, BUFFER_TO_APIR_CONTEXT(buffer), value); + + return; +} + +static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffer) { + UNUSED(buffer); + + 
IMPLEMENTED_ONCE; + + struct virtgpu *gpu = BUFFER_TO_GPU(buffer); + + apir_buffer_free_buffer(gpu, BUFFER_TO_APIR_CONTEXT(buffer)); + + struct ggml_backend_remoting_buffer_context *context = BUFFER_TO_GGML_CONTEXT(buffer); + free(context); + buffer->context = NULL; +} + +const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { + /* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer, + /* .get_base = */ ggml_backend_remoting_buffer_get_base, + /* .init_tensor = */ NULL, + /* .memset_tensor = */ ggml_backend_remoting_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_remoting_buffer_cpy_tensor, + /* .clear = */ ggml_backend_remoting_buffer_clear, + /* .reset = */ NULL, +}; + +const ggml_backend_buffer_i ggml_backend_remoting_buffer_from_ptr_interface = { + /* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer, + /* .get_base = */ ggml_backend_remoting_buffer_get_base, + /* .init_tensor = */ NULL, + /* .memset_tensor = */ ggml_backend_remoting_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor_from_ptr, + /* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor_from_ptr, + /* .cpy_tensor = */ ggml_backend_remoting_buffer_cpy_tensor, + /* .clear = */ ggml_backend_remoting_buffer_clear, + /* .reset = */ NULL, +}; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp new file mode 100644 index 0000000000000..6f498d0edc2e4 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -0,0 +1,201 @@ +#include "ggml-remoting.h" + +static const char * +ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) { + IMPLEMENTED_ONCE; + + struct virtgpu *gpu = DEV_TO_GPU(dev); + + return apir_device_get_name(gpu); +} + +static const char * +ggml_backend_remoting_device_get_description(ggml_backend_dev_t dev) { + IMPLEMENTED; + + struct virtgpu *gpu = DEV_TO_GPU(dev); + + return apir_device_get_description(gpu); +} + +static enum ggml_backend_dev_type +ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) { + IMPLEMENTED_ONCE; + struct virtgpu *gpu = DEV_TO_GPU(dev); + + static enum ggml_backend_dev_type type; + static bool has_type = false; + if (!has_type) { + has_type = true; + type = (enum ggml_backend_dev_type) apir_device_get_type(gpu); + } + + return type; +} + +static void +ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + IMPLEMENTED; + + struct virtgpu *gpu = DEV_TO_GPU(dev); + + return apir_device_get_memory(gpu, free, total); +} + +static bool +ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { + struct virtgpu *gpu = DEV_TO_GPU(dev); + + return apir_device_supports_op(gpu, op); +} + +static bool +ggml_backend_remoting_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + //IMPLEMENTED_ONCE; + +#if 1 + bool supported = buft->device == dev; + if (!supported) { + //WARNING("%s: unsupported buffer type (%s). 
Double check.", __func__, buft->iface.get_name(buft)); + } + + return supported; +#else + UNUSED(dev); + UNUSED(buft); + + return true; +#endif +} + +static bool +ggml_backend_remoting_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { + //IMPLEMENTED_ONCE; + + UNUSED(dev); + UNUSED(op); + + // related to supports_buft, need to confirm + + return false; // same as ggml-metal +} + +static void +ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { + IMPLEMENTED; + + props->name = ggml_backend_remoting_device_get_name(dev); + props->description = ggml_backend_remoting_device_get_description(dev); + props->type = ggml_backend_remoting_device_get_type(dev); + ggml_backend_remoting_device_get_memory(dev, &props->memory_free, &props->memory_total); + +#if 0 + struct virtgpu *gpu = DEV_TO_GPU(dev); + apir_device_get_props(gpu, + &props->caps.async, + &props->caps.host_buffer, + &props->caps.buffer_from_host_ptr, + &props->caps.events + ); +#else + // ignore the actual backend answers and set it as we provide it in + // the API Remoting frontend + props->caps.async = false; + props->caps.host_buffer = false; + props->caps.buffer_from_host_ptr = false; + props->caps.events = false; +#endif + + INFO("%s: async=%d, host_buffer=%d!, buffer_from_host_ptr=%d!, events=%d", + __func__, props->caps.async, props->caps.host_buffer, + props->caps.buffer_from_host_ptr, props->caps.events); +} + +ggml_backend_buffer_type_t +ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { + IMPLEMENTED_ONCE; + + struct virtgpu *gpu = DEV_TO_GPU(dev); + + apir_buffer_type_host_handle_t ctx = apir_device_get_buffer_type(gpu); + + static struct ggml_backend_buffer_type buft { + /* .iface = */ ggml_backend_remoting_buffer_type_interface, + /* .device = */ dev, + /* .context = */ (void *) ctx, + }; + + return &buft; +} + +static ggml_backend_buffer_type_t +ggml_backend_remoting_device_get_buffer_from_ptr_type(ggml_backend_dev_t dev) { + IMPLEMENTED_ONCE; + + struct virtgpu *gpu = DEV_TO_GPU(dev); + + apir_buffer_type_host_handle_t ctx = apir_device_get_buffer_type(gpu); + + static struct ggml_backend_buffer_type buft { + /* .iface = */ ggml_backend_remoting_buffer_from_ptr_type_interface, + /* .device = */ dev, + /* .context = */ (void *) ctx, + }; + + return &buft; +} + +static ggml_backend_buffer_t +ggml_backend_remoting_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { + + struct virtgpu *gpu = DEV_TO_GPU(dev); + + struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) malloc(sizeof(*context)); + if (!context) { + FATAL("Couldn't allocate the buffer context ..."); + } + + context->gpu = gpu; + context->apir_context = apir_device_buffer_from_ptr(gpu, size, max_tensor_size); + context->base = ptr; + context->is_from_ptr = true; + + ggml_backend_buffer_t buffer = ggml_backend_buffer_init(ggml_backend_remoting_device_get_buffer_from_ptr_type(dev), ggml_backend_remoting_buffer_from_ptr_interface, (void *) context, size); + + INFO("#"); + INFO("# %s(%p, %llx) --> %p", __func__, ptr, size, buffer); + INFO("#\n"); + + return buffer; +} + +static ggml_backend_buffer_type_t +ggml_backend_remoting_device_get_host_buffer_type(ggml_backend_dev_t dev) { + IMPLEMENTED_ONCE; + + static struct ggml_backend_buffer_type host_bufft = { + /* .iface = */ ggml_backend_remoting_host_buffer_type_interface, + /* .device = */ dev, + /* .context = */ nullptr, + }; + + return 
&host_bufft; +} + +const struct ggml_backend_device_i ggml_backend_remoting_device_interface = { + /* .get_name = */ ggml_backend_remoting_device_get_name, + /* .get_description = */ ggml_backend_remoting_device_get_description, + /* .get_memory = */ ggml_backend_remoting_device_get_memory, + /* .get_type = */ ggml_backend_remoting_device_get_type, + /* .get_props = */ ggml_backend_remoting_device_get_props, + /* .init_backend = */ ggml_backend_remoting_device_init, + /* .get_buffer_type = */ ggml_backend_remoting_device_get_buffer_type, + /* .get_host_buffer_type = */ NULL, + /* .buffer_from_host_ptr = */ ggml_backend_remoting_device_buffer_from_ptr, + /* .supports_op = */ ggml_backend_remoting_device_supports_op, + /* .supports_buft = */ ggml_backend_remoting_device_supports_buft, + /* .offload_op = */ ggml_backend_remoting_device_offload_op, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, +}; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp new file mode 100644 index 0000000000000..c09c80d6472f5 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp @@ -0,0 +1,110 @@ +#include "ggml-remoting.h" + +#define BUFT_TO_GPU(name) \ + ((struct ggml_backend_remoting_device_context *) (name)->device->context)->gpu + +extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface; + +static void +ggml_backend_remoting_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { + BEING_IMPLEMENTED; + + void *ptr = buffer->context; + + if (ptr == nullptr) { + return; + } + struct ggml_backend_remoting_device_context *device_ctx = GET_DEVICE_CONTEXT(); + + struct vn_renderer_shmem *shmem = nullptr; + size_t index; + + for (size_t i = 0; i < device_ctx->shared_memory.size(); i++) { + const uint8_t* addr = (const uint8_t*) std::get<0>(device_ctx->shared_memory[i]) /* ptr */; + const uint8_t* endr = addr + std::get<1>(device_ctx->shared_memory[i]) /* size */; + if (ptr >= addr && ptr < endr) { + shmem = std::get<2>(device_ctx->shared_memory[i]) /* shmem */; + index = i; + break; + } + } + + if (shmem == nullptr) { + WARNING("failed to free host shared memory: memory not in map\n"); + return; + } + + virtgpu_shmem_destroy(device_ctx->gpu, shmem->shmem); + + device_ctx->shared_memory.erase(device_ctx->shared_memory.begin() + index); +} + +static ggml_backend_buffer_t +ggml_backend_remoting_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + IMPLEMENTED; + + struct virtgpu *gpu = BUFT_TO_GPU(buft); + + struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) malloc(sizeof(*context)); + if (!context) { + FATAL("Couldn't allocate the buffer context ..."); + } + + context->gpu = gpu; + context->apir_context = apir_device_buffer_from_ptr(gpu, size, size); + context->base = context->apir_context.shmem->mmap_ptr; + context->is_host_buffer = true; + + ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) context, size); + INFO("##"); + INFO("## %s(%llx) --> %p <======================", __func__, size, buffer); + INFO("##\n"); + + return buffer; +} + +static const char * +ggml_backend_remoting_host_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + UNUSED(buft); + + IMPLEMENTED_ONCE; + + return "GUEST host buffer"; +} + +static size_t 
+ggml_backend_remoting_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+  UNUSED(buft);
+
+  IMPLEMENTED_ONCE;
+
+  return 64; // not 100% sure ...
+}
+
+static bool
+ggml_backend_remoting_host_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+  UNUSED(buft);
+
+  IMPLEMENTED_ONCE;
+
+  return true;
+}
+
+static size_t
+ggml_backend_remoting_host_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
+  UNUSED(buft);
+
+  IMPLEMENTED;
+  STOP_HERE;
+
+  return SIZE_MAX;
+}
+
+const ggml_backend_buffer_type_i ggml_backend_remoting_host_buffer_type_interface = {
+  /* .get_name       = */ ggml_backend_remoting_host_buffer_type_get_name,
+  /* .alloc_buffer   = */ ggml_backend_remoting_host_buffer_type_alloc_buffer,
+  /* .get_alignment  = */ ggml_backend_remoting_host_buffer_type_get_alignment,
+  /* .get_max_size   = */ ggml_backend_remoting_host_buffer_type_get_max_size,
+  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
+  /* .is_host        = */ ggml_backend_remoting_host_buffer_type_is_host,
+};
diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp
new file mode 100644
index 0000000000000..d0132370d9f91
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp
@@ -0,0 +1,148 @@
+#include <mutex>
+#include <vector>
+
+#include "ggml-remoting.h"
+
+static struct virtgpu *apir_initialize() {
+  static struct virtgpu *apir_gpu_instance = NULL;
+  static bool apir_initialized = false;
+
+  if (apir_initialized) {
+    return apir_gpu_instance;
+  }
+  apir_initialized = true;
+
+  apir_gpu_instance = create_virtgpu();
+  if (!apir_gpu_instance) {
+    FATAL("failed to initialize the virtgpu :/");
+    return NULL;
+  }
+
+  return apir_gpu_instance;
+}
+
+static int ggml_backend_remoting_get_device_count() {
+  IMPLEMENTED;
+
+  struct virtgpu *gpu = apir_initialize();
+  if (!gpu) {
+    WARNING("apir_initialize failed :/");
+    return 0;
+  }
+
+  return apir_device_get_count(gpu);
+}
+
+static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) {
+  UNUSED(reg);
+
+  IMPLEMENTED;
+
+  return ggml_backend_remoting_get_device_count();
+}
+
+static std::vector<ggml_backend_dev_t> devices;
+
+ggml_backend_dev_t ggml_backend_remoting_get_device(size_t device) {
+  GGML_ASSERT(device < devices.size());
+  return devices[device];
+}
+
+static void ggml_backend_remoting_reg_init_devices(ggml_backend_reg_t reg) {
+  IMPLEMENTED;
+
+  if (devices.size() > 0) {
+    INFO("%s: already initialized", __func__);
+    return;
+  }
+
+  struct virtgpu *gpu = apir_initialize();
+  if (!gpu) {
+    FATAL("apir_initialize failed :/");
+    return;
+  }
+
+  static bool initialized = false;
+
+  {
+    static std::mutex mutex;
+    std::lock_guard lock(mutex);
+    if (!initialized) {
+
+      for (int i = 0; i < ggml_backend_remoting_get_device_count(); i++) {
+        ggml_backend_remoting_device_context *ctx = new ggml_backend_remoting_device_context;
+        char desc[256] = "API Remoting device";
+
+        ctx->device = i;
+        ctx->name = GGML_REMOTING_FRONTEND_NAME + std::to_string(i);
+        ctx->description = desc;
+        ctx->gpu = gpu;
+
+        devices.push_back(new ggml_backend_device {
+          /* .iface   = */ ggml_backend_remoting_device_interface,
+          /* .reg     = */ reg,
+          /* .context = */ ctx,
+        });
+      }
+      initialized = true;
+    }
+  }
+}
+
+static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) {
+  UNUSED(reg);
+
+  IMPLEMENTED;
+
+  return ggml_backend_remoting_get_device(device);
+}
diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp
new file mode 100644
index 0000000000000..14f95ec88ff02
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp
@@ -0,0 +1,70 @@
+#include "ggml-remoting.h"
+
+static const char * ggml_backend_remoting_get_name(ggml_backend_t backend) {
+    UNUSED(backend);
+
+    //IMPLEMENTED_ONCE;
+
+    return "API Remoting backend";
+}
+
+static void ggml_backend_remoting_free(ggml_backend_t backend) {
+    IMPLEMENTED;
+
+    delete backend;
+}
+
+struct timer_data graph_compute_timer = {0, 0, 0, "compute_timer"};
+
+static ggml_status ggml_backend_remoting_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    struct virtgpu *gpu = DEV_TO_GPU(backend->device);
+
+    IMPLEMENTED_ONCE;
+
+    start_timer(&graph_compute_timer);
+
+    ggml_status status = apir_backend_graph_compute(gpu, cgraph);
+
+    stop_timer(&graph_compute_timer);
+
+    return status;
+}
+
+static ggml_backend_i ggml_backend_remoting_interface = {
+    /* .get_name           = */ ggml_backend_remoting_get_name,
+    /* .free               = */ ggml_backend_remoting_free,
+    /* .set_tensor_async   = */ NULL, // ggml_backend_remoting_set_tensor_async,
+    /* .get_tensor_async   = */ NULL, // ggml_backend_remoting_get_tensor_async,
+    /* .cpy_tensor_async   = */ NULL, // ggml_backend_remoting_cpy_tensor_async,
+    /* .synchronize        = */ NULL, // ggml_backend_remoting_synchronize,
+    /* .graph_plan_create  = */ NULL,
+    /* .graph_plan_free    = */ NULL,
+    /* .graph_plan_update  = */ NULL,
+    /* .graph_plan_compute = */ NULL,
+    /* .graph_compute      = */ ggml_backend_remoting_graph_compute,
+    /* .event_record       = */ NULL,
+    /* .event_wait         = */ NULL,
+};
+
+static ggml_guid_t ggml_backend_remoting_guid() {
+    static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x14, 0x03, 0x86, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b };
+
+    return &guid;
+}
+
+
+ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params) {
+    UNUSED(params);
+    IMPLEMENTED;
+
+    ggml_backend_remoting_device_context * ctx = (ggml_backend_remoting_device_context *) dev->context;
+
+    ggml_backend_t remoting_backend = new ggml_backend {
+        /* .guid    = */ ggml_backend_remoting_guid(),
+        /* .iface   = */ ggml_backend_remoting_interface,
+        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_remoting_frontend_reg(), ctx->device),
+        /* .context = */ ctx,
+    };
+
+    return remoting_backend;
+}
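Reviewer note: a minimal sketch of the path a caller takes to reach `ggml_backend_remoting_graph_compute()` above. The graph itself is built with the regular ggml API; only the backend comes from this patch, and the helper name below is hypothetical.

```cpp
// Sketch only: run one compute graph on a remoting device.
#include "ggml.h"
#include "ggml-backend.h"

static ggml_status run_on_remoting(ggml_backend_dev_t dev, struct ggml_cgraph * graph) {
    // Calls ggml_backend_remoting_device_init() via the device iface table.
    ggml_backend_t backend = ggml_backend_dev_init(dev, /*params=*/NULL);
    if (backend == NULL) {
        return GGML_STATUS_FAILED;
    }

    // Forwards to apir_backend_graph_compute(): one guest->host round-trip
    // per graph, bracketed by graph_compute_timer.
    ggml_status status = ggml_backend_graph_compute(backend, graph);

    ggml_backend_free(backend);
    return status;
}
```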
diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.cpp b/ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.cpp
new file mode 100644
index 0000000000000..87679fe59a8d3
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.cpp
@@ -0,0 +1,26 @@
+#include <mutex>
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "ggml-remoting-frontend.h"
+#include "remoting.h"
+
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+
+
+
+int ggml_backend_remoting_get_device_count();
+
+
+
+
+struct remoting_device_struct {
+    std::mutex mutex;
+};
diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h
new file mode 100644
index 0000000000000..cd58ed674475d
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h
@@ -0,0 +1,132 @@
+#pragma once
+
+#include <iostream>
+#include <memory>
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include "ggml-remoting-frontend.h"
+
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+#include "virtgpu.h"
+
+#define DEV_TO_GPU(name) \
+    ((struct ggml_backend_remoting_device_context *) (name)->context)->gpu
+
+#define BUFFER_TO_GGML_CONTEXT(name) \
+    ((struct ggml_backend_remoting_buffer_context *) (name)->context)
+
+#define BUFFER_TO_APIR_CONTEXT(name) \
+    &((struct ggml_backend_remoting_buffer_context *) (name)->context)->apir_context
+
+#define BUFFER_TO_HOST_HANDLE(name) \
+    ((struct ggml_backend_remoting_buffer_context *) (name)->context)->apir_context.host_handle
+
+#define GET_DEVICE_CONTEXT() \
+    (struct ggml_backend_remoting_device_context *) ggml_backend_remoting_get_device(0)->context
+
+static inline apir_buffer_type_host_handle_t
+ggml_buffer_type_to_apir_handle(ggml_backend_buffer_type_t buft) {
+    // in the backend, the buffer handle is the buffer pointer
+    return (apir_buffer_type_host_handle_t) buft->context;
+}
+
+#define NOT_IMPLEMENTED \
+    do { \
+        static bool first = true; \
+        if (first) { \
+            printf("\nWARN: ###\nWARN: ### reached unimplemented function %s\nWARN: ###\n\n", __func__); \
+            first = false; \
+        } \
+    } while(0)
+
+#define BEING_IMPLEMENTED \
+    do { \
+        printf("\nINFO: ###\nINFO: ### function being implemented: %s\nINFO: ###\n\n", __func__); \
+    } while(0)
+
+#define NEXT
+
+#define STOP_HERE \
+    thks_bye()
+
+#define BREAKPOINT \
+    breakpoint()
+
+#ifndef NDEBUG
+#define IMPLEMENTED \
+    printf("INFO: ### reached implemented function %s\n", __func__)
+#else
+#define IMPLEMENTED \
+    do {} while(0)
+#endif
+
+#ifndef NDEBUG
+#define IMPLEMENTED_ONCE \
+    do { \
+        static bool first = true; \
+        if (first) { \
+            printf("INFO: ### reached implemented function %s\n", __func__); \
+            first = false; \
+        } \
+    } while(0)
+#else
+#define IMPLEMENTED_ONCE \
+    do {} while(0)
+#endif
+
+#define RMT_LOG_DEBUG(msg) std::cerr << msg << std::endl
+
+struct ggml_backend_remoting_device_context {
+    size_t device;
+    std::string name;
+    std::string description;
+
+    std::vector<std::tuple<void *, size_t, struct vn_renderer_shmem *>> shared_memory;
+
+    struct virtgpu *gpu;
+};
+
+struct ggml_backend_remoting_buffer_context {
+    apir_buffer_context_t apir_context;
+
+    struct virtgpu *gpu;
+
+    void *base;
+
+    bool is_host_buffer;
+    bool is_from_ptr;
+};
+
+extern const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface;
+extern const struct ggml_backend_device_i ggml_backend_remoting_device_interface;
+extern const ggml_backend_buffer_type_i ggml_backend_remoting_host_buffer_type_interface;
+extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface;
+extern const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_from_ptr_type_interface;
+extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_from_ptr_interface;
+
+ggml_backend_dev_t ggml_backend_remoting_get_device(size_t device);
+ggml_backend_buffer_type_t ggml_backend_remoting_host_buffer_type();
+ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params);
+ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev);
+
+struct remoting_buffer_struct;
+typedef std::shared_ptr<remoting_buffer_struct> remoting_buffer;
+typedef std::weak_ptr<remoting_buffer_struct> remoting_buffer_ref;
+
+void ggml_remoting_destroy_buffer(remoting_buffer& buf);
+
+struct remoting_device_struct;
+typedef std::shared_ptr<remoting_device_struct> remoting_device;
+typedef std::weak_ptr<remoting_device_struct> remoting_device_ref;
+
+struct remoting_context_struct {
+    int i;
+};
+typedef std::shared_ptr<remoting_context_struct> remoting_context;
+typedef std::weak_ptr<remoting_context_struct> remoting_context_ref;
+
+static inline apir_buffer_host_handle_t ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) {
+    return BUFFER_TO_HOST_HANDLE(buffer);
+}
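Reviewer note: the `shared_memory` vector in `ggml_backend_remoting_device_context` is the piece of bookkeeping the host-buffer free path relies on. A self-contained sketch of that contract, assuming the tuple layout reconstructed above (base pointer, size, shmem handle); `vn_renderer_shmem` is treated as opaque here.

```cpp
// Sketch only: pointer-range lookup over mapped shmem regions, mirroring
// ggml_backend_remoting_host_buffer_free_buffer() in
// ggml-backend-host-buffer-type.cpp.
#include <cstddef>
#include <cstdint>
#include <tuple>
#include <vector>

struct vn_renderer_shmem;  // opaque in this sketch

using shmem_entry = std::tuple<void *, size_t, vn_renderer_shmem *>;

static vn_renderer_shmem *find_region(const std::vector<shmem_entry> &map, const void *ptr) {
    const uint8_t *p = (const uint8_t *) ptr;
    for (const auto & [base, size, shmem] : map) {
        const uint8_t *lo = (const uint8_t *) base;
        if (p >= lo && p < lo + size) {
            return shmem;  // ptr falls inside this mapped region
        }
    }
    return nullptr;  // not a pointer this device handed out
}
```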
diff --git a/ggml/src/ggml-remotingfrontend/include/drm-uapi/drm.h b/ggml/src/ggml-remotingfrontend/include/drm-uapi/drm.h
new file mode 100644
index 0000000000000..4e4f7c2c39e4f
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/include/drm-uapi/drm.h
@@ -0,0 +1,1408 @@
+/*
+ * Header for the Direct Rendering Manager
+ *
+ * Author: Rickard E. (Rik) Faith <faith@valinux.com>
+ *
+ * Acknowledgments:
+ * Dec 1999, Richard Henderson <rth@twiddle.net>, move to generic cmpxchg.
+ */
+
+/*
+ * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas.
+ * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California.
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */ + +#ifndef _DRM_H_ +#define _DRM_H_ + +#if defined(__linux__) + +#include +#include +typedef unsigned int drm_handle_t; + +#else /* One of the BSDs */ + +#include +#include +#include +typedef int8_t __s8; +typedef uint8_t __u8; +typedef int16_t __s16; +typedef uint16_t __u16; +typedef int32_t __s32; +typedef uint32_t __u32; +typedef int64_t __s64; +typedef uint64_t __u64; +typedef size_t __kernel_size_t; +typedef unsigned long drm_handle_t; + +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +#define DRM_NAME "drm" /**< Name in kernel, /dev, and /proc */ +#define DRM_MIN_ORDER 5 /**< At least 2^5 bytes = 32 bytes */ +#define DRM_MAX_ORDER 22 /**< Up to 2^22 bytes = 4MB */ +#define DRM_RAM_PERCENT 10 /**< How much system ram can we lock? */ + +#define _DRM_LOCK_HELD 0x80000000U /**< Hardware lock is held */ +#define _DRM_LOCK_CONT 0x40000000U /**< Hardware lock is contended */ +#define _DRM_LOCK_IS_HELD(lock) ((lock) & _DRM_LOCK_HELD) +#define _DRM_LOCK_IS_CONT(lock) ((lock) & _DRM_LOCK_CONT) +#define _DRM_LOCKING_CONTEXT(lock) ((lock) & ~(_DRM_LOCK_HELD|_DRM_LOCK_CONT)) + +typedef unsigned int drm_context_t; +typedef unsigned int drm_drawable_t; +typedef unsigned int drm_magic_t; + +/* + * Cliprect. + * + * \warning: If you change this structure, make sure you change + * XF86DRIClipRectRec in the server as well + * + * \note KW: Actually it's illegal to change either for + * backwards-compatibility reasons. + */ +struct drm_clip_rect { + unsigned short x1; + unsigned short y1; + unsigned short x2; + unsigned short y2; +}; + +/* + * Drawable information. + */ +struct drm_drawable_info { + unsigned int num_rects; + struct drm_clip_rect *rects; +}; + +/* + * Texture region, + */ +struct drm_tex_region { + unsigned char next; + unsigned char prev; + unsigned char in_use; + unsigned char padding; + unsigned int age; +}; + +/* + * Hardware lock. + * + * The lock structure is a simple cache-line aligned integer. To avoid + * processor bus contention on a multiprocessor system, there should not be any + * other data stored in the same cache line. + */ +struct drm_hw_lock { + __volatile__ unsigned int lock; /**< lock variable */ + char padding[60]; /**< Pad to cache line */ +}; + +/* + * DRM_IOCTL_VERSION ioctl argument type. + * + * \sa drmGetVersion(). + */ +struct drm_version { + int version_major; /**< Major version */ + int version_minor; /**< Minor version */ + int version_patchlevel; /**< Patch level */ + __kernel_size_t name_len; /**< Length of name buffer */ + char *name; /**< Name of driver */ + __kernel_size_t date_len; /**< Length of date buffer */ + char *date; /**< User-space buffer to hold date */ + __kernel_size_t desc_len; /**< Length of desc buffer */ + char *desc; /**< User-space buffer to hold desc */ +}; + +/* + * DRM_IOCTL_GET_UNIQUE ioctl argument type. + * + * \sa drmGetBusid() and drmSetBusId(). + */ +struct drm_unique { + __kernel_size_t unique_len; /**< Length of unique */ + char *unique; /**< Unique name for driver instantiation */ +}; + +struct drm_list { + int count; /**< Length of user-space structures */ + struct drm_version *version; +}; + +struct drm_block { + int unused; +}; + +/* + * DRM_IOCTL_CONTROL ioctl argument type. + * + * \sa drmCtlInstHandler() and drmCtlUninstHandler(). + */ +struct drm_control { + enum { + DRM_ADD_COMMAND, + DRM_RM_COMMAND, + DRM_INST_HANDLER, + DRM_UNINST_HANDLER + } func; + int irq; +}; + +/* + * Type of memory to map. 
+ */ +enum drm_map_type { + _DRM_FRAME_BUFFER = 0, /**< WC (no caching), no core dump */ + _DRM_REGISTERS = 1, /**< no caching, no core dump */ + _DRM_SHM = 2, /**< shared, cached */ + _DRM_AGP = 3, /**< AGP/GART */ + _DRM_SCATTER_GATHER = 4, /**< Scatter/gather memory for PCI DMA */ + _DRM_CONSISTENT = 5 /**< Consistent memory for PCI DMA */ +}; + +/* + * Memory mapping flags. + */ +enum drm_map_flags { + _DRM_RESTRICTED = 0x01, /**< Cannot be mapped to user-virtual */ + _DRM_READ_ONLY = 0x02, + _DRM_LOCKED = 0x04, /**< shared, cached, locked */ + _DRM_KERNEL = 0x08, /**< kernel requires access */ + _DRM_WRITE_COMBINING = 0x10, /**< use write-combining if available */ + _DRM_CONTAINS_LOCK = 0x20, /**< SHM page that contains lock */ + _DRM_REMOVABLE = 0x40, /**< Removable mapping */ + _DRM_DRIVER = 0x80 /**< Managed by driver */ +}; + +struct drm_ctx_priv_map { + unsigned int ctx_id; /**< Context requesting private mapping */ + void *handle; /**< Handle of map */ +}; + +/* + * DRM_IOCTL_GET_MAP, DRM_IOCTL_ADD_MAP and DRM_IOCTL_RM_MAP ioctls + * argument type. + * + * \sa drmAddMap(). + */ +struct drm_map { + unsigned long offset; /**< Requested physical address (0 for SAREA)*/ + unsigned long size; /**< Requested physical size (bytes) */ + enum drm_map_type type; /**< Type of memory to map */ + enum drm_map_flags flags; /**< Flags */ + void *handle; /**< User-space: "Handle" to pass to mmap() */ + /**< Kernel-space: kernel-virtual address */ + int mtrr; /**< MTRR slot used */ + /* Private data */ +}; + +/* + * DRM_IOCTL_GET_CLIENT ioctl argument type. + */ +struct drm_client { + int idx; /**< Which client desired? */ + int auth; /**< Is client authenticated? */ + unsigned long pid; /**< Process ID */ + unsigned long uid; /**< User ID */ + unsigned long magic; /**< Magic */ + unsigned long iocs; /**< Ioctl count */ +}; + +enum drm_stat_type { + _DRM_STAT_LOCK, + _DRM_STAT_OPENS, + _DRM_STAT_CLOSES, + _DRM_STAT_IOCTLS, + _DRM_STAT_LOCKS, + _DRM_STAT_UNLOCKS, + _DRM_STAT_VALUE, /**< Generic value */ + _DRM_STAT_BYTE, /**< Generic byte counter (1024bytes/K) */ + _DRM_STAT_COUNT, /**< Generic non-byte counter (1000/k) */ + + _DRM_STAT_IRQ, /**< IRQ */ + _DRM_STAT_PRIMARY, /**< Primary DMA bytes */ + _DRM_STAT_SECONDARY, /**< Secondary DMA bytes */ + _DRM_STAT_DMA, /**< DMA */ + _DRM_STAT_SPECIAL, /**< Special DMA (e.g., priority or polled) */ + _DRM_STAT_MISSED /**< Missed DMA opportunity */ + /* Add to the *END* of the list */ +}; + +/* + * DRM_IOCTL_GET_STATS ioctl argument type. + */ +struct drm_stats { + unsigned long count; + struct { + unsigned long value; + enum drm_stat_type type; + } data[15]; +}; + +/* + * Hardware locking flags. + */ +enum drm_lock_flags { + _DRM_LOCK_READY = 0x01, /**< Wait until hardware is ready for DMA */ + _DRM_LOCK_QUIESCENT = 0x02, /**< Wait until hardware quiescent */ + _DRM_LOCK_FLUSH = 0x04, /**< Flush this context's DMA queue first */ + _DRM_LOCK_FLUSH_ALL = 0x08, /**< Flush all DMA queues first */ + /* These *HALT* flags aren't supported yet + -- they will be used to support the + full-screen DGA-like mode. */ + _DRM_HALT_ALL_QUEUES = 0x10, /**< Halt all current and future queues */ + _DRM_HALT_CUR_QUEUES = 0x20 /**< Halt all current queues */ +}; + +/* + * DRM_IOCTL_LOCK, DRM_IOCTL_UNLOCK and DRM_IOCTL_FINISH ioctl argument type. + * + * \sa drmGetLock() and drmUnlock(). + */ +struct drm_lock { + int context; + enum drm_lock_flags flags; +}; + +/* + * DMA flags + * + * \warning + * These values \e must match xf86drm.h. + * + * \sa drm_dma. 
+ */ +enum drm_dma_flags { + /* Flags for DMA buffer dispatch */ + _DRM_DMA_BLOCK = 0x01, /**< + * Block until buffer dispatched. + * + * \note The buffer may not yet have + * been processed by the hardware -- + * getting a hardware lock with the + * hardware quiescent will ensure + * that the buffer has been + * processed. + */ + _DRM_DMA_WHILE_LOCKED = 0x02, /**< Dispatch while lock held */ + _DRM_DMA_PRIORITY = 0x04, /**< High priority dispatch */ + + /* Flags for DMA buffer request */ + _DRM_DMA_WAIT = 0x10, /**< Wait for free buffers */ + _DRM_DMA_SMALLER_OK = 0x20, /**< Smaller-than-requested buffers OK */ + _DRM_DMA_LARGER_OK = 0x40 /**< Larger-than-requested buffers OK */ +}; + +/* + * DRM_IOCTL_ADD_BUFS and DRM_IOCTL_MARK_BUFS ioctl argument type. + * + * \sa drmAddBufs(). + */ +struct drm_buf_desc { + int count; /**< Number of buffers of this size */ + int size; /**< Size in bytes */ + int low_mark; /**< Low water mark */ + int high_mark; /**< High water mark */ + enum { + _DRM_PAGE_ALIGN = 0x01, /**< Align on page boundaries for DMA */ + _DRM_AGP_BUFFER = 0x02, /**< Buffer is in AGP space */ + _DRM_SG_BUFFER = 0x04, /**< Scatter/gather memory buffer */ + _DRM_FB_BUFFER = 0x08, /**< Buffer is in frame buffer */ + _DRM_PCI_BUFFER_RO = 0x10 /**< Map PCI DMA buffer read-only */ + } flags; + unsigned long agp_start; /**< + * Start address of where the AGP buffers are + * in the AGP aperture + */ +}; + +/* + * DRM_IOCTL_INFO_BUFS ioctl argument type. + */ +struct drm_buf_info { + int count; /**< Entries in list */ + struct drm_buf_desc *list; +}; + +/* + * DRM_IOCTL_FREE_BUFS ioctl argument type. + */ +struct drm_buf_free { + int count; + int *list; +}; + +/* + * Buffer information + * + * \sa drm_buf_map. + */ +struct drm_buf_pub { + int idx; /**< Index into the master buffer list */ + int total; /**< Buffer size */ + int used; /**< Amount of buffer in use (for DMA) */ + void *address; /**< Address of buffer */ +}; + +/* + * DRM_IOCTL_MAP_BUFS ioctl argument type. + */ +struct drm_buf_map { + int count; /**< Length of the buffer list */ +#ifdef __cplusplus + void *virt; +#else + void *virtual; /**< Mmap'd area in user-virtual */ +#endif + struct drm_buf_pub *list; /**< Buffer information */ +}; + +/* + * DRM_IOCTL_DMA ioctl argument type. + * + * Indices here refer to the offset into the buffer list in drm_buf_get. + * + * \sa drmDMA(). + */ +struct drm_dma { + int context; /**< Context handle */ + int send_count; /**< Number of buffers to send */ + int *send_indices; /**< List of handles to buffers */ + int *send_sizes; /**< Lengths of data to send */ + enum drm_dma_flags flags; /**< Flags */ + int request_count; /**< Number of buffers requested */ + int request_size; /**< Desired size for buffers */ + int *request_indices; /**< Buffer information */ + int *request_sizes; + int granted_count; /**< Number of buffers granted */ +}; + +enum drm_ctx_flags { + _DRM_CONTEXT_PRESERVED = 0x01, + _DRM_CONTEXT_2DONLY = 0x02 +}; + +/* + * DRM_IOCTL_ADD_CTX ioctl argument type. + * + * \sa drmCreateContext() and drmDestroyContext(). + */ +struct drm_ctx { + drm_context_t handle; + enum drm_ctx_flags flags; +}; + +/* + * DRM_IOCTL_RES_CTX ioctl argument type. + */ +struct drm_ctx_res { + int count; + struct drm_ctx *contexts; +}; + +/* + * DRM_IOCTL_ADD_DRAW and DRM_IOCTL_RM_DRAW ioctl argument type. + */ +struct drm_draw { + drm_drawable_t handle; +}; + +/* + * DRM_IOCTL_UPDATE_DRAW ioctl argument type. 
+ */ +typedef enum { + DRM_DRAWABLE_CLIPRECTS +} drm_drawable_info_type_t; + +struct drm_update_draw { + drm_drawable_t handle; + unsigned int type; + unsigned int num; + unsigned long long data; +}; + +/* + * DRM_IOCTL_GET_MAGIC and DRM_IOCTL_AUTH_MAGIC ioctl argument type. + */ +struct drm_auth { + drm_magic_t magic; +}; + +/* + * DRM_IOCTL_IRQ_BUSID ioctl argument type. + * + * \sa drmGetInterruptFromBusID(). + */ +struct drm_irq_busid { + int irq; /**< IRQ number */ + int busnum; /**< bus number */ + int devnum; /**< device number */ + int funcnum; /**< function number */ +}; + +enum drm_vblank_seq_type { + _DRM_VBLANK_ABSOLUTE = 0x0, /**< Wait for specific vblank sequence number */ + _DRM_VBLANK_RELATIVE = 0x1, /**< Wait for given number of vblanks */ + /* bits 1-6 are reserved for high crtcs */ + _DRM_VBLANK_HIGH_CRTC_MASK = 0x0000003e, + _DRM_VBLANK_EVENT = 0x4000000, /**< Send event instead of blocking */ + _DRM_VBLANK_FLIP = 0x8000000, /**< Scheduled buffer swap should flip */ + _DRM_VBLANK_NEXTONMISS = 0x10000000, /**< If missed, wait for next vblank */ + _DRM_VBLANK_SECONDARY = 0x20000000, /**< Secondary display controller */ + _DRM_VBLANK_SIGNAL = 0x40000000 /**< Send signal instead of blocking, unsupported */ +}; +#define _DRM_VBLANK_HIGH_CRTC_SHIFT 1 + +#define _DRM_VBLANK_TYPES_MASK (_DRM_VBLANK_ABSOLUTE | _DRM_VBLANK_RELATIVE) +#define _DRM_VBLANK_FLAGS_MASK (_DRM_VBLANK_EVENT | _DRM_VBLANK_SIGNAL | \ + _DRM_VBLANK_SECONDARY | _DRM_VBLANK_NEXTONMISS) + +struct drm_wait_vblank_request { + enum drm_vblank_seq_type type; + unsigned int sequence; + unsigned long signal; +}; + +struct drm_wait_vblank_reply { + enum drm_vblank_seq_type type; + unsigned int sequence; + long tval_sec; + long tval_usec; +}; + +/* + * DRM_IOCTL_WAIT_VBLANK ioctl argument type. + * + * \sa drmWaitVBlank(). + */ +union drm_wait_vblank { + struct drm_wait_vblank_request request; + struct drm_wait_vblank_reply reply; +}; + +#define _DRM_PRE_MODESET 1 +#define _DRM_POST_MODESET 2 + +/* + * DRM_IOCTL_MODESET_CTL ioctl argument type + * + * \sa drmModesetCtl(). + */ +struct drm_modeset_ctl { + __u32 crtc; + __u32 cmd; +}; + +/* + * DRM_IOCTL_AGP_ENABLE ioctl argument type. + * + * \sa drmAgpEnable(). + */ +struct drm_agp_mode { + unsigned long mode; /**< AGP mode */ +}; + +/* + * DRM_IOCTL_AGP_ALLOC and DRM_IOCTL_AGP_FREE ioctls argument type. + * + * \sa drmAgpAlloc() and drmAgpFree(). + */ +struct drm_agp_buffer { + unsigned long size; /**< In bytes -- will round to page boundary */ + unsigned long handle; /**< Used for binding / unbinding */ + unsigned long type; /**< Type of memory to allocate */ + unsigned long physical; /**< Physical used by i810 */ +}; + +/* + * DRM_IOCTL_AGP_BIND and DRM_IOCTL_AGP_UNBIND ioctls argument type. + * + * \sa drmAgpBind() and drmAgpUnbind(). + */ +struct drm_agp_binding { + unsigned long handle; /**< From drm_agp_buffer */ + unsigned long offset; /**< In bytes -- will round to page boundary */ +}; + +/* + * DRM_IOCTL_AGP_INFO ioctl argument type. + * + * \sa drmAgpVersionMajor(), drmAgpVersionMinor(), drmAgpGetMode(), + * drmAgpBase(), drmAgpSize(), drmAgpMemoryUsed(), drmAgpMemoryAvail(), + * drmAgpVendorId() and drmAgpDeviceId(). 
+ */ +struct drm_agp_info { + int agp_version_major; + int agp_version_minor; + unsigned long mode; + unsigned long aperture_base; /* physical address */ + unsigned long aperture_size; /* bytes */ + unsigned long memory_allowed; /* bytes */ + unsigned long memory_used; + + /* PCI information */ + unsigned short id_vendor; + unsigned short id_device; +}; + +/* + * DRM_IOCTL_SG_ALLOC ioctl argument type. + */ +struct drm_scatter_gather { + unsigned long size; /**< In bytes -- will round to page boundary */ + unsigned long handle; /**< Used for mapping / unmapping */ +}; + +/* + * DRM_IOCTL_SET_VERSION ioctl argument type. + */ +struct drm_set_version { + int drm_di_major; + int drm_di_minor; + int drm_dd_major; + int drm_dd_minor; +}; + +/* DRM_IOCTL_GEM_CLOSE ioctl argument type */ +struct drm_gem_close { + /** Handle of the object to be closed. */ + __u32 handle; + __u32 pad; +}; + +/* DRM_IOCTL_GEM_FLINK ioctl argument type */ +struct drm_gem_flink { + /** Handle for the object being named */ + __u32 handle; + + /** Returned global name */ + __u32 name; +}; + +/* DRM_IOCTL_GEM_OPEN ioctl argument type */ +struct drm_gem_open { + /** Name of object being opened */ + __u32 name; + + /** Returned handle for the object */ + __u32 handle; + + /** Returned size of the object */ + __u64 size; +}; + +/** + * DRM_CAP_DUMB_BUFFER + * + * If set to 1, the driver supports creating dumb buffers via the + * &DRM_IOCTL_MODE_CREATE_DUMB ioctl. + */ +#define DRM_CAP_DUMB_BUFFER 0x1 +/** + * DRM_CAP_VBLANK_HIGH_CRTC + * + * If set to 1, the kernel supports specifying a :ref:`CRTC index` + * in the high bits of &drm_wait_vblank_request.type. + * + * Starting kernel version 2.6.39, this capability is always set to 1. + */ +#define DRM_CAP_VBLANK_HIGH_CRTC 0x2 +/** + * DRM_CAP_DUMB_PREFERRED_DEPTH + * + * The preferred bit depth for dumb buffers. + * + * The bit depth is the number of bits used to indicate the color of a single + * pixel excluding any padding. This is different from the number of bits per + * pixel. For instance, XRGB8888 has a bit depth of 24 but has 32 bits per + * pixel. + * + * Note that this preference only applies to dumb buffers, it's irrelevant for + * other types of buffers. + */ +#define DRM_CAP_DUMB_PREFERRED_DEPTH 0x3 +/** + * DRM_CAP_DUMB_PREFER_SHADOW + * + * If set to 1, the driver prefers userspace to render to a shadow buffer + * instead of directly rendering to a dumb buffer. For best speed, userspace + * should do streaming ordered memory copies into the dumb buffer and never + * read from it. + * + * Note that this preference only applies to dumb buffers, it's irrelevant for + * other types of buffers. + */ +#define DRM_CAP_DUMB_PREFER_SHADOW 0x4 +/** + * DRM_CAP_PRIME + * + * Bitfield of supported PRIME sharing capabilities. See &DRM_PRIME_CAP_IMPORT + * and &DRM_PRIME_CAP_EXPORT. + * + * Starting from kernel version 6.6, both &DRM_PRIME_CAP_IMPORT and + * &DRM_PRIME_CAP_EXPORT are always advertised. + * + * PRIME buffers are exposed as dma-buf file descriptors. + * See :ref:`prime_buffer_sharing`. + */ +#define DRM_CAP_PRIME 0x5 +/** + * DRM_PRIME_CAP_IMPORT + * + * If this bit is set in &DRM_CAP_PRIME, the driver supports importing PRIME + * buffers via the &DRM_IOCTL_PRIME_FD_TO_HANDLE ioctl. + * + * Starting from kernel version 6.6, this bit is always set in &DRM_CAP_PRIME. 
+ */ +#define DRM_PRIME_CAP_IMPORT 0x1 +/** + * DRM_PRIME_CAP_EXPORT + * + * If this bit is set in &DRM_CAP_PRIME, the driver supports exporting PRIME + * buffers via the &DRM_IOCTL_PRIME_HANDLE_TO_FD ioctl. + * + * Starting from kernel version 6.6, this bit is always set in &DRM_CAP_PRIME. + */ +#define DRM_PRIME_CAP_EXPORT 0x2 +/** + * DRM_CAP_TIMESTAMP_MONOTONIC + * + * If set to 0, the kernel will report timestamps with ``CLOCK_REALTIME`` in + * struct drm_event_vblank. If set to 1, the kernel will report timestamps with + * ``CLOCK_MONOTONIC``. See ``clock_gettime(2)`` for the definition of these + * clocks. + * + * Starting from kernel version 2.6.39, the default value for this capability + * is 1. Starting kernel version 4.15, this capability is always set to 1. + */ +#define DRM_CAP_TIMESTAMP_MONOTONIC 0x6 +/** + * DRM_CAP_ASYNC_PAGE_FLIP + * + * If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC for legacy + * page-flips. + */ +#define DRM_CAP_ASYNC_PAGE_FLIP 0x7 +/** + * DRM_CAP_CURSOR_WIDTH + * + * The ``CURSOR_WIDTH`` and ``CURSOR_HEIGHT`` capabilities return a valid + * width x height combination for the hardware cursor. The intention is that a + * hardware agnostic userspace can query a cursor plane size to use. + * + * Note that the cross-driver contract is to merely return a valid size; + * drivers are free to attach another meaning on top, eg. i915 returns the + * maximum plane size. + */ +#define DRM_CAP_CURSOR_WIDTH 0x8 +/** + * DRM_CAP_CURSOR_HEIGHT + * + * See &DRM_CAP_CURSOR_WIDTH. + */ +#define DRM_CAP_CURSOR_HEIGHT 0x9 +/** + * DRM_CAP_ADDFB2_MODIFIERS + * + * If set to 1, the driver supports supplying modifiers in the + * &DRM_IOCTL_MODE_ADDFB2 ioctl. + */ +#define DRM_CAP_ADDFB2_MODIFIERS 0x10 +/** + * DRM_CAP_PAGE_FLIP_TARGET + * + * If set to 1, the driver supports the &DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE and + * &DRM_MODE_PAGE_FLIP_TARGET_RELATIVE flags in + * &drm_mode_crtc_page_flip_target.flags for the &DRM_IOCTL_MODE_PAGE_FLIP + * ioctl. + */ +#define DRM_CAP_PAGE_FLIP_TARGET 0x11 +/** + * DRM_CAP_CRTC_IN_VBLANK_EVENT + * + * If set to 1, the kernel supports reporting the CRTC ID in + * &drm_event_vblank.crtc_id for the &DRM_EVENT_VBLANK and + * &DRM_EVENT_FLIP_COMPLETE events. + * + * Starting kernel version 4.12, this capability is always set to 1. + */ +#define DRM_CAP_CRTC_IN_VBLANK_EVENT 0x12 +/** + * DRM_CAP_SYNCOBJ + * + * If set to 1, the driver supports sync objects. See :ref:`drm_sync_objects`. + */ +#define DRM_CAP_SYNCOBJ 0x13 +/** + * DRM_CAP_SYNCOBJ_TIMELINE + * + * If set to 1, the driver supports timeline operations on sync objects. See + * :ref:`drm_sync_objects`. + */ +#define DRM_CAP_SYNCOBJ_TIMELINE 0x14 +/** + * DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP + * + * If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC for atomic + * commits. + */ +#define DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP 0x15 + +/* DRM_IOCTL_GET_CAP ioctl argument type */ +struct drm_get_cap { + __u64 capability; + __u64 value; +}; + +/** + * DRM_CLIENT_CAP_STEREO_3D + * + * If set to 1, the DRM core will expose the stereo 3D capabilities of the + * monitor by advertising the supported 3D layouts in the flags of struct + * drm_mode_modeinfo. See ``DRM_MODE_FLAG_3D_*``. + * + * This capability is always supported for all drivers starting from kernel + * version 3.13. + */ +#define DRM_CLIENT_CAP_STEREO_3D 1 + +/** + * DRM_CLIENT_CAP_UNIVERSAL_PLANES + * + * If set to 1, the DRM core will expose all planes (overlay, primary, and + * cursor) to userspace. 
+ * + * This capability has been introduced in kernel version 3.15. Starting from + * kernel version 3.17, this capability is always supported for all drivers. + */ +#define DRM_CLIENT_CAP_UNIVERSAL_PLANES 2 + +/** + * DRM_CLIENT_CAP_ATOMIC + * + * If set to 1, the DRM core will expose atomic properties to userspace. This + * implicitly enables &DRM_CLIENT_CAP_UNIVERSAL_PLANES and + * &DRM_CLIENT_CAP_ASPECT_RATIO. + * + * If the driver doesn't support atomic mode-setting, enabling this capability + * will fail with -EOPNOTSUPP. + * + * This capability has been introduced in kernel version 4.0. Starting from + * kernel version 4.2, this capability is always supported for atomic-capable + * drivers. + */ +#define DRM_CLIENT_CAP_ATOMIC 3 + +/** + * DRM_CLIENT_CAP_ASPECT_RATIO + * + * If set to 1, the DRM core will provide aspect ratio information in modes. + * See ``DRM_MODE_FLAG_PIC_AR_*``. + * + * This capability is always supported for all drivers starting from kernel + * version 4.18. + */ +#define DRM_CLIENT_CAP_ASPECT_RATIO 4 + +/** + * DRM_CLIENT_CAP_WRITEBACK_CONNECTORS + * + * If set to 1, the DRM core will expose special connectors to be used for + * writing back to memory the scene setup in the commit. The client must enable + * &DRM_CLIENT_CAP_ATOMIC first. + * + * This capability is always supported for atomic-capable drivers starting from + * kernel version 4.19. + */ +#define DRM_CLIENT_CAP_WRITEBACK_CONNECTORS 5 + +/** + * DRM_CLIENT_CAP_CURSOR_PLANE_HOTSPOT + * + * Drivers for para-virtualized hardware (e.g. vmwgfx, qxl, virtio and + * virtualbox) have additional restrictions for cursor planes (thus + * making cursor planes on those drivers not truly universal,) e.g. + * they need cursor planes to act like one would expect from a mouse + * cursor and have correctly set hotspot properties. + * If this client cap is not set the DRM core will hide cursor plane on + * those virtualized drivers because not setting it implies that the + * client is not capable of dealing with those extra restictions. + * Clients which do set cursor hotspot and treat the cursor plane + * like a mouse cursor should set this property. + * The client must enable &DRM_CLIENT_CAP_ATOMIC first. + * + * Setting this property on drivers which do not special case + * cursor planes (i.e. non-virtualized drivers) will return + * EOPNOTSUPP, which can be used by userspace to gauge + * requirements of the hardware/drivers they're running on. + * + * This capability is always supported for atomic-capable virtualized + * drivers starting from kernel version 6.6. + */ +#define DRM_CLIENT_CAP_CURSOR_PLANE_HOTSPOT 6 + +/* DRM_IOCTL_SET_CLIENT_CAP ioctl argument type */ +struct drm_set_client_cap { + __u64 capability; + __u64 value; +}; + +#define DRM_RDWR O_RDWR +#define DRM_CLOEXEC O_CLOEXEC +struct drm_prime_handle { + __u32 handle; + + /** Flags.. 
only applicable for handle->fd */ + __u32 flags; + + /** Returned dmabuf file descriptor */ + __s32 fd; +}; + +struct drm_syncobj_create { + __u32 handle; +#define DRM_SYNCOBJ_CREATE_SIGNALED (1 << 0) + __u32 flags; +}; + +struct drm_syncobj_destroy { + __u32 handle; + __u32 pad; +}; + +#define DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_IMPORT_SYNC_FILE (1 << 0) +#define DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE (1 << 0) +struct drm_syncobj_handle { + __u32 handle; + __u32 flags; + + __s32 fd; + __u32 pad; +}; + +struct drm_syncobj_transfer { + __u32 src_handle; + __u32 dst_handle; + __u64 src_point; + __u64 dst_point; + __u32 flags; + __u32 pad; +}; + +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL (1 << 0) +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT (1 << 1) +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE (1 << 2) /* wait for time point to become available */ +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE (1 << 3) /* set fence deadline to deadline_nsec */ +struct drm_syncobj_wait { + __u64 handles; + /* absolute timeout */ + __s64 timeout_nsec; + __u32 count_handles; + __u32 flags; + __u32 first_signaled; /* only valid when not waiting all */ + __u32 pad; + /** + * @deadline_nsec - fence deadline hint + * + * Deadline hint, in absolute CLOCK_MONOTONIC, to set on backing + * fence(s) if the DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE flag is + * set. + */ + __u64 deadline_nsec; +}; + +struct drm_syncobj_timeline_wait { + __u64 handles; + /* wait on specific timeline point for every handles*/ + __u64 points; + /* absolute timeout */ + __s64 timeout_nsec; + __u32 count_handles; + __u32 flags; + __u32 first_signaled; /* only valid when not waiting all */ + __u32 pad; + /** + * @deadline_nsec - fence deadline hint + * + * Deadline hint, in absolute CLOCK_MONOTONIC, to set on backing + * fence(s) if the DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE flag is + * set. + */ + __u64 deadline_nsec; +}; + +/** + * struct drm_syncobj_eventfd + * @handle: syncobj handle. + * @flags: Zero to wait for the point to be signalled, or + * &DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE to wait for a fence to be + * available for the point. + * @point: syncobj timeline point (set to zero for binary syncobjs). + * @fd: Existing eventfd to sent events to. + * @pad: Must be zero. + * + * Register an eventfd to be signalled by a syncobj. The eventfd counter will + * be incremented by one. + */ +struct drm_syncobj_eventfd { + __u32 handle; + __u32 flags; + __u64 point; + __s32 fd; + __u32 pad; +}; + + +struct drm_syncobj_array { + __u64 handles; + __u32 count_handles; + __u32 pad; +}; + +#define DRM_SYNCOBJ_QUERY_FLAGS_LAST_SUBMITTED (1 << 0) /* last available point on timeline syncobj */ +struct drm_syncobj_timeline_array { + __u64 handles; + __u64 points; + __u32 count_handles; + __u32 flags; +}; + + +/* Query current scanout sequence number */ +struct drm_crtc_get_sequence { + __u32 crtc_id; /* requested crtc_id */ + __u32 active; /* return: crtc output is active */ + __u64 sequence; /* return: most recent vblank sequence */ + __s64 sequence_ns; /* return: most recent time of first pixel out */ +}; + +/* Queue event to be delivered at specified sequence. 
Time stamp marks + * when the first pixel of the refresh cycle leaves the display engine + * for the display + */ +#define DRM_CRTC_SEQUENCE_RELATIVE 0x00000001 /* sequence is relative to current */ +#define DRM_CRTC_SEQUENCE_NEXT_ON_MISS 0x00000002 /* Use next sequence if we've missed */ + +struct drm_crtc_queue_sequence { + __u32 crtc_id; + __u32 flags; + __u64 sequence; /* on input, target sequence. on output, actual sequence */ + __u64 user_data; /* user data passed to event */ +}; + +#if defined(__cplusplus) +} +#endif + +#include "drm_mode.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +#define DRM_IOCTL_BASE 'd' +#define DRM_IO(nr) _IO(DRM_IOCTL_BASE,nr) +#define DRM_IOR(nr,type) _IOR(DRM_IOCTL_BASE,nr,type) +#define DRM_IOW(nr,type) _IOW(DRM_IOCTL_BASE,nr,type) +#define DRM_IOWR(nr,type) _IOWR(DRM_IOCTL_BASE,nr,type) + +#define DRM_IOCTL_VERSION DRM_IOWR(0x00, struct drm_version) +#define DRM_IOCTL_GET_UNIQUE DRM_IOWR(0x01, struct drm_unique) +#define DRM_IOCTL_GET_MAGIC DRM_IOR( 0x02, struct drm_auth) +#define DRM_IOCTL_IRQ_BUSID DRM_IOWR(0x03, struct drm_irq_busid) +#define DRM_IOCTL_GET_MAP DRM_IOWR(0x04, struct drm_map) +#define DRM_IOCTL_GET_CLIENT DRM_IOWR(0x05, struct drm_client) +#define DRM_IOCTL_GET_STATS DRM_IOR( 0x06, struct drm_stats) +#define DRM_IOCTL_SET_VERSION DRM_IOWR(0x07, struct drm_set_version) +#define DRM_IOCTL_MODESET_CTL DRM_IOW(0x08, struct drm_modeset_ctl) +/** + * DRM_IOCTL_GEM_CLOSE - Close a GEM handle. + * + * GEM handles are not reference-counted by the kernel. User-space is + * responsible for managing their lifetime. For example, if user-space imports + * the same memory object twice on the same DRM file description, the same GEM + * handle is returned by both imports, and user-space needs to ensure + * &DRM_IOCTL_GEM_CLOSE is performed once only. The same situation can happen + * when a memory object is allocated, then exported and imported again on the + * same DRM file description. The &DRM_IOCTL_MODE_GETFB2 IOCTL is an exception + * and always returns fresh new GEM handles even if an existing GEM handle + * already refers to the same memory object before the IOCTL is performed. 
+ */ +#define DRM_IOCTL_GEM_CLOSE DRM_IOW (0x09, struct drm_gem_close) +#define DRM_IOCTL_GEM_FLINK DRM_IOWR(0x0a, struct drm_gem_flink) +#define DRM_IOCTL_GEM_OPEN DRM_IOWR(0x0b, struct drm_gem_open) +#define DRM_IOCTL_GET_CAP DRM_IOWR(0x0c, struct drm_get_cap) +#define DRM_IOCTL_SET_CLIENT_CAP DRM_IOW( 0x0d, struct drm_set_client_cap) + +#define DRM_IOCTL_SET_UNIQUE DRM_IOW( 0x10, struct drm_unique) +#define DRM_IOCTL_AUTH_MAGIC DRM_IOW( 0x11, struct drm_auth) +#define DRM_IOCTL_BLOCK DRM_IOWR(0x12, struct drm_block) +#define DRM_IOCTL_UNBLOCK DRM_IOWR(0x13, struct drm_block) +#define DRM_IOCTL_CONTROL DRM_IOW( 0x14, struct drm_control) +#define DRM_IOCTL_ADD_MAP DRM_IOWR(0x15, struct drm_map) +#define DRM_IOCTL_ADD_BUFS DRM_IOWR(0x16, struct drm_buf_desc) +#define DRM_IOCTL_MARK_BUFS DRM_IOW( 0x17, struct drm_buf_desc) +#define DRM_IOCTL_INFO_BUFS DRM_IOWR(0x18, struct drm_buf_info) +#define DRM_IOCTL_MAP_BUFS DRM_IOWR(0x19, struct drm_buf_map) +#define DRM_IOCTL_FREE_BUFS DRM_IOW( 0x1a, struct drm_buf_free) + +#define DRM_IOCTL_RM_MAP DRM_IOW( 0x1b, struct drm_map) + +#define DRM_IOCTL_SET_SAREA_CTX DRM_IOW( 0x1c, struct drm_ctx_priv_map) +#define DRM_IOCTL_GET_SAREA_CTX DRM_IOWR(0x1d, struct drm_ctx_priv_map) + +#define DRM_IOCTL_SET_MASTER DRM_IO(0x1e) +#define DRM_IOCTL_DROP_MASTER DRM_IO(0x1f) + +#define DRM_IOCTL_ADD_CTX DRM_IOWR(0x20, struct drm_ctx) +#define DRM_IOCTL_RM_CTX DRM_IOWR(0x21, struct drm_ctx) +#define DRM_IOCTL_MOD_CTX DRM_IOW( 0x22, struct drm_ctx) +#define DRM_IOCTL_GET_CTX DRM_IOWR(0x23, struct drm_ctx) +#define DRM_IOCTL_SWITCH_CTX DRM_IOW( 0x24, struct drm_ctx) +#define DRM_IOCTL_NEW_CTX DRM_IOW( 0x25, struct drm_ctx) +#define DRM_IOCTL_RES_CTX DRM_IOWR(0x26, struct drm_ctx_res) +#define DRM_IOCTL_ADD_DRAW DRM_IOWR(0x27, struct drm_draw) +#define DRM_IOCTL_RM_DRAW DRM_IOWR(0x28, struct drm_draw) +#define DRM_IOCTL_DMA DRM_IOWR(0x29, struct drm_dma) +#define DRM_IOCTL_LOCK DRM_IOW( 0x2a, struct drm_lock) +#define DRM_IOCTL_UNLOCK DRM_IOW( 0x2b, struct drm_lock) +#define DRM_IOCTL_FINISH DRM_IOW( 0x2c, struct drm_lock) + +/** + * DRM_IOCTL_PRIME_HANDLE_TO_FD - Convert a GEM handle to a DMA-BUF FD. + * + * User-space sets &drm_prime_handle.handle with the GEM handle to export and + * &drm_prime_handle.flags, and gets back a DMA-BUF file descriptor in + * &drm_prime_handle.fd. + * + * The export can fail for any driver-specific reason, e.g. because export is + * not supported for this specific GEM handle (but might be for others). + * + * Support for exporting DMA-BUFs is advertised via &DRM_PRIME_CAP_EXPORT. + */ +#define DRM_IOCTL_PRIME_HANDLE_TO_FD DRM_IOWR(0x2d, struct drm_prime_handle) +/** + * DRM_IOCTL_PRIME_FD_TO_HANDLE - Convert a DMA-BUF FD to a GEM handle. + * + * User-space sets &drm_prime_handle.fd with a DMA-BUF file descriptor to + * import, and gets back a GEM handle in &drm_prime_handle.handle. + * &drm_prime_handle.flags is unused. + * + * If an existing GEM handle refers to the memory object backing the DMA-BUF, + * that GEM handle is returned. Therefore user-space which needs to handle + * arbitrary DMA-BUFs must have a user-space lookup data structure to manually + * reference-count duplicated GEM handles. For more information see + * &DRM_IOCTL_GEM_CLOSE. + * + * The import can fail for any driver-specific reason, e.g. because import is + * only supported for DMA-BUFs allocated on this DRM device. + * + * Support for importing DMA-BUFs is advertised via &DRM_PRIME_CAP_IMPORT. 
+ */ +#define DRM_IOCTL_PRIME_FD_TO_HANDLE DRM_IOWR(0x2e, struct drm_prime_handle) + +#define DRM_IOCTL_AGP_ACQUIRE DRM_IO( 0x30) +#define DRM_IOCTL_AGP_RELEASE DRM_IO( 0x31) +#define DRM_IOCTL_AGP_ENABLE DRM_IOW( 0x32, struct drm_agp_mode) +#define DRM_IOCTL_AGP_INFO DRM_IOR( 0x33, struct drm_agp_info) +#define DRM_IOCTL_AGP_ALLOC DRM_IOWR(0x34, struct drm_agp_buffer) +#define DRM_IOCTL_AGP_FREE DRM_IOW( 0x35, struct drm_agp_buffer) +#define DRM_IOCTL_AGP_BIND DRM_IOW( 0x36, struct drm_agp_binding) +#define DRM_IOCTL_AGP_UNBIND DRM_IOW( 0x37, struct drm_agp_binding) + +#define DRM_IOCTL_SG_ALLOC DRM_IOWR(0x38, struct drm_scatter_gather) +#define DRM_IOCTL_SG_FREE DRM_IOW( 0x39, struct drm_scatter_gather) + +#define DRM_IOCTL_WAIT_VBLANK DRM_IOWR(0x3a, union drm_wait_vblank) + +#define DRM_IOCTL_CRTC_GET_SEQUENCE DRM_IOWR(0x3b, struct drm_crtc_get_sequence) +#define DRM_IOCTL_CRTC_QUEUE_SEQUENCE DRM_IOWR(0x3c, struct drm_crtc_queue_sequence) + +#define DRM_IOCTL_UPDATE_DRAW DRM_IOW(0x3f, struct drm_update_draw) + +#define DRM_IOCTL_MODE_GETRESOURCES DRM_IOWR(0xA0, struct drm_mode_card_res) +#define DRM_IOCTL_MODE_GETCRTC DRM_IOWR(0xA1, struct drm_mode_crtc) +#define DRM_IOCTL_MODE_SETCRTC DRM_IOWR(0xA2, struct drm_mode_crtc) +#define DRM_IOCTL_MODE_CURSOR DRM_IOWR(0xA3, struct drm_mode_cursor) +#define DRM_IOCTL_MODE_GETGAMMA DRM_IOWR(0xA4, struct drm_mode_crtc_lut) +#define DRM_IOCTL_MODE_SETGAMMA DRM_IOWR(0xA5, struct drm_mode_crtc_lut) +#define DRM_IOCTL_MODE_GETENCODER DRM_IOWR(0xA6, struct drm_mode_get_encoder) +#define DRM_IOCTL_MODE_GETCONNECTOR DRM_IOWR(0xA7, struct drm_mode_get_connector) +#define DRM_IOCTL_MODE_ATTACHMODE DRM_IOWR(0xA8, struct drm_mode_mode_cmd) /* deprecated (never worked) */ +#define DRM_IOCTL_MODE_DETACHMODE DRM_IOWR(0xA9, struct drm_mode_mode_cmd) /* deprecated (never worked) */ + +#define DRM_IOCTL_MODE_GETPROPERTY DRM_IOWR(0xAA, struct drm_mode_get_property) +#define DRM_IOCTL_MODE_SETPROPERTY DRM_IOWR(0xAB, struct drm_mode_connector_set_property) +#define DRM_IOCTL_MODE_GETPROPBLOB DRM_IOWR(0xAC, struct drm_mode_get_blob) +#define DRM_IOCTL_MODE_GETFB DRM_IOWR(0xAD, struct drm_mode_fb_cmd) +#define DRM_IOCTL_MODE_ADDFB DRM_IOWR(0xAE, struct drm_mode_fb_cmd) +/** + * DRM_IOCTL_MODE_RMFB - Remove a framebuffer. + * + * This removes a framebuffer previously added via ADDFB/ADDFB2. The IOCTL + * argument is a framebuffer object ID. + * + * Warning: removing a framebuffer currently in-use on an enabled plane will + * disable that plane. The CRTC the plane is linked to may also be disabled + * (depending on driver capabilities). + */ +#define DRM_IOCTL_MODE_RMFB DRM_IOWR(0xAF, unsigned int) +#define DRM_IOCTL_MODE_PAGE_FLIP DRM_IOWR(0xB0, struct drm_mode_crtc_page_flip) +#define DRM_IOCTL_MODE_DIRTYFB DRM_IOWR(0xB1, struct drm_mode_fb_dirty_cmd) + +/** + * DRM_IOCTL_MODE_CREATE_DUMB - Create a new dumb buffer object. + * + * KMS dumb buffers provide a very primitive way to allocate a buffer object + * suitable for scanout and map it for software rendering. KMS dumb buffers are + * not suitable for hardware-accelerated rendering nor video decoding. KMS dumb + * buffers are not suitable to be displayed on any other device than the KMS + * device where they were allocated from. Also see + * :ref:`kms_dumb_buffer_objects`. + * + * The IOCTL argument is a struct drm_mode_create_dumb. + * + * User-space is expected to create a KMS dumb buffer via this IOCTL, then add + * it as a KMS framebuffer via &DRM_IOCTL_MODE_ADDFB and map it via + * &DRM_IOCTL_MODE_MAP_DUMB. 
+ * + * &DRM_CAP_DUMB_BUFFER indicates whether this IOCTL is supported. + * &DRM_CAP_DUMB_PREFERRED_DEPTH and &DRM_CAP_DUMB_PREFER_SHADOW indicate + * driver preferences for dumb buffers. + */ +#define DRM_IOCTL_MODE_CREATE_DUMB DRM_IOWR(0xB2, struct drm_mode_create_dumb) +#define DRM_IOCTL_MODE_MAP_DUMB DRM_IOWR(0xB3, struct drm_mode_map_dumb) +#define DRM_IOCTL_MODE_DESTROY_DUMB DRM_IOWR(0xB4, struct drm_mode_destroy_dumb) +#define DRM_IOCTL_MODE_GETPLANERESOURCES DRM_IOWR(0xB5, struct drm_mode_get_plane_res) +#define DRM_IOCTL_MODE_GETPLANE DRM_IOWR(0xB6, struct drm_mode_get_plane) +#define DRM_IOCTL_MODE_SETPLANE DRM_IOWR(0xB7, struct drm_mode_set_plane) +#define DRM_IOCTL_MODE_ADDFB2 DRM_IOWR(0xB8, struct drm_mode_fb_cmd2) +#define DRM_IOCTL_MODE_OBJ_GETPROPERTIES DRM_IOWR(0xB9, struct drm_mode_obj_get_properties) +#define DRM_IOCTL_MODE_OBJ_SETPROPERTY DRM_IOWR(0xBA, struct drm_mode_obj_set_property) +#define DRM_IOCTL_MODE_CURSOR2 DRM_IOWR(0xBB, struct drm_mode_cursor2) +#define DRM_IOCTL_MODE_ATOMIC DRM_IOWR(0xBC, struct drm_mode_atomic) +#define DRM_IOCTL_MODE_CREATEPROPBLOB DRM_IOWR(0xBD, struct drm_mode_create_blob) +#define DRM_IOCTL_MODE_DESTROYPROPBLOB DRM_IOWR(0xBE, struct drm_mode_destroy_blob) + +#define DRM_IOCTL_SYNCOBJ_CREATE DRM_IOWR(0xBF, struct drm_syncobj_create) +#define DRM_IOCTL_SYNCOBJ_DESTROY DRM_IOWR(0xC0, struct drm_syncobj_destroy) +#define DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD DRM_IOWR(0xC1, struct drm_syncobj_handle) +#define DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE DRM_IOWR(0xC2, struct drm_syncobj_handle) +#define DRM_IOCTL_SYNCOBJ_WAIT DRM_IOWR(0xC3, struct drm_syncobj_wait) +#define DRM_IOCTL_SYNCOBJ_RESET DRM_IOWR(0xC4, struct drm_syncobj_array) +#define DRM_IOCTL_SYNCOBJ_SIGNAL DRM_IOWR(0xC5, struct drm_syncobj_array) + +#define DRM_IOCTL_MODE_CREATE_LEASE DRM_IOWR(0xC6, struct drm_mode_create_lease) +#define DRM_IOCTL_MODE_LIST_LESSEES DRM_IOWR(0xC7, struct drm_mode_list_lessees) +#define DRM_IOCTL_MODE_GET_LEASE DRM_IOWR(0xC8, struct drm_mode_get_lease) +#define DRM_IOCTL_MODE_REVOKE_LEASE DRM_IOWR(0xC9, struct drm_mode_revoke_lease) + +#define DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT DRM_IOWR(0xCA, struct drm_syncobj_timeline_wait) +#define DRM_IOCTL_SYNCOBJ_QUERY DRM_IOWR(0xCB, struct drm_syncobj_timeline_array) +#define DRM_IOCTL_SYNCOBJ_TRANSFER DRM_IOWR(0xCC, struct drm_syncobj_transfer) +#define DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL DRM_IOWR(0xCD, struct drm_syncobj_timeline_array) + +/** + * DRM_IOCTL_MODE_GETFB2 - Get framebuffer metadata. + * + * This queries metadata about a framebuffer. User-space fills + * &drm_mode_fb_cmd2.fb_id as the input, and the kernels fills the rest of the + * struct as the output. + * + * If the client is DRM master or has &CAP_SYS_ADMIN, &drm_mode_fb_cmd2.handles + * will be filled with GEM buffer handles. Fresh new GEM handles are always + * returned, even if another GEM handle referring to the same memory object + * already exists on the DRM file description. The caller is responsible for + * removing the new handles, e.g. via the &DRM_IOCTL_GEM_CLOSE IOCTL. The same + * new handle will be returned for multiple planes in case they use the same + * memory object. Planes are valid until one has a zero handle -- this can be + * used to compute the number of planes. + * + * Otherwise, &drm_mode_fb_cmd2.handles will be zeroed and planes are valid + * until one has a zero &drm_mode_fb_cmd2.pitches. 
+ * + * If the framebuffer has a format modifier, &DRM_MODE_FB_MODIFIERS will be set + * in &drm_mode_fb_cmd2.flags and &drm_mode_fb_cmd2.modifier will contain the + * modifier. Otherwise, user-space must ignore &drm_mode_fb_cmd2.modifier. + * + * To obtain DMA-BUF FDs for each plane without leaking GEM handles, user-space + * can export each handle via &DRM_IOCTL_PRIME_HANDLE_TO_FD, then immediately + * close each unique handle via &DRM_IOCTL_GEM_CLOSE, making sure to not + * double-close handles which are specified multiple times in the array. + */ +#define DRM_IOCTL_MODE_GETFB2 DRM_IOWR(0xCE, struct drm_mode_fb_cmd2) + +#define DRM_IOCTL_SYNCOBJ_EVENTFD DRM_IOWR(0xCF, struct drm_syncobj_eventfd) + +/** + * DRM_IOCTL_MODE_CLOSEFB - Close a framebuffer. + * + * This closes a framebuffer previously added via ADDFB/ADDFB2. The IOCTL + * argument is a framebuffer object ID. + * + * This IOCTL is similar to &DRM_IOCTL_MODE_RMFB, except it doesn't disable + * planes and CRTCs. As long as the framebuffer is used by a plane, it's kept + * alive. When the plane no longer uses the framebuffer (because the + * framebuffer is replaced with another one, or the plane is disabled), the + * framebuffer is cleaned up. + * + * This is useful to implement flicker-free transitions between two processes. + * + * Depending on the threat model, user-space may want to ensure that the + * framebuffer doesn't expose any sensitive user information: closed + * framebuffers attached to a plane can be read back by the next DRM master. + */ +#define DRM_IOCTL_MODE_CLOSEFB DRM_IOWR(0xD0, struct drm_mode_closefb) + +/* + * Device specific ioctls should only be in their respective headers + * The device specific ioctl range is from 0x40 to 0x9f. + * Generic IOCTLS restart at 0xA0. + * + * \sa drmCommandNone(), drmCommandRead(), drmCommandWrite(), and + * drmCommandReadWrite(). + */ +#define DRM_COMMAND_BASE 0x40 +#define DRM_COMMAND_END 0xA0 + +/** + * struct drm_event - Header for DRM events + * @type: event type. + * @length: total number of payload bytes (including header). + * + * This struct is a header for events written back to user-space on the DRM FD. + * A read on the DRM FD will always only return complete events: e.g. if the + * read buffer is 100 bytes large and there are two 64 byte events pending, + * only one will be returned. + * + * Event types 0 - 0x7fffffff are generic DRM events, 0x80000000 and + * up are chipset specific. Generic DRM events include &DRM_EVENT_VBLANK, + * &DRM_EVENT_FLIP_COMPLETE and &DRM_EVENT_CRTC_SEQUENCE. + */ +struct drm_event { + __u32 type; + __u32 length; +}; + +/** + * DRM_EVENT_VBLANK - vertical blanking event + * + * This event is sent in response to &DRM_IOCTL_WAIT_VBLANK with the + * &_DRM_VBLANK_EVENT flag set. + * + * The event payload is a struct drm_event_vblank. + */ +#define DRM_EVENT_VBLANK 0x01 +/** + * DRM_EVENT_FLIP_COMPLETE - page-flip completion event + * + * This event is sent in response to an atomic commit or legacy page-flip with + * the &DRM_MODE_PAGE_FLIP_EVENT flag set. + * + * The event payload is a struct drm_event_vblank. + */ +#define DRM_EVENT_FLIP_COMPLETE 0x02 +/** + * DRM_EVENT_CRTC_SEQUENCE - CRTC sequence event + * + * This event is sent in response to &DRM_IOCTL_CRTC_QUEUE_SEQUENCE. + * + * The event payload is a struct drm_event_crtc_sequence. 
+ */ +#define DRM_EVENT_CRTC_SEQUENCE 0x03 + +struct drm_event_vblank { + struct drm_event base; + __u64 user_data; + __u32 tv_sec; + __u32 tv_usec; + __u32 sequence; + __u32 crtc_id; /* 0 on older kernels that do not support this */ +}; + +/* Event delivered at sequence. Time stamp marks when the first pixel + * of the refresh cycle leaves the display engine for the display + */ +struct drm_event_crtc_sequence { + struct drm_event base; + __u64 user_data; + __s64 time_ns; + __u64 sequence; +}; + +/* typedef area */ +typedef struct drm_clip_rect drm_clip_rect_t; +typedef struct drm_drawable_info drm_drawable_info_t; +typedef struct drm_tex_region drm_tex_region_t; +typedef struct drm_hw_lock drm_hw_lock_t; +typedef struct drm_version drm_version_t; +typedef struct drm_unique drm_unique_t; +typedef struct drm_list drm_list_t; +typedef struct drm_block drm_block_t; +typedef struct drm_control drm_control_t; +typedef enum drm_map_type drm_map_type_t; +typedef enum drm_map_flags drm_map_flags_t; +typedef struct drm_ctx_priv_map drm_ctx_priv_map_t; +typedef struct drm_map drm_map_t; +typedef struct drm_client drm_client_t; +typedef enum drm_stat_type drm_stat_type_t; +typedef struct drm_stats drm_stats_t; +typedef enum drm_lock_flags drm_lock_flags_t; +typedef struct drm_lock drm_lock_t; +typedef enum drm_dma_flags drm_dma_flags_t; +typedef struct drm_buf_desc drm_buf_desc_t; +typedef struct drm_buf_info drm_buf_info_t; +typedef struct drm_buf_free drm_buf_free_t; +typedef struct drm_buf_pub drm_buf_pub_t; +typedef struct drm_buf_map drm_buf_map_t; +typedef struct drm_dma drm_dma_t; +typedef union drm_wait_vblank drm_wait_vblank_t; +typedef struct drm_agp_mode drm_agp_mode_t; +typedef enum drm_ctx_flags drm_ctx_flags_t; +typedef struct drm_ctx drm_ctx_t; +typedef struct drm_ctx_res drm_ctx_res_t; +typedef struct drm_draw drm_draw_t; +typedef struct drm_update_draw drm_update_draw_t; +typedef struct drm_auth drm_auth_t; +typedef struct drm_irq_busid drm_irq_busid_t; +typedef enum drm_vblank_seq_type drm_vblank_seq_type_t; + +typedef struct drm_agp_buffer drm_agp_buffer_t; +typedef struct drm_agp_binding drm_agp_binding_t; +typedef struct drm_agp_info drm_agp_info_t; +typedef struct drm_scatter_gather drm_scatter_gather_t; +typedef struct drm_set_version drm_set_version_t; + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/ggml/src/ggml-remotingfrontend/include/drm-uapi/virtgpu_drm.h b/ggml/src/ggml-remotingfrontend/include/drm-uapi/virtgpu_drm.h new file mode 100644 index 0000000000000..9debb320c34be --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/include/drm-uapi/virtgpu_drm.h @@ -0,0 +1,276 @@ +/* + * Copyright 2013 Red Hat + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef VIRTGPU_DRM_H +#define VIRTGPU_DRM_H + +#include "drm.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +/* Please note that modifications to all structs defined here are + * subject to backwards-compatibility constraints. + * + * Do not use pointers, use __u64 instead for 32 bit / 64 bit user/kernel + * compatibility Keep fields aligned to their size + */ + +#define DRM_VIRTGPU_MAP 0x01 +#define DRM_VIRTGPU_EXECBUFFER 0x02 +#define DRM_VIRTGPU_GETPARAM 0x03 +#define DRM_VIRTGPU_RESOURCE_CREATE 0x04 +#define DRM_VIRTGPU_RESOURCE_INFO 0x05 +#define DRM_VIRTGPU_TRANSFER_FROM_HOST 0x06 +#define DRM_VIRTGPU_TRANSFER_TO_HOST 0x07 +#define DRM_VIRTGPU_WAIT 0x08 +#define DRM_VIRTGPU_GET_CAPS 0x09 +#define DRM_VIRTGPU_RESOURCE_CREATE_BLOB 0x0a +#define DRM_VIRTGPU_CONTEXT_INIT 0x0b + +#define VIRTGPU_EXECBUF_FENCE_FD_IN 0x01 +#define VIRTGPU_EXECBUF_FENCE_FD_OUT 0x02 +#define VIRTGPU_EXECBUF_RING_IDX 0x04 +#define VIRTGPU_EXECBUF_FLAGS (\ + VIRTGPU_EXECBUF_FENCE_FD_IN |\ + VIRTGPU_EXECBUF_FENCE_FD_OUT |\ + VIRTGPU_EXECBUF_RING_IDX |\ + 0) + +struct drm_virtgpu_map { + __u64 offset; /* use for mmap system call */ + __u32 handle; + __u32 pad; +}; + +#define VIRTGPU_EXECBUF_SYNCOBJ_RESET 0x01 +#define VIRTGPU_EXECBUF_SYNCOBJ_FLAGS ( \ + VIRTGPU_EXECBUF_SYNCOBJ_RESET | \ + 0) +struct drm_virtgpu_execbuffer_syncobj { + __u32 handle; + __u32 flags; + __u64 point; +}; + +/* fence_fd is modified on success if VIRTGPU_EXECBUF_FENCE_FD_OUT flag is set. */ +struct drm_virtgpu_execbuffer { + __u32 flags; + __u32 size; + __u64 command; /* void* */ + __u64 bo_handles; + __u32 num_bo_handles; + __s32 fence_fd; /* in/out fence fd (see VIRTGPU_EXECBUF_FENCE_FD_IN/OUT) */ + __u32 ring_idx; /* command ring index (see VIRTGPU_EXECBUF_RING_IDX) */ + __u32 syncobj_stride; /* size of @drm_virtgpu_execbuffer_syncobj */ + __u32 num_in_syncobjs; + __u32 num_out_syncobjs; + __u64 in_syncobjs; + __u64 out_syncobjs; +}; + +#define VIRTGPU_PARAM_3D_FEATURES 1 /* do we have 3D features in the hw */ +#define VIRTGPU_PARAM_CAPSET_QUERY_FIX 2 /* do we have the capset fix */ +#define VIRTGPU_PARAM_RESOURCE_BLOB 3 /* DRM_VIRTGPU_RESOURCE_CREATE_BLOB */ +#define VIRTGPU_PARAM_HOST_VISIBLE 4 /* Host blob resources are mappable */ +#define VIRTGPU_PARAM_CROSS_DEVICE 5 /* Cross virtio-device resource sharing */ +#define VIRTGPU_PARAM_CONTEXT_INIT 6 /* DRM_VIRTGPU_CONTEXT_INIT */ +#define VIRTGPU_PARAM_SUPPORTED_CAPSET_IDs 7 /* Bitmask of supported capability set ids */ +#define VIRTGPU_PARAM_EXPLICIT_DEBUG_NAME 8 /* Ability to set debug name from userspace */ + +struct drm_virtgpu_getparam { + __u64 param; + __u64 value; +}; + +/* NO_BO flags? NO resource flag? */ +/* resource flag for y_0_top */ +struct drm_virtgpu_resource_create { + __u32 target; + __u32 format; + __u32 bind; + __u32 width; + __u32 height; + __u32 depth; + __u32 array_size; + __u32 last_level; + __u32 nr_samples; + __u32 flags; + __u32 bo_handle; /* if this is set - recreate a new resource attached to this bo ? 
*/ + __u32 res_handle; /* returned by kernel */ + __u32 size; /* validate transfer in the host */ + __u32 stride; /* validate transfer in the host */ +}; + +struct drm_virtgpu_resource_info { + __u32 bo_handle; + __u32 res_handle; + __u32 size; + __u32 blob_mem; +}; + +struct drm_virtgpu_3d_box { + __u32 x; + __u32 y; + __u32 z; + __u32 w; + __u32 h; + __u32 d; +}; + +struct drm_virtgpu_3d_transfer_to_host { + __u32 bo_handle; + struct drm_virtgpu_3d_box box; + __u32 level; + __u32 offset; + __u32 stride; + __u32 layer_stride; +}; + +struct drm_virtgpu_3d_transfer_from_host { + __u32 bo_handle; + struct drm_virtgpu_3d_box box; + __u32 level; + __u32 offset; + __u32 stride; + __u32 layer_stride; +}; + +#define VIRTGPU_WAIT_NOWAIT 1 /* like it */ +struct drm_virtgpu_3d_wait { + __u32 handle; /* 0 is an invalid handle */ + __u32 flags; +}; + +#define VIRTGPU_DRM_CAPSET_VIRGL 1 +#define VIRTGPU_DRM_CAPSET_VIRGL2 2 +#define VIRTGPU_DRM_CAPSET_GFXSTREAM_VULKAN 3 +#define VIRTGPU_DRM_CAPSET_VENUS 4 +#define VIRTGPU_DRM_CAPSET_CROSS_DOMAIN 5 +#define VIRTGPU_DRM_CAPSET_DRM 6 +struct drm_virtgpu_get_caps { + __u32 cap_set_id; + __u32 cap_set_ver; + __u64 addr; + __u32 size; + __u32 pad; +}; + +struct drm_virtgpu_resource_create_blob { +#define VIRTGPU_BLOB_MEM_GUEST 0x0001 +#define VIRTGPU_BLOB_MEM_HOST3D 0x0002 +#define VIRTGPU_BLOB_MEM_HOST3D_GUEST 0x0003 + +#define VIRTGPU_BLOB_FLAG_USE_MAPPABLE 0x0001 +#define VIRTGPU_BLOB_FLAG_USE_SHAREABLE 0x0002 +#define VIRTGPU_BLOB_FLAG_USE_CROSS_DEVICE 0x0004 + /* zero is invalid blob_mem */ + __u32 blob_mem; + __u32 blob_flags; + __u32 bo_handle; + __u32 res_handle; + __u64 size; + + /* + * for 3D contexts with VIRTGPU_BLOB_MEM_HOST3D_GUEST and + * VIRTGPU_BLOB_MEM_HOST3D otherwise, must be zero. + */ + __u32 pad; + __u32 cmd_size; + __u64 cmd; + __u64 blob_id; +}; + +#define VIRTGPU_CONTEXT_PARAM_CAPSET_ID 0x0001 +#define VIRTGPU_CONTEXT_PARAM_NUM_RINGS 0x0002 +#define VIRTGPU_CONTEXT_PARAM_POLL_RINGS_MASK 0x0003 +#define VIRTGPU_CONTEXT_PARAM_DEBUG_NAME 0x0004 +struct drm_virtgpu_context_set_param { + __u64 param; + __u64 value; +}; + +struct drm_virtgpu_context_init { + __u32 num_params; + __u32 pad; + + /* pointer to drm_virtgpu_context_set_param array */ + __u64 ctx_set_params; +}; + +/* + * Event code that's given when VIRTGPU_CONTEXT_PARAM_POLL_RINGS_MASK is in + * effect. The event size is sizeof(drm_event), since there is no additional + * payload. 
+ */ +#define VIRTGPU_EVENT_FENCE_SIGNALED 0x90000000 + +#define DRM_IOCTL_VIRTGPU_MAP \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_MAP, struct drm_virtgpu_map) + +#define DRM_IOCTL_VIRTGPU_EXECBUFFER \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_EXECBUFFER,\ + struct drm_virtgpu_execbuffer) + +#define DRM_IOCTL_VIRTGPU_GETPARAM \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_GETPARAM,\ + struct drm_virtgpu_getparam) + +#define DRM_IOCTL_VIRTGPU_RESOURCE_CREATE \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_RESOURCE_CREATE, \ + struct drm_virtgpu_resource_create) + +#define DRM_IOCTL_VIRTGPU_RESOURCE_INFO \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_RESOURCE_INFO, \ + struct drm_virtgpu_resource_info) + +#define DRM_IOCTL_VIRTGPU_TRANSFER_FROM_HOST \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_TRANSFER_FROM_HOST, \ + struct drm_virtgpu_3d_transfer_from_host) + +#define DRM_IOCTL_VIRTGPU_TRANSFER_TO_HOST \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_TRANSFER_TO_HOST, \ + struct drm_virtgpu_3d_transfer_to_host) + +#define DRM_IOCTL_VIRTGPU_WAIT \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_WAIT, \ + struct drm_virtgpu_3d_wait) + +#define DRM_IOCTL_VIRTGPU_GET_CAPS \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_GET_CAPS, \ + struct drm_virtgpu_get_caps) + +#define DRM_IOCTL_VIRTGPU_RESOURCE_CREATE_BLOB \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_RESOURCE_CREATE_BLOB, \ + struct drm_virtgpu_resource_create_blob) + +#define DRM_IOCTL_VIRTGPU_CONTEXT_INIT \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_CONTEXT_INIT, \ + struct drm_virtgpu_context_init) + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/ggml/src/ggml-remotingfrontend/include/venus_hw.h b/ggml/src/ggml-remotingfrontend/include/venus_hw.h new file mode 100644 index 0000000000000..3ef774b8259d3 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/include/venus_hw.h @@ -0,0 +1,74 @@ +/* + * Copyright 2020 Chromium + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef VENUS_HW_H +#define VENUS_HW_H + +#include + +struct virgl_renderer_capset_venus { + uint32_t wire_format_version; + uint32_t vk_xml_version; + uint32_t vk_ext_command_serialization_spec_version; + uint32_t vk_mesa_venus_protocol_spec_version; + + /* This flag indicates render server config, and will be needed until drm + * virtio-gpu blob mem gets fixed to attach_resource before resource_map. 
+ */
+   uint32_t supports_blob_id_0;
+
+   /* Extension number N, where N is defined by the Vulkan spec, corresponds
+    * to bit [N / 32] & (1 << N % 32). The mask1 below covers the first 1023
+    * Vulkan extensions (numbered from 1 to 1023).
+    *
+    * Bit (mask1[0] & 0x1) is used for backward compatibility purposes. When
+    * that bit is set, the extension mask(s) are valid. Otherwise, all the
+    * extensions are assumed to be supported by the renderer side protocol.
+    */
+   uint32_t vk_extension_mask1[32];
+
+   /* The single-threaded renderer cannot afford potentially blocking calls.
+    * They can also lead to a lost GPU if the wait depends on a following
+    * command. This capset allows such blocking calls to pass through from
+    * the clients, and shifts the responsibility to the client drivers.
+    */
+   uint32_t allow_vk_wait_syncs;
+
+   /* This flag indicates that the renderer supports multiple fencing
+    * timelines. The client driver is expected to associate each VkQueue with
+    * one of these timelines at queue creation by binding it with an unused
+    * ring_idx. Queues created without a ring_idx binding are associated with
+    * a shared legacy timeline. The special ring_idx==0 is reserved for CPU
+    * fences that are signaled by the renderer immediately upon consumption of
+    * the associated renderer submission.
+    */
+   uint32_t supports_multiple_timelines;
+
+   /* This flag indicates to the guest that the hypervisor does not support
+    * memory page injection, so blob allocations must be done by the guest
+    * from the dedicated heap (host-visible memory).
+    */
+   uint32_t use_guest_vram;
+};
+
+#endif /* VENUS_HW_H */
diff --git a/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp b/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp
new file mode 100644
index 0000000000000..7ce0dbb7fbc67
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp
@@ -0,0 +1,87 @@
+#include <cstdio>
+#include <cstring>
+#include <unordered_set>
+#include <vector>
+
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+#include "../ggml-remotingbackend/shared/venus_cs_ggml-rpc.h"
+
+#include "ggml-remoting.h"
+
+rpc_tensor
+serialize_tensor(const ggml_tensor * tensor) {
+    rpc_tensor result;
+    result.id = reinterpret_cast<uint64_t>(tensor);
+    result.type = tensor->type;
+    if (tensor->buffer) {
+        ggml_backend_buffer_t buffer = tensor->buffer;
+
+        result.buffer = BUFFER_TO_HOST_HANDLE(buffer);
+    } else {
+        result.buffer = 0;
+    }
+    for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) {
+        result.ne[i] = tensor->ne[i];
+        result.nb[i] = tensor->nb[i];
+    }
+    result.op = tensor->op;
+    for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
+        result.op_params[i] = tensor->op_params[i];
+    }
+    result.flags = tensor->flags;
+    for (uint32_t i = 0; i < GGML_MAX_SRC; i++) {
+        result.src[i] = reinterpret_cast<uint64_t>(tensor->src[i]);
+    }
+    result.view_src = reinterpret_cast<uint64_t>(tensor->view_src);
+    result.view_offs = tensor->view_offs;
+    result.data = reinterpret_cast<uint64_t>(tensor->data);
+    if (tensor->data) {
+        if (!tensor->buffer) {
+            FATAL("tensor has data but no buffer :/");
+        }
+        // tensor->data is serialized as an offset from the buffer base address
+        result.data -= reinterpret_cast<uint64_t>(BUFFER_TO_GGML_CONTEXT(tensor->buffer)->base);
+    }
+    snprintf(result.name, GGML_MAX_NAME, "%s", tensor->name);
+    return result;
+}
+
+void
+add_tensor(ggml_tensor * tensor, std::vector<rpc_tensor> & tensors, std::unordered_set<ggml_tensor *> & visited) {
+    if (tensor == nullptr) {
+        return;
+    }
+    if (visited.find(tensor) != visited.end()) {
+        return;
+    }
+    visited.insert(tensor);
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
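+        // Visit the src dependencies (and view_src below) before pushing
+        // this tensor, so `tensors` ends up topologically ordered: every
+        // entry appears after the tensors it references.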
+        add_tensor(tensor->src[i], tensors, visited);
+    }
+    add_tensor(tensor->view_src, tensors, visited);
+    tensors.push_back(serialize_tensor(tensor));
+}
+
+void
+serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & output) {
+    uint32_t n_nodes = cgraph->n_nodes;
+    std::vector<rpc_tensor> tensors;
+    std::unordered_set<ggml_tensor *> visited;
+    for (uint32_t i = 0; i < n_nodes; i++) {
+        add_tensor(cgraph->nodes[i], tensors, visited);
+    }
+    // serialization format:
+    // | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t)) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) |
+    uint32_t n_tensors = (uint32_t) tensors.size();
+    size_t output_size = sizeof(uint32_t) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t) + n_tensors * sizeof(rpc_tensor);
+    output.resize(output_size, 0);
+    memcpy(output.data(), &n_nodes, sizeof(n_nodes));
+    for (uint32_t i = 0; i < n_nodes; i++) {
+        memcpy(output.data() + sizeof(n_nodes) + i * sizeof(uint64_t), &cgraph->nodes[i], sizeof(uint64_t));
+    }
+    uint32_t * out_ntensors = (uint32_t *)(output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t));
+    *out_ntensors = n_tensors;
+    rpc_tensor * out_tensors = (rpc_tensor *)(output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t));
+    memcpy(out_tensors, tensors.data(), n_tensors * sizeof(rpc_tensor));
+}
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp
new file mode 100644
index 0000000000000..82b51838997c6
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp
@@ -0,0 +1,53 @@
+#include "virtgpu-forward-impl.h"
+
+static long long current_time_ns() {
+    struct timespec ts;
+    clock_gettime(CLOCK_REALTIME, &ts); // use CLOCK_MONOTONIC instead for elapsed-time measurements
+    return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
+}
+
+ggml_status
+apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph) {
+    struct vn_cs_encoder *encoder;
+    struct vn_cs_decoder *decoder;
+
+    REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE);
+
+    std::vector<uint8_t> cgraph_data;
+    size_t cgraph_size = vn_serialize_ggml_cgraph(cgraph, cgraph_data);
+
+    struct vn_renderer_shmem *shmem;
+    if (cgraph_size > gpu->data_shmem->mmap_size) {
+        shmem = virtgpu_shmem_create(gpu, cgraph_size);
+        WARNING("%s: 0x%zx | %dkB | %dMB", __func__, cgraph_size, (int)(cgraph_size/1024), (int)(cgraph_size/1024/1024));
+        if (!shmem) {
+            FATAL("Couldn't allocate the guest-host shared buffer :/");
+        }
+    } else {
+        shmem = gpu->data_shmem;
+    }
+
+    //INFO("Send shmem ID %d", shmem->res_id);
+    vn_encode_virtgpu_shmem_res_id(encoder, shmem->res_id);
+    //INFO("Send shmem size %lu", cgraph_size);
+    vn_encode_size_t(encoder, &cgraph_size);
+
+    char *shmem_data = (char *) shmem->mmap_ptr;
+    struct vn_cs_encoder secondary_enc = vn_cs_new_encoder(shmem_data, cgraph_size);
+
+    vn_encode_cgraph_data(&secondary_enc, cgraph_data);
+
+    REMOTE_CALL(gpu, encoder, decoder);
+
+    ggml_status status = GGML_STATUS_ABORTED;
+    vn_decode_ggml_status(decoder, &status);
+    //INFO("Received status %u", status);
+
+    REMOTE_CALL_FINISH(gpu, encoder, decoder);
+
+    if (shmem != gpu->data_shmem) {
+        virtgpu_shmem_destroy(gpu, shmem->shmem);
+    }
+
+    return status;
+}
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp
new file mode 100644
index 0000000000000..e991c0bef324d
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp
@@ -0,0 +1,114 @@
+#include
"virtgpu-forward-impl.h" + +const char * +apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME); + + vn_encode_ggml_buffer_type(encoder, buft); + + REMOTE_CALL(gpu, encoder, decoder); + + const size_t string_size = vn_decode_array_size_unchecked(decoder); + char *string = (char *) vn_cs_decoder_alloc_array(decoder, sizeof(char), string_size); + if (!string) { + FATAL("%s: Could not allocate the device name buffer", __func__); + } + vn_decode_char_array(decoder, string, string_size); + + //INFO("%s: Forward BUFT NAME --> %s", __func__, string); + + /* *** */ + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return string; +} + +size_t +apir_buffer_type_get_alignment(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT); + + vn_encode_ggml_buffer_type(encoder, buft); + + REMOTE_CALL(gpu, encoder, decoder); + + size_t alignment; + vn_decode_size_t(decoder, &alignment); + + INFO("%s: Forward BUFT ALIGNMENT --> %zu ", __func__, alignment); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return alignment; +} + +size_t +apir_buffer_type_get_max_size(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE); + + vn_encode_ggml_buffer_type(encoder, buft); + + REMOTE_CALL(gpu, encoder, decoder); + + size_t max_size; + vn_decode_size_t(decoder, &max_size); + + INFO("%s: Forward BUFT MAX SIZE --> %zu ", __func__, max_size); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return max_size; +} + +bool +apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST); + + vn_encode_ggml_buffer_type(encoder, buft); + + REMOTE_CALL(gpu, encoder, decoder); + + bool is_host; + vn_decode_bool_t(decoder, &is_host); + + INFO("%s: buffer is host? 
%d", __func__, is_host); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return is_host; +} + +apir_buffer_context_t +apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_backend_buffer_type_t buft, size_t size) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + apir_buffer_context_t buffer_context; + INFO("%s: allocate device memory (%lu)", __func__, size); + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER); + + vn_encode_ggml_buffer_type(encoder, buft); + + vn_encode_size_t(encoder, &size); + + REMOTE_CALL(gpu, encoder, decoder); + + vn_decode_apir_buffer_host_handle_t(decoder, &buffer_context.host_handle); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return buffer_context; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp new file mode 100644 index 0000000000000..04041ab5feb37 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp @@ -0,0 +1,145 @@ +#include "virtgpu-forward-impl.h" + +void * +apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_context_t *buffer_context) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_GET_BASE); + + vn_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); + + REMOTE_CALL(gpu, encoder, decoder); + + uintptr_t base; + vn_decode_uintptr_t(decoder, &base); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + //INFO("%s: received base %p\n", __func__, (void *) base); + + return (void *) base; +} + +void +apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, + ggml_tensor *tensor, const void *data, size_t offset, size_t size) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + +#if 0 + INFO("Calling (%p)->set_tensor(tensor=%p, data=%p, offset=%lu, size=%lu", + buffer_context->host_handle, tensor, data, offset, size); +#endif + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_SET_TENSOR); + + vn_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); + vn_encode_ggml_tensor(encoder, tensor); + + struct vn_renderer_shmem *shmem; + if (size > gpu->data_shmem->mmap_size) { + shmem = virtgpu_shmem_create(gpu, size); + //WARNING("%s: 0x%lx | %dkB | %dMB", __func__, size, (int)size/1024, (int)size/1024/1024); + if (!shmem) { + FATAL("Couldn't allocate the guest-host shared buffer :/"); + } + } else { + shmem = gpu->data_shmem; + } + + memcpy(shmem->mmap_ptr, data, size); + vn_encode_virtgpu_shmem_res_id(encoder, shmem->res_id); + + vn_encode_size_t(encoder, &offset); + vn_encode_size_t(encoder, &size); + + REMOTE_CALL(gpu, encoder, decoder); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + if (shmem != gpu->data_shmem) { + virtgpu_shmem_destroy(gpu, shmem->shmem); + } + + return; +} + +#if false +void +apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, + const ggml_tensor *tensor, void *data, size_t offset, size_t size) { + UNUSED(gpu); + UNUSED(tensor); + char *buffer_base_addr = (char *) buffer_context->shmem->mmap_ptr; + + memcpy(data, buffer_base_addr+offset, size); +} +#else +void +apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, + const ggml_tensor *tensor, void *data, size_t offset, size_t size) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_GET_TENSOR); + + 
vn_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); + vn_encode_ggml_tensor(encoder, tensor); + + struct vn_renderer_shmem *shmem; + if (size > gpu->data_shmem->mmap_size) { + shmem = virtgpu_shmem_create(gpu, size); + WARNING("%s: 0x%lx | %dkB | %dMB", __func__, size, (int)size/1024, (int)size/1024/1024); + if (!shmem) { + FATAL("Couldn't allocate the guest-host shared buffer :/"); + } + } else { + shmem = gpu->data_shmem; + } + + vn_encode_virtgpu_shmem_res_id(encoder, shmem->res_id); + vn_encode_size_t(encoder, &offset); + vn_encode_size_t(encoder, &size); + + REMOTE_CALL(gpu, encoder, decoder); + + memcpy(data, shmem->mmap_ptr, size); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + if (shmem != gpu->data_shmem) { + virtgpu_shmem_destroy(gpu, shmem->shmem); + } +} +#endif + +void +apir_buffer_clear(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, + uint8_t value) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_CLEAR); + + vn_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); + vn_encode_uint8_t(encoder, &value); + + REMOTE_CALL(gpu, encoder, decoder); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); +} + + +void +apir_buffer_free_buffer(struct virtgpu *gpu, apir_buffer_context_t *buffer_context) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER); + + vn_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); + + REMOTE_CALL(gpu, encoder, decoder); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp new file mode 100644 index 0000000000000..ca036366a6752 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp @@ -0,0 +1,236 @@ +#include "virtgpu-forward-impl.h" + +int +apir_device_get_count(struct virtgpu *gpu) { + static int32_t dev_count = -1; + if (dev_count != -1) { + CACHED; + return dev_count; + } + + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_COUNT); + REMOTE_CALL(gpu, encoder, decoder); + + vn_decode_int32_t(decoder, &dev_count); + + INFO("%s: Forward DEV COUNT --> %d ", __func__, dev_count); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return dev_count; +} + +const char * +apir_device_get_name(struct virtgpu *gpu) { + static char *string = nullptr; + if (string) { + CACHED; + return string; + } + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_NAME); + REMOTE_CALL(gpu, encoder, decoder); + + const size_t string_size = vn_decode_array_size_unchecked(decoder); + string = (char *) vn_cs_decoder_alloc_array(decoder, sizeof(char), string_size); + if (!string) { + FATAL("%s: Could not allocate the device name buffer", __func__); + } + vn_decode_char_array(decoder, string, string_size); + + INFO("%s: Forward DEV NAME --> %s", __func__, string); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return string; +} + +const char * +apir_device_get_description(struct virtgpu *gpu) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION); + + REMOTE_CALL(gpu, encoder, decoder); + + const size_t string_size = 
vn_decode_array_size_unchecked(decoder); + char *string = (char *) vn_cs_decoder_alloc_array(decoder, sizeof(char), string_size); + if (!string) { + FATAL("%s: Could not allocate the device description buffer", __func__); + } + vn_decode_char_array(decoder, string, string_size); + + INFO("%s: Forward DEV DESCR --> %s", __func__, string); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return string; +} + +uint32_t +apir_device_get_type(struct virtgpu *gpu) { + static uint32_t dev_type = 255; + if (dev_type != 255) { + CACHED; + return dev_type; + } + + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_TYPE); + + REMOTE_CALL(gpu, encoder, decoder); + + vn_decode_uint32_t(decoder, &dev_type); + + INFO("%s: Forward DEV TYPE --> %d ", __func__, dev_type); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return dev_type; +} + +void +apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total) { + static size_t dev_free = 0; + static size_t dev_total = 0; + /* + if (dev_total != 0) { + WARNING("Not sure if llama.cpp expects fresh information for the free memory ..."); + *free = dev_free; + *total = dev_total; + + CACHED; + return; + } + */ + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_MEMORY); + + REMOTE_CALL(gpu, encoder, decoder); + + vn_decode_size_t(decoder, &dev_free); + vn_decode_size_t(decoder, &dev_total); + + *free = dev_free; + *total = dev_total; + + INFO("%s: Forward DEV FREE mem --> %zu MB", __func__, dev_free / 1024 / 1024); + INFO("%s: Forward DEV TOTAL mem --> %zu MB", __func__, dev_total / 1024 / 1024); + + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return; +} + +bool +apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op) { +#if APIR_DEVICE_SUPPORTS_OP_ALWAYS_TRUE + /* ggml-rpc cheats it like this */ + /* with the current implementation of serialize_tensor, the src/view aren't properly passed */ + UNUSED(gpu); + UNUSED(op); + + return true; +#else + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP); + + vn_encode_ggml_tensor_inline(encoder, op); + + REMOTE_CALL(gpu, encoder, decoder); + + bool supports_op; + vn_decode_bool_t(decoder, &supports_op); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return supports_op; +#endif +} + +apir_buffer_type_host_handle_t +apir_device_get_buffer_type(struct virtgpu *gpu) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE); + + REMOTE_CALL(gpu, encoder, decoder); + + apir_buffer_type_host_handle_t buft_handle; + vn_decode_apir_buffer_type_host_handle_t(decoder, &buft_handle); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return buft_handle; +} + +void +apir_device_get_props(struct virtgpu *gpu, + bool *async, + bool *host_buffer, + bool *buffer_from_host_ptr, + bool *events) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_PROPS); + + REMOTE_CALL(gpu, encoder, decoder); + + vn_decode_bool_t(decoder, async); + vn_decode_bool_t(decoder, host_buffer); + vn_decode_bool_t(decoder, buffer_from_host_ptr); + vn_decode_bool_t(decoder, events); + + /* *** */ + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return; +} + +apir_buffer_context_t +apir_device_buffer_from_ptr(struct 
virtgpu *gpu,
+                            size_t size,
+                            size_t max_tensor_size) {
+    struct vn_cs_encoder *encoder;
+    struct vn_cs_decoder *decoder;
+    apir_buffer_context_t buffer_context;
+
+    REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR);
+
+    /* *** */
+
+    buffer_context.shmem = virtgpu_shmem_create(gpu, size);
+    if (!buffer_context.shmem) {
+        FATAL("Couldn't allocate the guest-host shared buffer :/");
+    }
+
+    vn_encode_virtgpu_shmem_res_id(encoder, buffer_context.shmem->res_id);
+
+    vn_encode_size_t(encoder, &size);
+    vn_encode_size_t(encoder, &max_tensor_size);
+
+    REMOTE_CALL(gpu, encoder, decoder);
+
+    vn_decode_apir_buffer_host_handle_t(decoder, &buffer_context.host_handle);
+    buffer_context.buft_host_handle = vn_decode_apir_buffer_type_host_handle(decoder);
+
+    /* *** */
+
+    REMOTE_CALL_FINISH(gpu, encoder, decoder);
+
+    return buffer_context;
+}
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h
new file mode 100644
index 0000000000000..26510b20bc479
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h
@@ -0,0 +1,34 @@
+#include "ggml-backend-impl.h"
+#include "ggml-remoting.h"
+#include "virtgpu.h"
+#include "../ggml-remotingbackend/shared/apir_backend.h"
+#include "../ggml-remotingbackend/shared/venus_cs_ggml.h"
+
+#define CACHED
+// printf("INFO: ### found response in the cache %s\n", __func__)
+
+
+#define REMOTE_CALL_PREPARE(gpu_dev_name, encoder_name, apir_command_type__) \
+  do { \
+    int32_t forward_flag = (int32_t) apir_command_type__; \
+    encoder_name = remote_call_prepare(gpu_dev_name, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); \
+    if (!encoder_name) { \
+      FATAL("%s: failed to prepare the remote call encoder :/", __func__); \
+    } \
+  } while(0)
+
+#define REMOTE_CALL(gpu_dev_name, encoder_name, decoder_name) \
+  do { \
+    decoder_name = remote_call(gpu_dev_name, encoder_name); \
+    if (!decoder_name) { \
+      FATAL("%s: failed to kick the remote call :/", __func__); \
+    } \
+  } while(0)
+
+#define REMOTE_CALL_FINISH(gpu_dev_name, encoder_name, decoder_name) \
+  do { \
+    int32_t ret = remote_call_finish(encoder_name, decoder_name); \
+    if (ret != 0) { \
+      FATAL("%s: failed to forward the API call (code=%d) :/", __func__, ret); \
+    } \
+  } while(0)
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h
new file mode 100644
index 0000000000000..cc159e071e218
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h
@@ -0,0 +1,46 @@
+#include "ggml.h"
+#include "ggml-impl.h"
+#include "ggml-alloc.h"
+
+#include "virtgpu-utils.h"
+
+#include "../ggml-remotingbackend/shared/apir_backend.h"
+
+/* device */
+int apir_device_get_count(struct virtgpu *gpu);
+const char *apir_device_get_name(struct virtgpu *gpu);
+const char *apir_device_get_description(struct virtgpu *gpu);
+uint32_t apir_device_get_type(struct virtgpu *gpu);
+void apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total);
+bool apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op);
+apir_buffer_type_host_handle_t apir_device_get_buffer_type(struct virtgpu *gpu);
+void apir_device_get_props(struct virtgpu *gpu,
+                           bool *async,
+                           bool *host_buffer,
+                           bool *buffer_from_host_ptr,
+                           bool *events);
+apir_buffer_context_t apir_device_buffer_from_ptr(struct virtgpu *gpu,
+                                                  size_t size,
+                                                  size_t max_tensor_size);
+/* buffer-type */
+const char *apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_type_t buft);
+size_t
apir_buffer_type_get_alignment(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); +size_t apir_buffer_type_get_max_size(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); +bool apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); +apir_buffer_context_t apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_backend_buffer_type_t buffer_buft, size_t size); + +/* buffer */ + +void *apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_context_t *buffer_context); +enum ggml_status apir_buffer_init_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, ggml_tensor *tensor); +void apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, + ggml_tensor *tensor, const void *data, size_t offset, size_t size); +void apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, + const ggml_tensor *tensor, void *data, size_t offset, size_t size); +void apir_buffer_clear(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, + uint8_t value); +void apir_buffer_free_buffer(struct virtgpu *gpu, apir_buffer_context_t *buffer_context); + +/* backend */ + +ggml_status apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph); diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp new file mode 100644 index 0000000000000..a09fd22371a8c --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp @@ -0,0 +1,111 @@ +#include + +#include "virtgpu-shm.h" + +static uint32_t +virtgpu_ioctl_resource_create_blob(struct virtgpu *gpu, + uint32_t blob_mem, + uint32_t blob_flags, + size_t blob_size, + uint64_t blob_id, + uint32_t *res_id) +{ +#ifdef SIMULATE_BO_SIZE_FIX + blob_size = align64(blob_size, 4096); +#endif + + struct drm_virtgpu_resource_create_blob args = { + .blob_mem = blob_mem, + .blob_flags = blob_flags, + .bo_handle = 0, + .res_handle = 0, + .size = blob_size, + .pad = 0, + .cmd_size = 0, + .cmd = 0, + .blob_id = blob_id, + }; + + if (virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_RESOURCE_CREATE_BLOB, &args)) + return 0; + + *res_id = args.res_handle; + return args.bo_handle; +} + +static void +virtgpu_ioctl_gem_close(struct virtgpu *gpu, uint32_t gem_handle) +{ + struct drm_gem_close args = { + .handle = gem_handle, + .pad = 0, + }; + + const int ret = virtgpu_ioctl(gpu, DRM_IOCTL_GEM_CLOSE, &args); + assert(!ret); +#ifdef NDEBUG + UNUSED(ret); +#endif +} + +static void * +virtgpu_ioctl_map(struct virtgpu *gpu, uint32_t gem_handle, size_t size) +{ + struct drm_virtgpu_map args = { + .offset = 0, + .handle = gem_handle, + .pad = 0, + }; + + if (virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_MAP, &args)) + return NULL; + + void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, gpu->fd, + args.offset); + if (ptr == MAP_FAILED) + return NULL; + + return ptr; +} + +void +virtgpu_shmem_destroy(struct virtgpu *gpu, + struct virtgpu_shmem *shmem) +{ + munmap(shmem->base.mmap_ptr, shmem->base.mmap_size); + virtgpu_ioctl_gem_close(gpu, shmem->gem_handle); +} + +struct vn_renderer_shmem * +virtgpu_shmem_create(struct virtgpu *gpu, size_t size) +{ + size = align64(size, 16384); + + uint32_t res_id; + uint32_t gem_handle = virtgpu_ioctl_resource_create_blob( + gpu, gpu->shmem_blob_mem, VIRTGPU_BLOB_FLAG_USE_MAPPABLE, size, 0, + &res_id); + if (!gem_handle) + return NULL; + + void *ptr = virtgpu_ioctl_map(gpu, gem_handle, size); + if (!ptr) { + virtgpu_ioctl_gem_close(gpu, gem_handle); + return NULL; + } + if (gpu->shmem_array.elem_size == 0) { + 
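+    // Defensive check: the sparse array is initialized by create_virtgpu();
+    // util_sparse_array_get() below relies on elem_size to size its nodes.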
INFO("gpu->shmem_array.elem_size == 0 | Not working :/\n"); + assert(false); + } + struct virtgpu_shmem *shmem = (struct virtgpu_shmem *) util_sparse_array_get(&gpu->shmem_array, gem_handle); + + shmem->gem_handle = gem_handle; + shmem->base.res_id = res_id; + shmem->base.mmap_size = size; + shmem->base.mmap_ptr = ptr; + shmem->base.refcount.count = 1; + shmem->base.gem_handle = gem_handle; + shmem->base.shmem = shmem; + + return &shmem->base; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-shm.h b/ggml/src/ggml-remotingfrontend/virtgpu-shm.h new file mode 100644 index 0000000000000..52217f5b7e857 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-shm.h @@ -0,0 +1,39 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "virtgpu.h" +#include "virtgpu-utils.h" + +struct vn_refcount { + int count; //atomic_int +}; + + +struct vn_renderer_shmem { + struct vn_refcount refcount; + + uint32_t res_id; + size_t mmap_size; /* for internal use only (i.e., munmap) */ + void *mmap_ptr; + + struct list_head cache_head; + int64_t cache_timestamp; + + uint32_t gem_handle; + + struct virtgpu_shmem *shmem; +}; + +struct vn_renderer_shmem *virtgpu_shmem_create(struct virtgpu *gpu, size_t size); +void virtgpu_shmem_destroy(struct virtgpu *gpu, struct virtgpu_shmem *shmem); + + +struct virtgpu_shmem { + struct vn_renderer_shmem base; + uint32_t gem_handle; +}; diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp new file mode 100644 index 0000000000000..833f0e4680103 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp @@ -0,0 +1,200 @@ +#include "virtgpu-utils.h" +#include +#include +#include + +#define NODE_ALLOC_ALIGN 64 +#define NODE_PTR_MASK (~((uintptr_t)NODE_ALLOC_ALIGN - 1)) +#define NODE_LEVEL_MASK ((uintptr_t)NODE_ALLOC_ALIGN - 1) +#define NULL_NODE 0 + +#define os_malloc_aligned(_size, _align) _aligned_malloc(_size, _align) +#define os_free_aligned(_ptr) free(_ptr) +#define p_atomic_cmpxchg(v, old, _new) \ + __sync_val_compare_and_swap((v), (old), (_new)) + +static inline uint64_t +util_logbase2_64(uint64_t n) +{ +#if defined(HAVE___BUILTIN_CLZLL) + return ((sizeof(uint64_t) * 8 - 1) - __builtin_clzll(n | 1)); +#else + uint64_t pos = 0ull; + if (n >= 1ull<<32) { n >>= 32; pos += 32; } + if (n >= 1ull<<16) { n >>= 16; pos += 16; } + if (n >= 1ull<< 8) { n >>= 8; pos += 8; } + if (n >= 1ull<< 4) { n >>= 4; pos += 4; } + if (n >= 1ull<< 2) { n >>= 2; pos += 2; } + if (n >= 1ull<< 1) { pos += 1; } + return pos; +#endif +} + +void +util_sparse_array_init(struct util_sparse_array *arr, + size_t elem_size, size_t node_size) +{ + memset(arr, 0, sizeof(*arr)); + arr->elem_size = elem_size; + arr->node_size_log2 = util_logbase2_64(node_size); + assert(node_size >= 2 && node_size == (1ull << arr->node_size_log2)); +} + +static inline void * +os_malloc_aligned(size_t size, size_t alignment) +{ + void *ptr; + alignment = (alignment + sizeof(void*) - 1) & ~(sizeof(void*) - 1); + if(posix_memalign(&ptr, alignment, size) != 0) + return NULL; + return ptr; +} + +static inline void * +_util_sparse_array_node_data(uintptr_t handle) +{ + return (void *)(handle & NODE_PTR_MASK); +} + +static inline unsigned +_util_sparse_array_node_level(uintptr_t handle) +{ + return handle & NODE_LEVEL_MASK; +} + +static inline void +_util_sparse_array_node_finish(struct util_sparse_array *arr, + uintptr_t node) +{ + if (_util_sparse_array_node_level(node) > 0) { + uintptr_t *children = (uintptr_t *) 
_util_sparse_array_node_data(node); + size_t node_size = 1ull << arr->node_size_log2; + for (size_t i = 0; i < node_size; i++) { + if (children[i]) + _util_sparse_array_node_finish(arr, children[i]); + } + } + + os_free_aligned(_util_sparse_array_node_data(node)); +} + +static inline uintptr_t +_util_sparse_array_node(void *data, unsigned level) +{ + assert(data != NULL); + assert(((uintptr_t)data & NODE_LEVEL_MASK) == 0); + assert((level & NODE_PTR_MASK) == 0); + return (uintptr_t)data | level; +} + +inline uintptr_t +_util_sparse_array_node_alloc(struct util_sparse_array *arr, + unsigned level) +{ + size_t size; + if (level == 0) { + size = arr->elem_size << arr->node_size_log2; + } else { + size = sizeof(uintptr_t) << arr->node_size_log2; + } + + void *data = os_malloc_aligned(size, NODE_ALLOC_ALIGN); + memset(data, 0, size); + + return _util_sparse_array_node(data, level); +} + +static inline uintptr_t +_util_sparse_array_set_or_free_node(uintptr_t *node_ptr, + uintptr_t cmp_node, + uintptr_t node) +{ + uintptr_t prev_node = p_atomic_cmpxchg(node_ptr, cmp_node, node); + + if (prev_node != cmp_node) { + /* We lost the race. Free this one and return the one that was already + * allocated. + */ + os_free_aligned(_util_sparse_array_node_data(node)); + return prev_node; + } else { + return node; + } +} + +void * +util_sparse_array_get(struct util_sparse_array *arr, uint64_t idx) +{ + const unsigned node_size_log2 = arr->node_size_log2; + uintptr_t root = p_atomic_read(&arr->root); + if (unlikely(!root)) { + unsigned root_level = 0; + uint64_t idx_iter = idx >> node_size_log2; + while (idx_iter) { + idx_iter >>= node_size_log2; + root_level++; + } + uintptr_t new_root = _util_sparse_array_node_alloc(arr, root_level); + root = _util_sparse_array_set_or_free_node(&arr->root, + NULL_NODE, new_root); + } + + while (1) { + unsigned root_level = _util_sparse_array_node_level(root); + uint64_t root_idx = idx >> (root_level * node_size_log2); + if (likely(root_idx < (1ull << node_size_log2))) + break; + + /* In this case, we have a root but its level is low enough that the + * requested index is out-of-bounds. + */ + uintptr_t new_root = _util_sparse_array_node_alloc(arr, root_level + 1); + + uintptr_t *new_root_children = (uintptr_t *) _util_sparse_array_node_data(new_root); + new_root_children[0] = root; + + /* We only add one at a time instead of the whole tree because it's + * easier to ensure correctness of both the tree building and the + * clean-up path. Because we're only adding one node we never have to + * worry about trying to free multiple things without freeing the old + * things. 
+ */ + root = _util_sparse_array_set_or_free_node(&arr->root, root, new_root); + } + + void *node_data = _util_sparse_array_node_data(root); + unsigned node_level = _util_sparse_array_node_level(root); + while (node_level > 0) { + uint64_t child_idx = (idx >> (node_level * node_size_log2)) & + ((1ull << node_size_log2) - 1); + + uintptr_t *children = (uintptr_t *) node_data; + uintptr_t child = p_atomic_read(&children[child_idx]); + + if (unlikely(!child)) { + child = _util_sparse_array_node_alloc(arr, node_level - 1); + child = _util_sparse_array_set_or_free_node(&children[child_idx], + NULL_NODE, child); + } + + node_data = _util_sparse_array_node_data(child); + node_level = _util_sparse_array_node_level(child); + } + + uint64_t elem_idx = idx & ((1ull << node_size_log2) - 1); + return (void *)((char *)node_data + (elem_idx * arr->elem_size)); +} + +void *something = NULL; +void thks_bye () { + // break here + INFO("thks bye, stopping early and happilly :)"); + if (!something) { // avoid the [[noreturn]] detection mechanism + exit(0); + } +} + +void breakpoint() { + // break here + INFO("breakpoint here :)"); +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h new file mode 100644 index 0000000000000..6b69ebc6329ca --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h @@ -0,0 +1,108 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define unlikely(x) __builtin_expect(!!(x), 0) +#define likely(x) __builtin_expect(!!(x), 1) + +#ifndef UNUSED +#define UNUSED(x) (void)(x) +#endif + +/** Checks is a value is a power of two. Does not handle zero. */ +#define IS_POT(v) (((v) & ((v) - 1)) == 0) + +/** Checks is a value is a power of two. Zero handled. */ +#define IS_POT_NONZERO(v) ((v) != 0 && IS_POT(v)) + +/** Align a value to a power of two */ +#define ALIGN_POT(x, pot_align) (((x) + (pot_align) - 1) & ~((pot_align) - 1)) + +#define p_atomic_read(_v) __atomic_load_n((_v), __ATOMIC_ACQUIRE) + +void thks_bye(); +void breakpoint(); + +#ifndef NDEBUG +inline void +INFO(const char *format, ...) { + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); +} +#else +inline void +INFO(...) {} +#endif + +inline void +WARNING(const char *format, ...) { + fprintf(stderr, "WARNING: "); + + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); +} + +inline void +FATAL(const char *format, ...) 
{ + fprintf(stderr, "FATAL: "); + + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); + assert(false); +} + +static inline bool +util_is_power_of_two_nonzero64(uint64_t v) +{ + return IS_POT_NONZERO(v); +} + +static inline uint64_t +align64(uint64_t value, uint64_t alignment) +{ + assert(util_is_power_of_two_nonzero64(alignment)); + return ALIGN_POT(value, alignment); +} + +struct list_head +{ + struct list_head *prev; + struct list_head *next; +}; + +struct util_sparse_array { + size_t elem_size; + unsigned node_size_log2; + + uintptr_t root; +}; + +void *util_sparse_array_get(struct util_sparse_array *arr, uint64_t idx); +void util_sparse_array_init(struct util_sparse_array *arr, + size_t elem_size, size_t node_size); + +inline void +os_time_sleep(int64_t usecs) +{ + struct timespec time; + time.tv_sec = usecs / 1000000; + time.tv_nsec = (usecs % 1000000) * 1000; + while (clock_nanosleep(CLOCK_MONOTONIC, 0, &time, &time) == EINTR); +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp new file mode 100644 index 0000000000000..66bbf17ac6d63 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -0,0 +1,497 @@ +#include +#include +#include +#include + +#include + +#include "virtgpu.h" + +static virt_gpu_result_t virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev); +static virt_gpu_result_t virtgpu_open(struct virtgpu *gpu); + + +static virt_gpu_result_t virtgpu_init_params(struct virtgpu *gpu); +static virt_gpu_result_t virtgpu_init_capset(struct virtgpu *gpu); +static virt_gpu_result_t virtgpu_init_context(struct virtgpu *gpu); + +static int virtgpu_ioctl_context_init(struct virtgpu *gpu, + enum virgl_renderer_capset capset_id); +static int +virtgpu_ioctl_get_caps(struct virtgpu *gpu, + enum virgl_renderer_capset id, + uint32_t version, + void *capset, + size_t capset_size); +static uint64_t virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param); +static void virtgpu_init_renderer_info(struct virtgpu *gpu); + +static inline void +virtgpu_init_shmem_blob_mem(struct virtgpu *gpu) +{ + /* VIRTGPU_BLOB_MEM_GUEST allocates from the guest system memory. They are + * logically contiguous in the guest but are sglists (iovecs) in the host. + * That makes them slower to process in the host. With host process + * isolation, it also becomes impossible for the host to access sglists + * directly. + * + * While there are ideas (and shipped code in some cases) such as creating + * udmabufs from sglists, or having a dedicated guest heap, it seems the + * easiest way is to reuse VIRTGPU_BLOB_MEM_HOST3D. That is, when the + * renderer sees a request to export a blob where + * + * - blob_mem is VIRTGPU_BLOB_MEM_HOST3D + * - blob_flags is VIRTGPU_BLOB_FLAG_USE_MAPPABLE + * - blob_id is 0 + * + * it allocates a host shmem. + * + * supports_blob_id_0 has been enforced by mandated render server config. 
+ */ + assert(gpu->capset.data.supports_blob_id_0); + gpu->shmem_blob_mem = VIRTGPU_BLOB_MEM_HOST3D; +} + +struct virtgpu * +create_virtgpu() { + struct virtgpu *gpu = new struct virtgpu(); + + util_sparse_array_init(&gpu->shmem_array, sizeof(struct virtgpu_shmem), + 1024); + + virt_gpu_result_t result = virtgpu_open(gpu); + assert(result == APIR_SUCCESS); + + result = virtgpu_init_params(gpu); + assert(result == APIR_SUCCESS); + + result = virtgpu_init_capset(gpu); + assert(result == APIR_SUCCESS); + + result = virtgpu_init_context(gpu); + assert(result == APIR_SUCCESS); + +#ifdef NDEBUG + UNUSED(result); +#endif + + virtgpu_init_shmem_blob_mem(gpu); + + gpu->reply_shmem = virtgpu_shmem_create(gpu, 0x4000); + gpu->data_shmem = virtgpu_shmem_create(gpu, 0x1830000); // 24MiB + + if (!gpu->reply_shmem) { + FATAL("%s: failed to create the shared reply memory pages :/", __func__); + } + + if (!gpu->data_shmem) { + FATAL("%s: failed to create the shared data memory pages :/", __func__); + } + + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + int32_t ret; + + encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_LoadLibrary, 0); + if (!encoder) { + FATAL("%s: failed to prepare the remote call encoder :/", __func__); + } + decoder = remote_call(gpu, encoder); + if (!decoder) { + FATAL("%s: failed to kick the remote call :/", __func__); + } + + ret = remote_call_finish(encoder, decoder); + if (ret != 0) { + FATAL("%s: failed to load the APIR backend libraries (code=%d):/", __func__, ret); + } + + return gpu; +} + + +static virt_gpu_result_t +virtgpu_open(struct virtgpu *gpu) +{ + drmDevicePtr devs[8]; + int count = drmGetDevices2(0, devs, ARRAY_SIZE(devs)); + if (count < 0) { + INFO("failed to enumerate DRM devices"); + return APIR_ERROR_INITIALIZATION_FAILED; + } + + virt_gpu_result_t result = APIR_ERROR_INITIALIZATION_FAILED; + for (int i = 0; i < count; i++) { + result = virtgpu_open_device(gpu, devs[i]); + if (result == APIR_SUCCESS) + break; + } + + drmFreeDevices(devs, count); + + return result; +} + +static virt_gpu_result_t +virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev) +{ + bool supported_bus = false; + + switch (dev->bustype) { + case DRM_BUS_PCI: + if (dev->deviceinfo.pci->vendor_id == VIRTGPU_PCI_VENDOR_ID && + dev->deviceinfo.pci->device_id == VIRTGPU_PCI_DEVICE_ID) + supported_bus = true; + break; + case DRM_BUS_PLATFORM: + supported_bus = true; + break; + default: + break; + } + + if (!supported_bus || !(dev->available_nodes & (1 << DRM_NODE_RENDER))) { + if (VN_DEBUG(INIT)) { + const char *name = "unknown"; + for (uint32_t i = 0; i < DRM_NODE_MAX; i++) { + if (dev->available_nodes & (1 << i)) { + name = dev->nodes[i]; + break; + } + } + vn_log(gpu->instance, "skipping DRM device %s", name); + } + return APIR_ERROR_INITIALIZATION_FAILED; + } + + const char *primary_path = dev->nodes[DRM_NODE_PRIMARY]; + const char *node_path = dev->nodes[DRM_NODE_RENDER]; + + int fd = open(node_path, O_RDWR | O_CLOEXEC); + if (fd < 0) { + if (VN_DEBUG(INIT)) + vn_log(gpu->instance, "failed to open %s", node_path); + return APIR_ERROR_INITIALIZATION_FAILED; + } + + drmVersionPtr version = drmGetVersion(fd); + if (!version || strcmp(version->name, "virtio_gpu") || + version->version_major != 0) { + if (VN_DEBUG(INIT)) { + if (version) { + vn_log(gpu->instance, "unknown DRM driver %s version %d", + version->name, version->version_major); + } else { + vn_log(gpu->instance, "failed to get DRM driver version"); + } + } + if (version) + drmFreeVersion(version); + 
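+  // Not a virtio_gpu node with the expected major version: release the
+  // fd and let virtgpu_open() move on to the next enumerated DRM device.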
close(fd); + return APIR_ERROR_INITIALIZATION_FAILED; + } + + gpu->fd = fd; + + struct stat st; + if (stat(primary_path, &st) == 0) { + gpu->has_primary = true; + gpu->primary_major = major(st.st_rdev); + gpu->primary_minor = minor(st.st_rdev); + } else { + gpu->has_primary = false; + gpu->primary_major = 0; + gpu->primary_minor = 0; + } + stat(node_path, &st); + gpu->render_major = major(st.st_rdev); + gpu->render_minor = minor(st.st_rdev); + + gpu->bustype = dev->bustype; + if (dev->bustype == DRM_BUS_PCI) + gpu->pci_bus_info = *dev->businfo.pci; + + drmFreeVersion(version); + + INFO("using DRM device %s", node_path); + + return APIR_SUCCESS; +} + +void +vn_log(struct remoting_dev_instance *instance, const char *format, ...) +{ + if (instance) { + printf(""); + } + + va_list ap; + + va_start(ap, format); + vprintf(format, ap); + va_end(ap); + + /* instance may be NULL or partially initialized */ +} + +static virt_gpu_result_t +virtgpu_init_context(struct virtgpu *gpu) +{ + assert(!gpu->capset.version); + const int ret = virtgpu_ioctl_context_init(gpu, gpu->capset.id); + if (ret) { + if (VN_DEBUG(INIT)) { + vn_log(gpu->instance, "failed to initialize context: %s", + strerror(errno)); + } + return APIR_ERROR_INITIALIZATION_FAILED; + } + + return APIR_SUCCESS; +} + +static virt_gpu_result_t +virtgpu_init_capset(struct virtgpu *gpu) +{ + gpu->capset.id = VIRGL_RENDERER_CAPSET_VENUS; + gpu->capset.version = 0; + + const int ret = + virtgpu_ioctl_get_caps(gpu, gpu->capset.id, gpu->capset.version, + &gpu->capset.data, sizeof(gpu->capset.data)); + if (ret) { + if (VN_DEBUG(INIT)) { + vn_log(gpu->instance, "failed to get venus v%d capset: %s", + gpu->capset.version, strerror(errno)); + } + return APIR_ERROR_INITIALIZATION_FAILED; + } + + return APIR_SUCCESS; +} + +static virt_gpu_result_t +virtgpu_init_params(struct virtgpu *gpu) +{ + const uint64_t required_params[] = { + VIRTGPU_PARAM_3D_FEATURES, VIRTGPU_PARAM_CAPSET_QUERY_FIX, + VIRTGPU_PARAM_RESOURCE_BLOB, VIRTGPU_PARAM_CONTEXT_INIT, + }; + uint64_t val; + for (uint32_t i = 0; i < ARRAY_SIZE(required_params); i++) { + val = virtgpu_ioctl_getparam(gpu, required_params[i]); + if (!val) { + if (VN_DEBUG(INIT)) { + vn_log(gpu->instance, "required kernel param %d is missing", + (int)required_params[i]); + } + return APIR_ERROR_INITIALIZATION_FAILED; + } + } + + val = virtgpu_ioctl_getparam(gpu, VIRTGPU_PARAM_HOST_VISIBLE); + if (val) { + gpu->bo_blob_mem = VIRTGPU_BLOB_MEM_HOST3D; + } else { + val = virtgpu_ioctl_getparam(gpu, VIRTGPU_PARAM_GUEST_VRAM); + if (val) { + gpu->bo_blob_mem = VIRTGPU_BLOB_MEM_GUEST_VRAM; + } + } + + if (!val) { + vn_log(gpu->instance, + "one of required kernel params (%d or %d) is missing", + (int)VIRTGPU_PARAM_HOST_VISIBLE, (int)VIRTGPU_PARAM_GUEST_VRAM); + return APIR_ERROR_INITIALIZATION_FAILED; + } + + /* Cross-device feature is optional. It enables sharing dma-bufs + * with other virtio devices, like virtio-wl or virtio-video used + * by ChromeOS VMs. Qemu doesn't support cross-device sharing. 
+ */ + val = virtgpu_ioctl_getparam(gpu, VIRTGPU_PARAM_CROSS_DEVICE); + if (val) + gpu->supports_cross_device = true; + + /* implied by CONTEXT_INIT uapi */ + gpu->max_timeline_count = 64; + + return APIR_SUCCESS; +} + +static int +virtgpu_ioctl_context_init(struct virtgpu *gpu, + enum virgl_renderer_capset capset_id) +{ + struct drm_virtgpu_context_set_param ctx_set_params[3] = { + { + .param = VIRTGPU_CONTEXT_PARAM_CAPSET_ID, + .value = capset_id, + }, + { + .param = VIRTGPU_CONTEXT_PARAM_NUM_RINGS, + .value = 64, + }, + { + .param = VIRTGPU_CONTEXT_PARAM_POLL_RINGS_MASK, + .value = 0, /* don't generate drm_events on fence signaling */ + }, + }; + + struct drm_virtgpu_context_init args = { + .num_params = ARRAY_SIZE(ctx_set_params), + .pad = 0, + .ctx_set_params = (uintptr_t)&ctx_set_params, + }; + + return virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_CONTEXT_INIT, &args); +} + +static int +virtgpu_ioctl_get_caps(struct virtgpu *gpu, + enum virgl_renderer_capset id, + uint32_t version, + void *capset, + size_t capset_size) +{ + struct drm_virtgpu_get_caps args = { + .cap_set_id = id, + .cap_set_ver = version, + .addr = (uintptr_t)capset, + .size = (__u32) capset_size, + .pad = 0, + }; + + return virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_GET_CAPS, &args); +} + +static uint64_t +virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param) +{ + /* val must be zeroed because kernel only writes the lower 32 bits */ + uint64_t val = 0; + struct drm_virtgpu_getparam args = { + .param = param, + .value = (uintptr_t)&val, + }; + + const int ret = virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_GETPARAM, &args); + return ret ? 0 : val; +} + + +struct vn_cs_encoder * +remote_call_prepare( + struct virtgpu *gpu, + int32_t cmd_type, + int32_t cmd_flags) +{ + + if (!gpu->reply_shmem) { + FATAL("%s: the reply shmem page can't be null", __func__); + } + + /* + * Prepare the command encoder and its buffer + */ + + static char encoder_buffer[4096]; + + static struct vn_cs_encoder enc; + enc = { + encoder_buffer, + encoder_buffer, + encoder_buffer + sizeof(encoder_buffer), + }; + + /* + * Fill the command encoder with the common args: + * - cmd_type (int32_t) + * - cmd_flags (int32_t) + * - reply res id (uint32_t) + */ + + vn_encode_int32_t(&enc, &cmd_type); + vn_encode_int32_t(&enc, &cmd_flags); + + uint32_t reply_res_id = gpu->reply_shmem->res_id; + vn_encode_uint32_t(&enc, &reply_res_id); + + return &enc; +} + +int32_t +remote_call_finish(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + if (!enc) { + WARNING("Invalid (null) encoder :/"); + } + if (!dec) { + FATAL("Invalid (null) decoder :/"); + } + int32_t remote_call_ret; + vn_decode_int32_t(dec, &remote_call_ret); + + // encoder and decoder are statically allocated, nothing to do to release them + + return remote_call_ret; +} + +struct vn_cs_decoder * +remote_call( + struct virtgpu *gpu, + struct vn_cs_encoder *encoder + ) +{ + /* + * Prepare the reply notification pointer + */ + + volatile std::atomic_uint *atomic_reply_notif = (volatile std::atomic_uint *) gpu->reply_shmem->mmap_ptr; + *atomic_reply_notif = 0; + + /* + * Trigger the execbuf ioctl + */ + + struct drm_virtgpu_execbuffer args = { + .flags = VIRTGPU_EXECBUF_RING_IDX, + .size = (uint32_t) (encoder->cur - encoder->start), + .command = (uintptr_t) encoder->start, + + .bo_handles = 0, + .num_bo_handles = 0, + + .fence_fd = 0, + .ring_idx = 0, + .syncobj_stride = 0, + .num_in_syncobjs = 0, + .num_out_syncobjs = 0, + .in_syncobjs = 0, + .out_syncobjs = 0, + }; + + int ret = drmIoctl(gpu->fd, 
DRM_IOCTL_VIRTGPU_EXECBUFFER, &args);
+
+  if (ret != 0) {
+    FATAL("%s: the virtgpu EXECBUFFER ioctl failed (%d) :/", __func__, ret);
+  }
+  /*
+   * Wait for the response notification
+   */
+
+  while (std::atomic_load_explicit(atomic_reply_notif, std::memory_order_acquire) == 0) {
+    int64_t base_sleep_us = 15;
+
+    os_time_sleep(base_sleep_us);
+  }
+
+  /*
+   * Prepare the decoder
+   */
+  static struct vn_cs_decoder dec;
+  dec.cur = (char *) gpu->reply_shmem->mmap_ptr + sizeof(*atomic_reply_notif);
+  dec.end = (char *) gpu->reply_shmem->mmap_ptr + gpu->reply_shmem->mmap_size;
+
+  return &dec;
+}
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.h b/ggml/src/ggml-remotingfrontend/virtgpu.h
new file mode 100644
index 0000000000000..9d8668c3d070e
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/virtgpu.h
@@ -0,0 +1,117 @@
+#pragma once
+
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <threads.h>
+#include <xf86drm.h>
+
+#include "virtgpu-forward.h"
+#include "virtgpu-utils.h"
+#include "../ggml-remotingbackend/shared/api_remoting.h"
+#include "../ggml-remotingbackend/shared/venus_cs.h"
+
+#include "virtgpu-shm.h"
+
+#define VIRGL_RENDERER_UNSTABLE_APIS 1
+#include "drm-uapi/virtgpu_drm.h"
+#include "venus_hw.h"
+
+// must match https://gitlab.freedesktop.org/kpouget/virglrenderer/-/blob/main/src/virglrenderer_hw.h?ref_type=heads
+enum virgl_renderer_capset {
+  VIRGL_RENDERER_CAPSET_VIRGL = 1,
+  VIRGL_RENDERER_CAPSET_VIRGL2 = 2,
+  /* 3 is reserved for gfxstream */
+  VIRGL_RENDERER_CAPSET_VENUS = 4,
+  /* 5 is reserved for cross-domain */
+  VIRGL_RENDERER_CAPSET_DRM = 6,
+};
+
+/* from src/virtio/vulkan/vn_renderer_virtgpu.c */
+#define VIRTGPU_PCI_VENDOR_ID 0x1af4
+#define VIRTGPU_PCI_DEVICE_ID 0x1050
+#define VIRTGPU_BLOB_MEM_GUEST_VRAM 0x0004
+#define VIRTGPU_PARAM_GUEST_VRAM 9
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+#define VN_DEBUG(what) true
+
+typedef enum virt_gpu_result_t {
+  APIR_SUCCESS = 0,
+  APIR_ERROR_INITIALIZATION_FAILED = -1,
+} virt_gpu_result_t;
+
+
+struct remoting_dev_instance {
+  int yes;
+};
+
+#define PRINTFLIKE(f, a) __attribute__ ((format(__printf__, f, a)))
+
+inline void
+vn_log(struct remoting_dev_instance *instance, const char *format, ...)
+ PRINTFLIKE(2, 3); + + +struct virtgpu { + struct remoting_dev_instance *instance; + + int fd; + + bool has_primary; + int primary_major; + int primary_minor; + int render_major; + int render_minor; + + int bustype; + drmPciBusInfo pci_bus_info; + + uint32_t max_timeline_count; + + struct { + enum virgl_renderer_capset id; + uint32_t version; + struct virgl_renderer_capset_venus data; + } capset; + + uint32_t shmem_blob_mem; + uint32_t bo_blob_mem; + + /* note that we use gem_handle instead of res_id to index because + * res_id is monotonically increasing by default (see + * virtio_gpu_resource_id_get) + */ + struct util_sparse_array shmem_array; + // struct util_sparse_array bo_array; + + mtx_t dma_buf_import_mutex; + + // struct virtgpu_shmem_cache shmem_cache; + + bool supports_cross_device; + + /* KP */ + struct vn_renderer_shmem *reply_shmem; + struct vn_renderer_shmem *data_shmem; +}; + + +static inline int +virtgpu_ioctl(struct virtgpu *gpu, unsigned long request, void *args) +{ + return drmIoctl(gpu->fd, request, args); +} + +struct virtgpu *create_virtgpu(); + +struct vn_cs_encoder *remote_call_prepare( + struct virtgpu *gpu, + int32_t cmd_type, + int32_t cmd_flags); +struct vn_cs_decoder *remote_call(struct virtgpu *gpu, struct vn_cs_encoder *enc); +int32_t remote_call_finish(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); diff --git a/podman_compile.sh b/podman_compile.sh new file mode 100755 index 0000000000000..de9e5c88d57a7 --- /dev/null +++ b/podman_compile.sh @@ -0,0 +1,39 @@ +#! /bin/bash + + +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace + +opts="" +opts="$opts --device /dev/dri " +echo "Running with the GPU passthrough" + +IMAGE=quay.io/ramalama/remoting:latest + +what=${1:-} +if [[ -z "$what" ]]; then + what=remoting +fi + +cmd="bash ./build.$what.sh" + +POD_NAME=mac_ai_compiling +podman machine ssh podman rm $POD_NAME --force + +set -x +podman run \ +--name $POD_NAME \ +--user root:root \ +--cgroupns host \ +--security-opt label=disable \ +--env HOME="$HOME" \ +--env PERF_MODE="${PERF_MODE:-}" \ +--env BENCH_MODE="${BENCH_MODE:-}" \ +-v "$HOME":"$HOME":Z \ +-w "$PWD" \ +-it --rm \ +$opts \ +$IMAGE \ +$cmd diff --git a/prepare.backend.sh b/prepare.backend.sh new file mode 100755 index 0000000000000..93373ec5b7a5c --- /dev/null +++ b/prepare.backend.sh @@ -0,0 +1,11 @@ +cmake -S . -B ../build.remoting-backend \ + -DGGML_REMOTINGBACKEND=ON \ + -DGGML_NATIVE=OFF \ + -DGGML_METAL=ON \ + -DGGML_BACKEND_DL=OFF \ + -DLLAMA_CURL=OFF \ + -DGGML_VULKAN=OFF -DVulkan_INCLUDE_DIR=/opt/homebrew/include/ -DVulkan_LIBRARY=/opt/homebrew/lib/libMoltenVK.dylib \ + "$@" + +# -DCMAKE_BUILD_TYPE=Debug \ +# diff --git a/prepare.remoting.sh b/prepare.remoting.sh new file mode 100755 index 0000000000000..5ab73470477b1 --- /dev/null +++ b/prepare.remoting.sh @@ -0,0 +1,8 @@ +cmake -S . -B ../build.remoting-frontend \ + -DGGML_REMOTINGFRONTEND=ON \ + -DGGML_CPU_ARM_ARCH=native \ + -DGGML_NATIVE=OFF \ + -DGGML_OPENMP=OFF \ + -DLLAMA_CURL=OFF \ + -DCMAKE_BUILD_TYPE=Debug \ + "$@" diff --git a/prepare.sh b/prepare.sh new file mode 100644 index 0000000000000..2fb46cefd426c --- /dev/null +++ b/prepare.sh @@ -0,0 +1 @@ +cmake -S . -B ./build -DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DGGML_METAL=OFF #-DCMAKE_BUILD_TYPE=Debug #-DGGML_VULKAN_DEBUG=1 diff --git a/prepare.vulkan.sh b/prepare.vulkan.sh new file mode 100644 index 0000000000000..7bacf9b21a9ca --- /dev/null +++ b/prepare.vulkan.sh @@ -0,0 +1,6 @@ +cmake -S . 
\
+    -B ../build.vulkan \
+    -DGGML_VULKAN=ON \
+    -DGGML_NATIVE=OFF \
+    -DGGML_METAL=OFF \
+    -DCMAKE_BUILD_TYPE=Debug
diff --git a/run.remoting.sh b/run.remoting.sh
new file mode 100755
index 0000000000000..11ac21f4b1b14
--- /dev/null
+++ b/run.remoting.sh
@@ -0,0 +1,50 @@
+#! /bin/bash
+#clear
+if [[ ${1:-} == "strace" ]]; then
+    prefix="strace"
+elif [[ ${1:-} == "gdb" ]]; then
+    prefix="gdb --args"
+else
+    prefix=""
+fi
+
+MODEL=${MODEL:-llama3.2}
+
+LLAMA_BUILD_DIR=../build.remoting-frontend$FLAVOR
+
+MODEL_HOME="$HOME/models"
+
+set -x
+if [[ "${BENCH_MODE:-}" == "bench" ]]; then
+    cat <<EOF
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
+#include "llama-kv-cache.h"
+
+#include "llama-impl.h"
+#include "llama-batch.h"
+#include "llama-cparams.h"
+#include "llama-model.h"
+
+#include <algorithm>
+#include <cassert>
+#include <limits>
+#include <map>
+#include <stdexcept>
+
+llama_kv_cache_unified::llama_kv_cache_unified(const llama_hparams & hparams, callbacks cbs) : hparams(hparams), cbs(std::move(cbs)) {
+}
+
+bool llama_kv_cache_unified::init(
+        const llama_model & model,
+        const llama_cparams & cparams,
+        ggml_type type_k,
+        ggml_type type_v,
+        uint32_t kv_size,
+        bool offload) {
+    const int32_t n_layer = hparams.n_layer;
+
+    has_shift = false;
+
+    recurrent = llama_model_is_recurrent(&model);
+    v_trans   = !recurrent && !cparams.flash_attn;
+    can_shift = !recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA
+
+    LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d\n",
+            __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, can_shift);
+
+    head = 0;
+    size = kv_size;
+    used = 0;
+
+    this->type_k = type_k;
+    this->type_v = type_v;
+
+    cells.clear();
+    cells.resize(kv_size);
+
+    // create a context for each buffer type
+    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
+        auto it = ctx_map.find(buft);
+        if (it == ctx_map.end()) {
+            ggml_init_params params = {
+                /*.mem_size   =*/ size_t(2u*n_layer*ggml_tensor_overhead()),
+                /*.mem_buffer =*/ NULL,
+                /*.no_alloc   =*/ true,
+            };
+
+            ggml_context * ctx = ggml_init(params);
+            if (!ctx) {
+                return nullptr;
+            }
+
+            ctx_map[buft] = ctx;
+            ctxs.emplace_back(ctx);
+
+            return ctx;
+        }
+
+        return it->second;
+    };
+
+    k_l.reserve(n_layer);
+    v_l.reserve(n_layer);
+
+    for (int i = 0; i < n_layer; i++) {
+        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
+        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
+
+        const char * dev_name = "CPU";
+
+        ggml_backend_buffer_type_t buft;
+        if (offload) {
+            auto * dev = model.dev_layer(i);
+            buft = ggml_backend_dev_buffer_type(dev);
+
+            dev_name = ggml_backend_dev_name(dev);
+        } else {
+            buft = ggml_backend_cpu_buffer_type();
+        }
+
+        /*
+        LLAMA_LOG_DEBUG("%s: layer %3d: n_embd_k_gqa = %d, n_embd_v_gqa = %d, dev = %s\n", __func__,
+                i, n_embd_k_gqa, n_embd_v_gqa, dev_name);
+        */
+
+        ggml_context * ctx = ctx_for_buft(buft);
+        if (!ctx) {
+            LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__);
+            return false;
+        }
+
+        ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
+        ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
+        ggml_format_name(k, "cache_k_l%d", i);
+        ggml_format_name(v, "cache_v_l%d", i);
+        k_l.push_back(k);
+        v_l.push_back(v);
+    }
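+
+    /*
+     * Note: each buffer type gets a single ggml context, sized for exactly
+     * two tensors (K and V) per layer and created with no_alloc = true:
+     * the contexts only hold tensor metadata, the backing memory is
+     * allocated per buffer type in the loop below.
+     */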
+    // allocate tensors and initialize the buffers to avoid NaNs in the padding
+    for (auto it : ctx_map) {
+        auto * buft = it.first;
+        auto * ctx  = it.second;
+
+        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+        if (!buf) {
+            LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
+            return false;
+        }
+        ggml_backend_buffer_clear(buf, 0);
+        LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
+        bufs.emplace_back(buf);
+    }
+
+    return true;
+}
+
+int32_t llama_kv_cache_unified::get_n_tokens() const {
+    int32_t result = 0;
+
+    for (uint32_t i = 0; i < size; i++) {
+        result += cells[i].seq_id.size();
+    }
+
+    return result;
+}
+
+int32_t llama_kv_cache_unified::get_used_cells() const {
+    return used;
+}
+
+size_t llama_kv_cache_unified::total_size() const {
+    size_t size = 0;
+    for (const auto & buf : bufs) {
+        size += ggml_backend_buffer_get_size(buf.get());
+    }
+
+    return size;
+}
+
+llama_pos llama_kv_cache_unified::pos_max() const {
+    llama_pos pos_max = -1;
+    for (const auto & cell : cells) {
+        pos_max = std::max(pos_max, cell.pos);
+    }
+
+    return pos_max;
+}
+
+void llama_kv_cache_unified::clear() {
+    for (int32_t i = 0; i < (int32_t) size; ++i) {
+        cells[i].pos = -1;
+        cells[i].seq_id.clear();
+        cells[i].src = -1;
+        cells[i].tail = -1;
+    }
+    head = 0;
+    used = 0;
+
+    for (auto & buf : bufs) {
+        ggml_backend_buffer_clear(buf.get(), 0);
+    }
+}
+
+bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+    uint32_t new_head = size;
+
+    if (p0 < 0) {
+        p0 = 0;
+    }
+
+    if (p1 < 0) {
+        p1 = std::numeric_limits<llama_pos>::max();
+    }
+
+    // models like Mamba or RWKV can't have a state partially erased
+    if (recurrent) {
+        if (seq_id >= (int64_t) size) {
+            // could be fatal
+            return false;
+        }
+        if (0 <= seq_id) {
+            int32_t & tail_id = cells[seq_id].tail;
+            if (tail_id >= 0) {
+                const llama_kv_cell & cell = cells[tail_id];
+                // partial intersection is invalid
+                if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) {
+                    return false;
+                }
+                // invalidate tails which will be cleared
+                if (p0 <= cell.pos && cell.pos < p1) {
+                    tail_id = -1;
+                }
+            }
+        } else {
+            // seq_id is negative, then the range should include everything or nothing
+            if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    for (uint32_t i = 0; i < size; ++i) {
+        if (cells[i].pos >= p0 && cells[i].pos < p1) {
+            if (seq_id < 0) {
+                cells[i].seq_id.clear();
+            } else if (cells[i].has_seq_id(seq_id)) {
+                cells[i].seq_id.erase(seq_id);
+            } else {
+                continue;
+            }
+            if (cells[i].is_empty()) {
+                // keep count of the number of used cells
+                if (cells[i].pos >= 0) {
+                    used--;
+                }
+
+                cells[i].pos = -1;
+                cells[i].src = -1;
+
+                if (new_head == size) {
+                    new_head = i;
+                }
+            }
+        }
+    }
+
+    // If we freed up a slot, set head to it so searching can start there.
+    if (new_head != size && new_head < head) {
+        head = new_head;
+    }
+
+    return true;
+}
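+
+/*
+ * seq_cp: for recurrent models the copy is O(1): the destination sequence
+ * just aliases the tail cell of the source sequence. On the unified
+ * (Transformer) path, every cell of seq_id_src in [p0, p1) is additionally
+ * tagged with seq_id_dst, so the K/V data is shared rather than duplicated.
+ */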
+void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+    if (seq_id_src == seq_id_dst) {
+        return;
+    }
+
+    if (p0 < 0) {
+        p0 = 0;
+    }
+
+    if (p1 < 0) {
+        p1 = std::numeric_limits<llama_pos>::max();
+    }
+
+    if (recurrent) {
+        if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) {
+            llama_kv_cell & tail_src = cells[seq_id_src];
+            llama_kv_cell & tail_dst = cells[seq_id_dst];
+            if (tail_dst.tail >= 0) {
+                // clear destination seq_id if it wasn't empty
+                llama_kv_cell & cell_dst = cells[tail_dst.tail];
+
+                cell_dst.seq_id.erase(seq_id_dst);
+                tail_dst.tail = -1;
+                if (cell_dst.seq_id.empty()) {
+                    cell_dst.pos = -1;
+                    cell_dst.delta = -1;
+                    cell_dst.src = -1;
+                    used -= 1;
+                }
+            }
+            if (tail_src.tail >= 0) {
+                llama_kv_cell & cell_src = cells[tail_src.tail];
+
+                cell_src.seq_id.insert(seq_id_dst);
+                tail_dst.tail = tail_src.tail;
+            }
+        }
+
+        return;
+    }
+
+    // otherwise, this is the KV of a Transformer-like model
+    head = 0;
+
+    for (uint32_t i = 0; i < size; ++i) {
+        if (cells[i].has_seq_id(seq_id_src) && cells[i].pos >= p0 && cells[i].pos < p1) {
+            cells[i].seq_id.insert(seq_id_dst);
+        }
+    }
+}
+
+void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) {
+    uint32_t new_head = size;
+
+    for (uint32_t i = 0; i < size; ++i) {
+        if (recurrent && (llama_seq_id) i != seq_id) {
+            cells[i].tail = -1;
+        }
+
+        if (!cells[i].has_seq_id(seq_id)) {
+            if (cells[i].pos >= 0) {
+                used--;
+            }
+
+            cells[i].pos = -1;
+            cells[i].src = -1;
+            cells[i].seq_id.clear();
+
+            if (new_head == size) {
+                new_head = i;
+            }
+        } else {
+            cells[i].seq_id.clear();
+            cells[i].seq_id.insert(seq_id);
+        }
+    }
+
+    // If we freed up a slot, set head to it so searching can start there.
+    if (new_head != size && new_head < head) {
+        head = new_head;
+    }
+}
+
+void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
+    if (delta == 0) {
+        return;
+    }
+
+    uint32_t new_head = size;
+
+    if (p0 < 0) {
+        p0 = 0;
+    }
+
+    if (p1 < 0) {
+        p1 = std::numeric_limits<llama_pos>::max();
+    }
+
+    // If there is no range then return early to avoid looping over the cache.
+    if (p0 == p1) {
+        return;
+    }
+
+    if (recurrent) {
+        // for Mamba-like or RWKV models, only the pos needs to be shifted
+        if (0 <= seq_id && seq_id < (int64_t) size) {
+            const int32_t tail_id = cells[seq_id].tail;
+            if (tail_id >= 0) {
+                llama_kv_cell & cell = cells[tail_id];
+                if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
+                    cell.pos += delta;
+                }
+            }
+        }
+        return;
+    }
+
+    for (uint32_t i = 0; i < size; ++i) {
+        if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) {
+            has_shift = true;
+            cells[i].pos   += delta;
+            cells[i].delta += delta;
+
+            if (cells[i].pos < 0) {
+                if (!cells[i].is_empty()) {
+                    used--;
+                }
+                cells[i].pos = -1;
+                cells[i].seq_id.clear();
+                if (new_head == size) {
+                    new_head = i;
+                }
+            }
+        }
+    }
+
+    // If we freed up a slot, set head to it so searching can start there.
+    // Otherwise we just start the next search from the beginning.
+    head = new_head != size ? new_head : 0;
+}
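+
+/*
+ * seq_add/seq_div only edit cell metadata: the position is updated in place
+ * and the difference is accumulated in `delta`. Setting `has_shift` makes
+ * the next KV-cache update pass apply the corresponding K-shift (RoPE
+ * rotation) to the cached K tensors.
+ */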
+void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+    if (d == 1) {
+        return;
+    }
+
+    if (p0 < 0) {
+        p0 = 0;
+    }
+
+    if (p1 < 0) {
+        p1 = std::numeric_limits<llama_pos>::max();
+    }
+
+    // If there is no range then return early to avoid looping over the cache.
+    if (p0 == p1) {
+        return;
+    }
+
+    if (recurrent) {
+        // for Mamba-like or RWKV models, only the pos needs to be changed
+        if (0 <= seq_id && seq_id < (int64_t) size) {
+            const int32_t tail_id = cells[seq_id].tail;
+            if (tail_id >= 0) {
+                llama_kv_cell & cell = cells[tail_id];
+                if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
+                    cell.pos /= d;
+                }
+            }
+        }
+
+        return;
+    }
+
+    for (uint32_t i = 0; i < size; ++i) {
+        if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) {
+            has_shift = true;
+
+            {
+                llama_pos p_old = cells[i].pos;
+                cells[i].pos   /= d;
+                cells[i].delta += cells[i].pos - p_old;
+            }
+        }
+    }
+}
+
+llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
+    llama_pos result = 0;
+
+    for (uint32_t i = 0; i < size; ++i) {
+        if (cells[i].has_seq_id(seq_id)) {
+            result = std::max(result, cells[i].pos);
+        }
+    }
+
+    return result;
+}
+
+void llama_kv_cache_unified::defrag() {
+    if (!recurrent) {
+        do_defrag = true;
+    }
+}
+
+void llama_kv_cache_unified::restore() {
+    if (pending.ranges.empty()) {
+        return;
+    }
+
+    // TODO: tmp - move to llama_kv_cache_recurrent
+    if (recurrent) {
+        seq_rm(-1, -1, -1);
+        return;
+    }
+
+    uint32_t new_head = size;
+
+    for (auto & range : pending.ranges) {
+        for (uint32_t i = range.c0; i < range.c1; ++i) {
+            cells[i].seq_id.clear();
+
+            // keep count of the number of used cells
+            if (cells[i].pos >= 0) {
+                used--;
+            }
+
+            cells[i].pos = -1;
+            cells[i].src = -1;
+        }
+
+        new_head = std::min(new_head, range.c0);
+    }
+
+    if (new_head != size && new_head < head) {
+        head = new_head;
+    }
+}
+
+void llama_kv_cache_unified::commit() {
+    // TODO: tmp - move to llama_kv_cache_recurrent
+    if (recurrent) {
+        return;
+    }
+
+    if (pending.ranges.empty()) {
+        LLAMA_LOG_WARN("%s: no pending KV cache updates to commit - might indicate a bug (ref: %s)\n",
+                __func__, "https://github.com/ggml-org/llama.cpp/pull/12695");
+        return;
+    }
+
+    pending.ranges.clear();
+}
+
+bool llama_kv_cache_unified::get_can_shift() const {
+    return can_shift;
+}
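+
+/*
+ * find_slot: reserve cells for the incoming ubatch.
+ *  - recurrent path: one cell per sequence, gathered into the contiguous
+ *    range [min, max] so the batch can be addressed as head..head+n
+ *  - unified (Transformer) path: linear scan for n_tokens contiguous free
+ *    cells starting at `head`; the reserved range is recorded in `pending`
+ *    so that restore() can roll it back if the batch is aborted
+ */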
+ LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, size); + return false; + } + if (j > 0) { + llama_kv_cell & seq = cells[seq_id]; + if (seq.tail >= 0) { + llama_kv_cell & cell = cells[seq.tail]; + // clear cells from seq_ids that become shared + // (should not normally happen, but let's handle it anyway) + cell.seq_id.erase(seq_id); + seq.tail = -1; + if (cell.seq_id.empty()) { + cell.pos = -1; + cell.src = -1; + used -= 1; + } + } + } + } + } + +#ifndef NDEBUG + { + std::vector tails_verif; + tails_verif.assign(size, -1); + for (uint32_t i = 0; i < size; ++i) { + llama_kv_cell & cell = cells[i]; + for (llama_seq_id seq_id : cell.seq_id) { + if (tails_verif[seq_id] != -1) { + LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tails_verif[seq_id]); + } + tails_verif[seq_id] = i; + } + } + for (uint32_t i = 0; i < size; ++i) { + if (tails_verif[i] != cells[i].tail) { + LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cells[i].tail, tails_verif[i]); + } + } + } +#endif + + // find next empty cell + uint32_t next_empty_cell = head; + + for (uint32_t i = 0; i < size; ++i) { + if (next_empty_cell >= size) { next_empty_cell -= size; } + llama_kv_cell & cell = cells[next_empty_cell]; + if (cell.is_empty()) { break; } + next_empty_cell += 1; + } + + // find usable cell range + for (uint32_t s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + llama_kv_cell & seq_meta = cells[seq_id]; + bool has_cell = false; + if (seq_meta.tail >= 0) { + llama_kv_cell & cell = cells[seq_meta.tail]; + GGML_ASSERT(cell.has_seq_id(seq_id)); + // does this seq_id "own" the cell? + if (cell.seq_id.size() == 1) { has_cell = true; } + } + if (!has_cell) { + llama_kv_cell & empty_cell = cells[next_empty_cell]; + GGML_ASSERT(empty_cell.is_empty()); + // copy old tail into the empty cell + if (seq_meta.tail >= 0) { + llama_kv_cell & orig_cell = cells[seq_meta.tail]; + empty_cell.pos = orig_cell.pos; + empty_cell.src = orig_cell.src; + orig_cell.seq_id.erase(seq_id); + empty_cell.seq_id.insert(seq_id); // will be overwritten + } + seq_meta.tail = next_empty_cell; + // find next empty cell + if (s + 1 < n_seqs) { + next_empty_cell += 1; + for (uint32_t i = 0; i < size; ++i) { + if (next_empty_cell >= size) { next_empty_cell -= size; } + llama_kv_cell & cell = cells[next_empty_cell]; + if (cell.is_empty()) { break; } + next_empty_cell += 1; + } + } + } + if (min > seq_meta.tail) { min = seq_meta.tail; } + if (max < seq_meta.tail) { max = seq_meta.tail; } + } + + // gather and re-order + for (uint32_t s = 0; s < n_seqs; ++s) { + int32_t dst_id = s + min; + int32_t src_id = cells[ubatch.seq_id[s][0]].tail; + if (dst_id != src_id) { + llama_kv_cell & dst_cell = cells[dst_id]; + llama_kv_cell & src_cell = cells[src_id]; + + std::swap(dst_cell.pos, src_cell.pos); + std::swap(dst_cell.src, src_cell.src); + std::swap(dst_cell.seq_id, src_cell.seq_id); + + // swap tails (assuming they NEVER overlap) + for (const llama_seq_id seq_id : src_cell.seq_id) { + cells[seq_id].tail = src_id; + } + for (const llama_seq_id seq_id : dst_cell.seq_id) { + cells[seq_id].tail = dst_id; + } + } + } + + // update the pos of the used seqs + for (uint32_t s = 0; s < n_seqs; ++s) { + const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1]; + int32_t cell_id = s + min; + llama_kv_cell & cell = cells[cell_id]; + + if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) 
{ + // What should happen when the pos backtracks or skips a value? + // Clearing the state mid-batch would require special-casing which isn't done. + LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n", + __func__, last_pos, cell.pos, ubatch.seq_id[s][0], n_seq_tokens); + } + cell.pos = last_pos; + cell.seq_id.clear(); + for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) { + const llama_seq_id seq_id = ubatch.seq_id[s][j]; + cell.seq_id.insert(seq_id); + cells[seq_id].tail = cell_id; + } + } + + // allow getting the range of used cells, from head to head + n + head = min; + n = max - min + 1; + used = std::count_if(cells.begin(), cells.end(), + [](const llama_kv_cell& cell){ return !cell.is_empty(); }); + + // sanity check + return n >= n_seqs; + } + + // otherwise, one cell per token. + + if (n_tokens > size) { + LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %d\n", __func__, n_tokens, size); + return false; + } + + uint32_t n_tested = 0; + + while (true) { + if (head + n_tokens > size) { + n_tested += size - head; + head = 0; + continue; + } + + bool found = true; + for (uint32_t i = 0; i < n_tokens; i++) { + if (cells[head + i].pos >= 0) { + found = false; + head += i + 1; + n_tested += i + 1; + break; + } + } + + if (found) { + break; + } + + if (n_tested >= size) { + //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens); + return false; + } + } + + for (uint32_t s = 0; s < n_seqs; s++) { + for (uint32_t i = 0; i < n_seq_tokens; ++i) { + uint32_t k = s*n_seq_tokens + i; + cells[head + k].pos = ubatch.pos[k]; + + for (int32_t j = 0; j < ubatch.n_seq_id[s]; j++) { + cells[head + k].seq_id.insert(ubatch.seq_id[s][j]); + } + } + } + + used += n_tokens; + + pending.ranges.push_back({head, head + n_tokens}); + + return true; +} + +uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) const { + // the FA kernels require padding to avoid extra runtime boundary checks + return cparams.flash_attn ? 
256u : 32u; +} + +uint32_t llama_kv_cache_unified::cell_max() const { + for (uint32_t i = size; i > 0; --i) { + const llama_kv_cell & cell = cells[i - 1]; + + if (cell.pos >= 0 && !cell.is_empty()) { + return i; + } + } + + return 0; +} + +size_t llama_kv_cache_unified::size_k_bytes() const { + size_t size_k_bytes = 0; + + for (const auto & k : k_l) { + size_k_bytes += ggml_nbytes(k); + } + + return size_k_bytes; +} + +size_t llama_kv_cache_unified::size_v_bytes() const { + size_t size_v_bytes = 0; + + for (const auto & v : v_l) { + size_v_bytes += ggml_nbytes(v); + } + + return size_v_bytes; +} + +bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { + const uint32_t n_layer = hparams.n_layer; + + const uint32_t n_kv = cell_max(); + const uint32_t n_used = used; + + assert(n_used <= n_kv); + + //const int64_t t_start = ggml_time_us(); + + // number of cells moved + uint32_t n_moves = 0; + + // each move requires 6*n_layer tensors (see graph_build_kv_self_defrag) + // - source view, destination view, copy operation + // - x2 for keys and values + //const uint32_t max_moves = max_nodes()/(6*n_layer); + // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516 + const uint32_t max_moves = (n_max_nodes - 2*n_layer)/(6*n_layer); + + // determine which KV cells to move where + // + // cell i moves to ids[i] + // + // if ids[i] == i || ids[i] == n_kv, then cell i is not moved + // + auto & ids = defrag_info.ids; + + ids.clear(); + ids.resize(n_kv, n_kv); + + for (uint32_t i0 = 0; i0 < n_used; ++i0) { + const auto & cell0 = cells[i0]; + + if (!cell0.is_empty()) { + ids[i0] = i0; + + continue; + } + + // found a hole - fill it with data from the end of the cache + + uint32_t nh = 1; + + // determine the size of the hole + while (i0 + nh < n_used && cells[i0 + nh].is_empty()) { + nh++; + } + + uint32_t nf = 0; + uint32_t is = n_kv - 1; + + // starting from the end, find nh non-empty cells + for (; is > i0; --is) { + const auto & cell1 = cells[is]; + + if (cell1.is_empty() || ids[is] != n_kv) { + continue; + } + + // non-empty cell which is not yet moved + nf++; + + if (nf == nh) { + break; + } + } + + // this can only happen if `n_used` is not accurate, which would be a bug + GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh"); + + nf = 0; + + uint32_t i1 = is; + + // are we moving a continuous block of memory? + bool cont = false; + + // should we stop searching for the next move? 
+        bool stop = false;
+
+        // go back and move the nf cells to the hole
+        for (; i1 < n_kv; ++i1) {
+            auto & cell1 = cells[i1];
+
+            if (cell1.is_empty() || ids[i1] != n_kv) {
+                if (n_moves == max_moves) {
+                    stop = true;
+                    break;
+                }
+
+                cont = false;
+                continue;
+            }
+
+            // this cell goes to (i0 + nf)
+            ids[i1] = i0 + nf;
+
+            // move the cell meta data
+            cells[i0 + nf] = cell1;
+
+            // clear the old cell and move the head there
+            cell1 = llama_kv_cell();
+            head = n_used;
+
+            if (!cont) {
+                n_moves++;
+                cont = true;
+            }
+
+            nf++;
+
+            if (nf == nh) {
+                break;
+            }
+        }
+
+        if (stop || n_moves == max_moves) {
+            break;
+        }
+
+        //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
+
+        i0 += nh - 1;
+    }
+
+    if (n_moves == 0) {
+        return false;
+    }
+
+    LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves);
+
+    LLAMA_LOG_DEBUG("expected gf nodes: %u\n", 6*n_moves*n_layer);
+
+    return true;
+}
+
+void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
+    std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
+    uint32_t cell_count = 0;
+
+    // Count the number of cells with the specified seq_id
+    // Find all the ranges of cells with this seq id (or all, when -1)
+    uint32_t cell_range_begin = size;
+    for (uint32_t i = 0; i < size; ++i) {
+        const auto & cell = cells[i];
+        if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) {
+            ++cell_count;
+            if (cell_range_begin == size) {
+                cell_range_begin = i;
+            }
+        } else {
+            if (cell_range_begin != size) {
+                cell_ranges.emplace_back(cell_range_begin, i);
+                cell_range_begin = size;
+            }
+        }
+    }
+    if (cell_range_begin != size) {
+        cell_ranges.emplace_back(cell_range_begin, size);
+    }
+
+    // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
+    uint32_t cell_count_check = 0;
+    for (const auto & range : cell_ranges) {
+        cell_count_check += range.second - range.first;
+    }
+    GGML_ASSERT(cell_count == cell_count_check);
+
+    io.write(&cell_count, sizeof(cell_count));
+
+    state_write_meta(io, cell_ranges, seq_id);
+    state_write_data(io, cell_ranges);
+}
+
+void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
+    uint32_t cell_count;
+    io.read_to(&cell_count, sizeof(cell_count));
+
+    bool res = true;
+    res = res && state_read_meta(io, cell_count, seq_id);
+    res = res && state_read_data(io, cell_count);
+
+    if (!res) {
+        if (seq_id == -1) {
+            clear();
+        } else {
+            seq_rm(seq_id, -1, -1);
+        }
+        throw std::runtime_error("failed to restore kv cache");
+    }
+}
+
+void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id) const {
+    for (const auto & range : cell_ranges) {
+        for (uint32_t i = range.first; i < range.second; ++i) {
+            const auto & cell = cells[i];
+            const llama_pos pos      = cell.pos;
+            const uint32_t  n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0;
+
+            io.write(&pos,      sizeof(pos));
+            io.write(&n_seq_id, sizeof(n_seq_id));
+
+            if (n_seq_id) {
+                for (auto seq_id : cell.seq_id) {
+                    io.write(&seq_id, sizeof(seq_id));
+                }
+            }
+        }
+    }
+}
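+
+/*
+ * Serialized layout (state_write_data and state_read_data must stay in sync):
+ * v_trans flag, n_layer, then per layer: K type + row size + one K row per
+ * cell in each range; V follows the same row layout when not transposed,
+ * otherwise it is stored column-wise (type, element size, n_embd_v_gqa,
+ * then the per-range elements of each row).
+ */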
+void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const {
+    const uint32_t v_trans = this->v_trans ? 1 : 0;
+    const uint32_t n_layer = hparams.n_layer;
+
+    io.write(&v_trans, sizeof(v_trans));
+    io.write(&n_layer, sizeof(n_layer));
+
+    std::vector<uint8_t> tmp_buf;
+
+    // Iterate and write all the keys first, each row is a cell
+    // Get whole range at a time
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
+
+        // Write key type
+        const int32_t k_type_i = (int32_t)k_l[il]->type;
+        io.write(&k_type_i, sizeof(k_type_i));
+
+        // Write row size of key
+        const uint64_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
+        io.write(&k_size_row, sizeof(k_size_row));
+
+        // Read each range of cells of k_size length each into tmp_buf and write out
+        for (const auto & range : cell_ranges) {
+            const size_t range_size = range.second - range.first;
+            const size_t buf_size = range_size * k_size_row;
+            io.write_tensor(k_l[il], range.first * k_size_row, buf_size);
+        }
+    }
+
+    if (!v_trans) {
+        for (uint32_t il = 0; il < n_layer; ++il) {
+            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+
+            // Write value type
+            const int32_t v_type_i = (int32_t)v_l[il]->type;
+            io.write(&v_type_i, sizeof(v_type_i));
+
+            // Write row size of value
+            const uint64_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa);
+            io.write(&v_size_row, sizeof(v_size_row));
+
+            // Read each range of cells of v_size length each into tmp_buf and write out
+            for (const auto & range : cell_ranges) {
+                const size_t range_size = range.second - range.first;
+                const size_t buf_size = range_size * v_size_row;
+                io.write_tensor(v_l[il], range.first * v_size_row, buf_size);
+            }
+        }
+    } else {
+        // When v is transposed, we also need the element size and get the element ranges from each row
+        const uint32_t kv_size = size;
+        for (uint32_t il = 0; il < n_layer; ++il) {
+            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+
+            // Write value type
+            const int32_t v_type_i = (int32_t)v_l[il]->type;
+            io.write(&v_type_i, sizeof(v_type_i));
+
+            // Write element size
+            const uint32_t v_size_el = ggml_type_size(v_l[il]->type);
+            io.write(&v_size_el, sizeof(v_size_el));
+
+            // Write GQA embedding size
+            io.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));
+
+            // For each row, we get the element values of each cell
+            for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+                // Read each range of cells of v_size_el length each into tmp_buf and write out
+                for (const auto & range : cell_ranges) {
+                    const size_t range_size = range.second - range.first;
+                    const size_t src_offset = (range.first + j * kv_size) * v_size_el;
+                    const size_t buf_size = range_size * v_size_el;
+                    io.write_tensor(v_l[il], src_offset, buf_size);
+                }
+            }
+        }
+    }
+}
+
+bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) {
+    if (dest_seq_id != -1) {
+        // single sequence
+
+        seq_rm(dest_seq_id, -1, -1);
+
+        llama_sbatch sbatch;
+        llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false);
+
+        batch.n_tokens = cell_count;
+        batch.n_seq_tokens = cell_count;
+        batch.n_seqs = 1;
+
+        for (uint32_t i = 0; i < cell_count; ++i) {
+            llama_pos pos;
+            uint32_t n_seq_id;
+
+            io.read_to(&pos,      sizeof(pos));
+            io.read_to(&n_seq_id, sizeof(n_seq_id));
+
+            if (n_seq_id != 0) {
+                LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__);
+                return false;
+            }
+
+            batch.pos[i] = pos;
+        }
+        batch.n_seq_id[0] = 1;
+        batch.seq_id[0] = &dest_seq_id;
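+        /*
+         * Restore into one contiguous slot: find_slot() places the whole
+         * cell range at `head`, and commit() accepts the placement by
+         * clearing the pending rollback ranges.
+         */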
+        if (!find_slot(batch)) {
+            LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
+            return false;
+        }
+        commit();
+
+        // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
+        // Assume that this is one contiguous block of cells
+        GGML_ASSERT(head + cell_count <= size);
+        GGML_ASSERT(cells[head].pos == batch.pos[0]);
+        GGML_ASSERT(cells[head + cell_count - 1].pos == batch.pos[cell_count - 1]);
+        GGML_ASSERT(cells[head].has_seq_id(dest_seq_id));
+        GGML_ASSERT(cells[head + cell_count - 1].has_seq_id(dest_seq_id));
+    } else {
+        // whole KV cache restore
+
+        if (cell_count > size) {
+            LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__);
+            return false;
+        }
+
+        clear();
+
+        for (uint32_t i = 0; i < cell_count; ++i) {
+            llama_kv_cell & cell = cells[i];
+
+            llama_pos pos;
+            uint32_t  n_seq_id;
+
+            io.read_to(&pos,      sizeof(pos));
+            io.read_to(&n_seq_id, sizeof(n_seq_id));
+
+            cell.pos = pos;
+
+            for (uint32_t j = 0; j < n_seq_id; ++j) {
+                llama_seq_id seq_id;
+                io.read_to(&seq_id, sizeof(seq_id));
+
+                // TODO: llama_kv_cache_unified should have a notion of max sequences
+                //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) {
+                if (seq_id < 0) {
+                    //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx));
+                    LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id);
+                    return false;
+                }
+
+                cell.seq_id.insert(seq_id);
+
+                if (recurrent) {
+                    int32_t & tail = cells[seq_id].tail;
+                    if (tail != -1) {
+                        LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail);
+                        return false;
+                    }
+                    tail = i;
+                }
+            }
+        }
+
+        head = 0;
+        used = cell_count;
+    }
+
+    if (recurrent) {
+        for (uint32_t i = 0; i < cell_count; ++i) {
+            uint32_t cell_id = head + i;
+            // make sure the recurrent states will keep their restored state
+            cells[cell_id].src = cell_id;
+        }
+    }
+
+    return true;
+}
+
+bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell_count) {
+    uint32_t v_trans;
+    uint32_t n_layer;
+    io.read_to(&v_trans, sizeof(v_trans));
+    io.read_to(&n_layer, sizeof(n_layer));
+
+    if (n_layer != hparams.n_layer) {
+        LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer);
+        return false;
+    }
+    if (cell_count > size) {
+        LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, size);
+        return false;
+    }
+    if (this->v_trans != (bool) v_trans) {
+        LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__);
+        return false;
+    }
+
+    // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
+
+        // Read type of key
+        int32_t k_type_i_ref;
+        io.read_to(&k_type_i_ref, sizeof(k_type_i_ref));
+        const int32_t k_type_i = (int32_t) k_l[il]->type;
+        if (k_type_i != k_type_i_ref) {
+            LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
+            return false;
+        }
+
+        // Read row size of key
+        uint64_t k_size_row_ref;
+        io.read_to(&k_size_row_ref, sizeof(k_size_row_ref));
+        const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
+        if (k_size_row != k_size_row_ref) {
+            LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il);
+            return false;
+        }
+
+        if (cell_count) {
+            // Read
and set the keys for the whole cell range + ggml_backend_tensor_set(k_l[il], io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row); + } + } + + if (!v_trans) { + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Read type of value + int32_t v_type_i_ref; + io.read_to(&v_type_i_ref, sizeof(v_type_i_ref)); + const int32_t v_type_i = (int32_t)v_l[il]->type; + if (v_type_i != v_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); + return false; + } + + // Read row size of value + uint64_t v_size_row_ref; + io.read_to(&v_size_row_ref, sizeof(v_size_row_ref)); + const size_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa); + if (v_size_row != v_size_row_ref) { + LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il); + return false; + } + + if (cell_count) { + // Read and set the values for the whole cell range + ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row); + } + } + } else { + // For each layer, read the values for each cell (transposed) + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Read type of value + int32_t v_type_i_ref; + io.read_to(&v_type_i_ref, sizeof(v_type_i_ref)); + const int32_t v_type_i = (int32_t)v_l[il]->type; + if (v_type_i != v_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); + return false; + } + + // Read element size of value + uint32_t v_size_el_ref; + io.read_to(&v_size_el_ref, sizeof(v_size_el_ref)); + const size_t v_size_el = ggml_type_size(v_l[il]->type); + if (v_size_el != v_size_el_ref) { + LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il); + return false; + } + + // Read GQA embedding size + uint32_t n_embd_v_gqa_ref; + io.read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref)); + if (n_embd_v_gqa != n_embd_v_gqa_ref) { + LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il); + return false; + } + + if (cell_count) { + // For each row in the transposed matrix, read the values for the whole cell range + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + const size_t dst_offset = (head + j * size) * v_size_el; + ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); + } + } + } + } + + return true; +} + +// +// kv cache view +// + +llama_kv_cache_view llama_kv_cache_view_init(const llama_kv_cache & kv, int32_t n_seq_max) { + llama_kv_cache_view result = { + /*.n_cells = */ 0, + /*.n_seq_max = */ n_seq_max, + /*.token_count = */ 0, + /*.used_cells = */ kv.get_used_cells(), + /*.max_contiguous = */ 0, + /*.max_contiguous_idx = */ -1, + /*.cells = */ nullptr, + /*.cells_sequences = */ nullptr, + }; + + return result; +} + +void llama_kv_cache_view_free(llama_kv_cache_view * view) { + if (view->cells != nullptr) { + free(view->cells); + view->cells = nullptr; + } + if (view->cells_sequences != nullptr) { + free(view->cells_sequences); + view->cells_sequences = nullptr; + } +} + +void llama_kv_cache_view_update(llama_kv_cache_view * view, const llama_kv_cache * kv) { + // TODO: rework this in the future, for now 
quick hack
+    const llama_kv_cache_unified * kvu = dynamic_cast<const llama_kv_cache_unified *>(kv);
+    if (kvu == nullptr) {
+        LLAMA_LOG_ERROR("%s: the kv_cache_view currently works only with llama_kv_cache_unified\n", __func__);
+        return;
+    }
+
+    if (uint32_t(view->n_cells) < kvu->size || view->cells == nullptr) {
+        view->n_cells = int32_t(kvu->size);
+        void * p = realloc(view->cells, sizeof(llama_kv_cache_view_cell) * view->n_cells);
+        GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
+        view->cells = (llama_kv_cache_view_cell *)p;
+        p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_seq_max * view->n_cells);
+        GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
+        view->cells_sequences = (llama_seq_id *)p;
+    }
+
+    const std::vector<llama_kv_cell> & kv_cells = kvu->cells;
+    llama_kv_cache_view_cell * c_curr = view->cells;
+    llama_seq_id * cs_curr = view->cells_sequences;
+    int32_t used_cells = 0;
+    int32_t token_count = 0;
+    int32_t curr_contig_idx = -1;
+    uint32_t max_contig = 0;
+    int32_t max_contig_idx = -1;
+
+    for (int32_t i = 0; i < int32_t(kvu->size); i++, c_curr++, cs_curr += view->n_seq_max) {
+        const size_t curr_size = kv_cells[i].seq_id.size();
+        token_count += curr_size;
+        c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
+
+        if (curr_size > 0) {
+            if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) {
+                max_contig = i - curr_contig_idx;
+                max_contig_idx = curr_contig_idx;
+            }
+            curr_contig_idx = -1;
+        } else if (curr_contig_idx < 0) {
+            curr_contig_idx = i;
+        }
+
+        int seq_idx = 0;
+        for (const llama_seq_id it : kv_cells[i].seq_id) {
+            if (seq_idx >= view->n_seq_max) {
+                break;
+            }
+            cs_curr[seq_idx] = it;
+            seq_idx++;
+        }
+        if (seq_idx != 0) {
+            used_cells++;
+        }
+        for (; seq_idx < view->n_seq_max; seq_idx++) {
+            cs_curr[seq_idx] = -1;
+        }
+    }
+    if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
+        max_contig_idx = curr_contig_idx;
+        max_contig = kv_cells.size() - curr_contig_idx;
+    }
+    view->max_contiguous = max_contig;
+    view->max_contiguous_idx = max_contig_idx;
+    view->token_count = token_count;
+    view->used_cells = used_cells;
+    if (uint32_t(used_cells) != kvu->used) {
+        LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
+            __func__, kvu->used, used_cells);
+    }
+}
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index bd9e6da8832b7..2dabef1c9b0c3 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -676,7 +676,7 @@ llama_model_loader::llama_model_loader(
         }
     }
 
-    LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+    //LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
 
     for (int i = 0; i < n_kv; i++) {
         const char * name           = gguf_get_key(meta.get(), i);
@@ -693,7 +693,7 @@ llama_model_loader::llama_model_loader(
         }
         replace_all(value, "\n", "\\n");
 
-        LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
+        //LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
     }
 
     // print type counts
@@ -702,7 +702,7 @@ llama_model_loader::llama_model_loader(
             continue;
         }
 
-        LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
+        //LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
     }
 }
@@ -1153,6 +1153,7 @@ std::string llama_model_loader::ftype_name() const {
 }
 
 void llama_model_loader::print_info() const {
+    return;
     LLAMA_LOG_INFO("%s: file format = %s\n", __func__, llama_file_version_name(fver));
     LLAMA_LOG_INFO("%s: file type   = %s\n", __func__, llama_model_ftype_name(ftype).c_str());
     if (n_bytes < GiB) {
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index a5853f8b12dc0..14f408e7ee8f2 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1560,12 +1560,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
         const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il);
         if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
-            LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
+            //LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
             return {cpu_dev, &pimpl->cpu_buft_list};
         }
         const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
         auto * dev = devices.at(layer_gpu);
-        LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa);
+        //LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa);
         return {dev, &pimpl->gpu_buft_list.at(dev)};
     };
 
@@ -4445,6 +4445,7 @@ uint64_t llama_model::n_elements() const {
 }
 
 void llama_model::print_info() const {
+    return;
     const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);
 
     auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index d90f1d6b1ea63..dc53b918e12c4 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1998,8 +1998,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             } else {
                 // token is control, but not marked as EOG -> print a debug log
                 if (id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && special_eog_ids.count(t.second) == 0) {
-                    LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
-                            __func__, t.second, t.first.c_str());
+                    //LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
+                    //        __func__, t.second, t.first.c_str());
                 }
             }
         }
@@ -2768,6 +2768,7 @@ int32_t llama_vocab::impl::detokenize(
 }
 
 void llama_vocab::impl::print_info() const {
+    return;
     LLAMA_LOG_INFO("%s: vocab type       = %s\n", __func__, type_name().c_str());
     LLAMA_LOG_INFO("%s: n_vocab          = %u\n", __func__, vocab.n_tokens());
     LLAMA_LOG_INFO("%s: n_merges         = %u\n", __func__, (uint32_t) bpe_ranks.size());
@@ -3114,6 +3115,7 @@ std::string llama_vocab::detokenize(const std::vector<llama_token> & tokens, boo
 }
 
 void llama_vocab::print_info() const {
+    return;
     pimpl->print_info();
 }
 
diff --git a/tools/run/run.cpp b/tools/run/run.cpp
index c65afd61e023c..e901eea61ba60 100644
--- a/tools/run/run.cpp
+++ b/tools/run/run.cpp
@@ -984,6 +984,34 @@ static void print_word_and_concatenate_to_response(const std::string & piece, st
     response += piece;
 }
 
+static long long timer_start = 0;
+static long long timer_total = 0;
+static long long timer_count = 0;
+
+static inline void start_timer(void) {
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts); // CLOCK_MONOTONIC: elapsed time, unaffected by wall-clock jumps
+    timer_start = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
+}
+
+static inline void stop_timer(void) {
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts); // CLOCK_MONOTONIC: elapsed time, unaffected by wall-clock jumps
+    long long timer_end = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
+
+    timer_total += (timer_end - timer_start);
+    timer_count += 1;
+}
+
+static void show_timer(void) {
+    double ms = timer_total/1000000.0; // ns -> ms, floating-point division to keep sub-ms precision
+    double itl = ms/timer_count;
+    double speed = 1/itl * 1000;
+
+    printe("LLAMA generate [%9.0f] ms for %4lld invocations | ITL %2.2f ms | throughput = %4.2f t/s\n", ms, timer_count, itl, speed);
+}
+
+
 // helper function to evaluate a prompt and generate a response
 static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) {
     const llama_vocab * vocab = llama_model_get_vocab(llama_data.model.get());
@@ -993,10 +1021,15 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
         return 1;
     }
 
+    int cr = atexit(show_timer);
+    assert(cr == 0);
+
     // prepare a batch for the prompt
     llama_batch batch = llama_batch_get_one(tokens.data(), tokens.size());
     llama_token new_token_id;
+
     while (true) {
+        start_timer();
        check_context_size(llama_data.context, batch);
        if (llama_decode(llama_data.context.get(), batch)) {
            printe("failed to decode\n");
@@ -1018,6 +1051,7 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
 
         // prepare the next batch with the sampled token
         batch = llama_batch_get_one(&new_token_id, 1);
+        stop_timer();
     }
 
     printf(LOG_COL_DEFAULT);