diff --git a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml deleted file mode 100644 index b85bf5741e5a3..0000000000000 --- a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml +++ /dev/null @@ -1,87 +0,0 @@ -name: Bug (compilation) -description: Something goes wrong when trying to compile llama.cpp. -title: "Compile bug: " -labels: ["bug-unconfirmed", "compilation"] -body: - - type: markdown - attributes: - value: > - Thanks for taking the time to fill out this bug report! - This issue template is intended for bug reports where the compilation of llama.cpp fails. - Before opening an issue, please confirm that the compilation still fails with `-DGGML_CCACHE=OFF`. - If the compilation succeeds with ccache disabled you should be able to permanently fix the issue - by clearing `~/.cache/ccache` (on Linux). - - type: textarea - id: commit - attributes: - label: Git commit - description: Which commit are you trying to compile? - placeholder: | - $git rev-parse HEAD - 84a07a17b1b08cf2b9747c633a2372782848a27f - validations: - required: true - - type: dropdown - id: operating-system - attributes: - label: Operating systems - description: Which operating systems do you know to be affected? - multiple: true - options: - - Linux - - Mac - - Windows - - BSD - - Other? (Please let us know in description) - validations: - required: true - - type: dropdown - id: backends - attributes: - label: GGML backends - description: Which GGML backends do you know to be affected? - options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan] - multiple: true - validations: - required: true - - type: textarea - id: info - attributes: - label: Problem description & steps to reproduce - description: > - Please give us a summary of the problem and tell us how to reproduce it. - If you can narrow down the bug to specific compile flags, that information would be very much appreciated by us. - placeholder: > - I'm trying to compile llama.cpp with CUDA support on a fresh install of Ubuntu and get error XY. - Here are the exact commands that I used: ... - validations: - required: true - - type: textarea - id: first_bad_commit - attributes: - label: First Bad Commit - description: > - If the bug was not present on an earlier version: when did it start appearing? - If possible, please do a git bisect and identify the exact commit that introduced the bug. - validations: - required: false - - type: textarea - id: command - attributes: - label: Compile command - description: > - Please provide the exact command you used to compile llama.cpp. For example: `cmake -B ...`. - This will be automatically formatted into code, so no need for backticks. - render: shell - validations: - required: true - - type: textarea - id: logs - attributes: - label: Relevant log output - description: > - Please copy and paste any relevant log output, including any generated text. - This will be automatically formatted into code, so no need for backticks. - render: shell - validations: - required: true diff --git a/.github/ISSUE_TEMPLATE/011-bug-results.yml b/.github/ISSUE_TEMPLATE/011-bug-results.yml deleted file mode 100644 index 1ccef0793d45e..0000000000000 --- a/.github/ISSUE_TEMPLATE/011-bug-results.yml +++ /dev/null @@ -1,101 +0,0 @@ -name: Bug (model use) -description: Something goes wrong when using a model (in general, not specific to a single llama.cpp module). 
-title: "Eval bug: " -labels: ["bug-unconfirmed", "model evaluation"] -body: - - type: markdown - attributes: - value: > - Thanks for taking the time to fill out this bug report! - This issue template is intended for bug reports where the model evaluation results - (i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation. - If you encountered the issue while using an external UI (e.g. ollama), - please reproduce your issue using one of the examples/binaries in this repository. - The `llama-cli` binary can be used for simple and reproducible model inference. - - type: textarea - id: version - attributes: - label: Name and Version - description: Which version of our software are you running? (use `--version` to get a version string) - placeholder: | - $./llama-cli --version - version: 2999 (42b4109e) - built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu - validations: - required: true - - type: dropdown - id: operating-system - attributes: - label: Operating systems - description: Which operating systems do you know to be affected? - multiple: true - options: - - Linux - - Mac - - Windows - - BSD - - Other? (Please let us know in description) - validations: - required: true - - type: dropdown - id: backends - attributes: - label: GGML backends - description: Which GGML backends do you know to be affected? - options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan] - multiple: true - validations: - required: true - - type: textarea - id: hardware - attributes: - label: Hardware - description: Which CPUs/GPUs are you using? - placeholder: > - e.g. Ryzen 5950X + 2x RTX 4090 - validations: - required: true - - type: textarea - id: model - attributes: - label: Models - description: > - Which model(s) at which quantization were you using when encountering the bug? - If you downloaded a GGUF file off of Huggingface, please provide a link. - placeholder: > - e.g. Meta LLaMA 3.1 Instruct 8b q4_K_M - validations: - required: false - - type: textarea - id: info - attributes: - label: Problem description & steps to reproduce - description: > - Please give us a summary of the problem and tell us how to reproduce it. - If you can narrow down the bug to specific hardware, compile flags, or command line arguments, - that information would be very much appreciated by us. - placeholder: > - e.g. when I run llama-cli with -ngl 99 I get garbled outputs. - When I use -ngl 0 it works correctly. - Here are the exact commands that I used: ... - validations: - required: true - - type: textarea - id: first_bad_commit - attributes: - label: First Bad Commit - description: > - If the bug was not present on an earlier version: when did it start appearing? - If possible, please do a git bisect and identify the exact commit that introduced the bug. - validations: - required: false - - type: textarea - id: logs - attributes: - label: Relevant log output - description: > - Please copy and paste any relevant log output, including the command that you entered and any generated text. - This will be automatically formatted into code, so no need for backticks. - render: shell - validations: - required: true diff --git a/.github/ISSUE_TEMPLATE/019-bug-misc.yml b/.github/ISSUE_TEMPLATE/019-bug-misc.yml deleted file mode 100644 index 1904e31fdc436..0000000000000 --- a/.github/ISSUE_TEMPLATE/019-bug-misc.yml +++ /dev/null @@ -1,91 +0,0 @@ -name: Bug (misc.) -description: Something is not working the way it should (and it's not covered by any of the above cases). 
-title: "Misc. bug: " -labels: ["bug-unconfirmed"] -body: - - type: markdown - attributes: - value: > - Thanks for taking the time to fill out this bug report! - This issue template is intended for miscellaneous bugs that don't fit into any other category. - If you encountered the issue while using an external UI (e.g. ollama), - please reproduce your issue using one of the examples/binaries in this repository. - - type: textarea - id: version - attributes: - label: Name and Version - description: Which version of our software is affected? (You can use `--version` to get a version string.) - placeholder: | - $./llama-cli --version - version: 2999 (42b4109e) - built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu - validations: - required: true - - type: dropdown - id: operating-system - attributes: - label: Operating systems - description: Which operating systems do you know to be affected? - multiple: true - options: - - Linux - - Mac - - Windows - - BSD - - Other? (Please let us know in description) - validations: - required: false - - type: dropdown - id: module - attributes: - label: Which llama.cpp modules do you know to be affected? - multiple: true - options: - - Documentation/Github - - libllama (core library) - - llama-cli - - llama-server - - llama-bench - - llama-quantize - - Python/Bash scripts - - Test code - - Other (Please specify in the next section) - validations: - required: false - - type: textarea - id: command - attributes: - label: Command line - description: > - Please provide the exact commands you entered, if applicable. For example: `llama-server -m ... -c ...`, `llama-cli -m ...`, etc. - This will be automatically formatted into code, so no need for backticks. - render: shell - validations: - required: false - - type: textarea - id: info - attributes: - label: Problem description & steps to reproduce - description: > - Please give us a summary of the problem and tell us how to reproduce it (if applicable). - validations: - required: true - - type: textarea - id: first_bad_commit - attributes: - label: First Bad Commit - description: > - If the bug was not present on an earlier version and it's not trivial to track down: when did it start appearing? - If possible, please do a git bisect and identify the exact commit that introduced the bug. - validations: - required: false - - type: textarea - id: logs - attributes: - label: Relevant log output - description: > - If applicable, please copy and paste any relevant log output, including any generated text. - This will be automatically formatted into code, so no need for backticks. - render: shell - validations: - required: false diff --git a/.github/ISSUE_TEMPLATE/020-enhancement.yml b/.github/ISSUE_TEMPLATE/020-enhancement.yml deleted file mode 100644 index cee1446f5a097..0000000000000 --- a/.github/ISSUE_TEMPLATE/020-enhancement.yml +++ /dev/null @@ -1,51 +0,0 @@ -name: Enhancement -description: Used to request enhancements for llama.cpp. -title: "Feature Request: " -labels: ["enhancement"] -body: - - type: markdown - attributes: - value: | - [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggml-org/llama.cpp/discussions/categories/ideas) - - - type: checkboxes - id: prerequisites - attributes: - label: Prerequisites - description: Please confirm the following before submitting your enhancement request. 
- options: - - label: I am running the latest code. Mention the version if possible as well. - required: true - - label: I carefully followed the [README.md](https://github.com/ggml-org/llama.cpp/blob/master/README.md). - required: true - - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed). - required: true - - label: I reviewed the [Discussions](https://github.com/ggml-org/llama.cpp/discussions), and have a new and useful enhancement to share. - required: true - - - type: textarea - id: feature-description - attributes: - label: Feature Description - description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement. - placeholder: Detailed description of the enhancement - validations: - required: true - - - type: textarea - id: motivation - attributes: - label: Motivation - description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users. - placeholder: Explanation of why this feature is needed and its benefits - validations: - required: true - - - type: textarea - id: possible-implementation - attributes: - label: Possible Implementation - description: If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better. - placeholder: Detailed description of potential implementation - validations: - required: false diff --git a/.github/ISSUE_TEMPLATE/030-research.yml b/.github/ISSUE_TEMPLATE/030-research.yml deleted file mode 100644 index e774550d5908c..0000000000000 --- a/.github/ISSUE_TEMPLATE/030-research.yml +++ /dev/null @@ -1,52 +0,0 @@ -name: Research -description: Track new technical research area. -title: "Research: " -labels: ["research 🔬"] -body: - - type: markdown - attributes: - value: | - Don't forget to check for any [duplicate research issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22) - - - type: checkboxes - id: research-stage - attributes: - label: Research Stage - description: Track general state of this research ticket - options: - - label: Background Research (Let's try to avoid reinventing the wheel) - - label: Hypothesis Formed (How do you think this will work and it's effect?) - - label: Strategy / Implementation Forming - - label: Analysis of results - - label: Debrief / Documentation (So people in the future can learn from us) - - - type: textarea - id: background - attributes: - label: Previous existing literature and research - description: Whats the current state of the art and whats the motivation for this research? - - - type: textarea - id: hypothesis - attributes: - label: Hypothesis - description: How do you think this will work and it's effect? - - - type: textarea - id: implementation - attributes: - label: Implementation - description: Got an approach? e.g. a PR ready to go? - - - type: textarea - id: analysis - attributes: - label: Analysis - description: How does the proposed implementation behave? - - - type: textarea - id: logs - attributes: - label: Relevant log output - description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. 
- render: shell diff --git a/.github/ISSUE_TEMPLATE/040-refactor.yml b/.github/ISSUE_TEMPLATE/040-refactor.yml deleted file mode 100644 index 2fe94e26c6988..0000000000000 --- a/.github/ISSUE_TEMPLATE/040-refactor.yml +++ /dev/null @@ -1,28 +0,0 @@ -name: Refactor (Maintainers) -description: Used to track refactoring opportunities. -title: "Refactor: " -labels: ["refactor"] -body: - - type: markdown - attributes: - value: | - Don't forget to [check for existing refactor issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered. - Also you may want to check [Pull request refactor label as well](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too. - - - type: textarea - id: background-description - attributes: - label: Background Description - description: Please provide a detailed written description of the pain points you are trying to solve. - placeholder: Detailed description behind your motivation to request refactor - validations: - required: true - - - type: textarea - id: possible-approaches - attributes: - label: Possible Refactor Approaches - description: If you have some idea of possible approaches to solve this problem. You may want to make it a todo list. - placeholder: Your idea of possible refactoring opportunity/approaches - validations: - required: false diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml deleted file mode 100644 index 0d246533c9515..0000000000000 --- a/.github/ISSUE_TEMPLATE/config.yml +++ /dev/null @@ -1,11 +0,0 @@ -blank_issues_enabled: true -contact_links: - - name: Got an idea? - url: https://github.com/ggml-org/llama.cpp/discussions/categories/ideas - about: Pop it there. It may then become an enhancement ticket. - - name: Got a question? - url: https://github.com/ggml-org/llama.cpp/discussions/categories/q-a - about: Ask a question there! - - name: Want to contribute? 
- url: https://github.com/ggml-org/llama.cpp/wiki/contribute - about: Head to the contribution guide page of the wiki for areas you can help with diff --git a/.github/actions/get-tag-name/action.yml b/.github/actions/get-tag-name/action.yml deleted file mode 100644 index 7ace23b2a3e76..0000000000000 --- a/.github/actions/get-tag-name/action.yml +++ /dev/null @@ -1,22 +0,0 @@ -name: "Determine tag name" -description: "Determine the tag name to use for a release" -outputs: - name: - description: "The name of the tag" - value: ${{ steps.tag.outputs.name }} - -runs: - using: "composite" - steps: - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi diff --git a/.github/actions/windows-setup-cuda/action.yml b/.github/actions/windows-setup-cuda/action.yml deleted file mode 100644 index 5575caeca31a2..0000000000000 --- a/.github/actions/windows-setup-cuda/action.yml +++ /dev/null @@ -1,67 +0,0 @@ -name: "Windows - Setup CUDA Toolkit" -description: "Setup CUDA Toolkit for Windows" -inputs: - cuda_version: - description: "CUDA toolkit version" - required: true - -runs: - using: "composite" - steps: - - name: Install Cuda Toolkit 11.7 - if: ${{ inputs.cuda_version == '11.7' }} - shell: pwsh - run: | - mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" - choco install unzip -y - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip" - unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" 
"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 - echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 - - - name: Install Cuda Toolkit 12.4 - if: ${{ inputs.cuda_version == '12.4' }} - shell: pwsh - run: | - mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" - choco install unzip -y - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip" - unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU 
Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 - echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 diff --git a/.github/actions/windows-setup-curl/action.yml b/.github/actions/windows-setup-curl/action.yml deleted file mode 100644 index 446f799fac34a..0000000000000 --- a/.github/actions/windows-setup-curl/action.yml +++ /dev/null @@ -1,30 +0,0 @@ -name: 'Windows - Setup CURL' -description: 'Composite action, to be reused in other workflow' -inputs: - curl_version: - description: 'CURL version' - required: false - default: '8.6.0_6' - architecture: - description: 'Architecture of the libcurl to download' - required: false - default: 'win64' -outputs: - curl_path: - description: "Path to the downloaded libcurl" - value: ${{ steps.get_libcurl.outputs.curl_path }} - -runs: - using: "composite" - steps: - - name: libCURL - id: get_libcurl - shell: powershell - env: - CURL_VERSION: ${{ inputs.curl_version }} - ARCHITECTURE: ${{ inputs.architecture }} - run: | - curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-${env:ARCHITECTURE}-mingw.zip" - mkdir $env:RUNNER_TEMP/libcurl - tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl - echo "curl_path=$env:RUNNER_TEMP/libcurl" >> $env:GITHUB_OUTPUT diff --git a/.github/labeler.yml b/.github/labeler.yml deleted file mode 100644 index 3c2f67707b024..0000000000000 --- a/.github/labeler.yml +++ /dev/null @@ -1,95 +0,0 @@ -# https://github.com/actions/labeler -Kompute: - - changed-files: - - any-glob-to-any-file: - - ggml/include/ggml-kompute.h - - ggml/src/ggml-kompute/** - - README-kompute.md -Apple Metal: - - changed-files: - - any-glob-to-any-file: - - ggml/include/ggml-metal.h - - ggml/src/ggml-metal/** - - README-metal.md -SYCL: - - changed-files: - - any-glob-to-any-file: - - ggml/include/ggml-sycl.h - - ggml/src/ggml-sycl/** - - docs/backend/SYCL.md - - examples/sycl/** -Nvidia GPU: - - changed-files: - - any-glob-to-any-file: - - ggml/include/ggml-cuda.h 
- - ggml/src/ggml-cuda/** -Vulkan: - - changed-files: - - any-glob-to-any-file: - - ggml/include/ggml-vulkan.h - - ggml/src/ggml-vulkan/** -documentation: - - changed-files: - - any-glob-to-any-file: - - docs/** - - media/** -testing: - - changed-files: - - any-glob-to-any-file: - - tests/** -build: - - changed-files: - - any-glob-to-any-file: - - cmake/** - - CMakeLists.txt - - CMakePresets.json -examples: - - changed-files: - - any-glob-to-any-file: - - examples/** - - tools/** -devops: - - changed-files: - - any-glob-to-any-file: - - .devops/** - - .github/** - - ci/** -python: - - changed-files: - - any-glob-to-any-file: - - "**/*.py" - - requirements/** - - gguf-py/** - - .flake8 -script: - - changed-files: - - any-glob-to-any-file: - - scripts/** -android: - - changed-files: - - any-glob-to-any-file: - - examples/llama.android/** -server: - - changed-files: - - any-glob-to-any-file: - - tools/server/** -ggml: - - changed-files: - - any-glob-to-any-file: - - ggml/** -nix: - - changed-files: - - any-glob-to-any-file: - - "**/*.nix" - - .github/workflows/nix-*.yml - - .devops/nix/nixpkgs-instances.nix -embedding: - - changed-files: - - any-glob-to-any-file: examples/embedding/ - -Ascend NPU: - - changed-files: - - any-glob-to-any-file: - - ggml/include/ggml-cann.h - - ggml/src/ggml-cann/** - - docs/backend/CANN.md diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md deleted file mode 100644 index d0bdd73c4439c..0000000000000 --- a/.github/pull_request_template.md +++ /dev/null @@ -1 +0,0 @@ -*Make sure to read the [contributing guidelines](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR* diff --git a/.github/workflows/bench.yml.disabled b/.github/workflows/bench.yml.disabled deleted file mode 100644 index f2d7e16e981ac..0000000000000 --- a/.github/workflows/bench.yml.disabled +++ /dev/null @@ -1,304 +0,0 @@ -# TODO: there have been some issues with the workflow, so disabling for now -# https://github.com/ggml-org/llama.cpp/issues/7893 -# -# Benchmark -name: Benchmark - -on: - workflow_dispatch: - inputs: - gpu-series: - description: 'Azure GPU series to run with' - required: true - type: choice - options: - - Standard_NC4as_T4_v3 - - Standard_NC24ads_A100_v4 - - Standard_NC80adis_H100_v5 - sha: - description: 'Commit SHA1 to build' - required: false - type: string - duration: - description: 'Duration of the bench' - type: string - default: 10m - - push: - branches: - - master - paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'tools/server/*.h*', 'tools/server/*.cpp'] - pull_request_target: - types: [opened, synchronize, reopened] - paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'tools/server/*.h*', 'tools/server/*.cpp'] - schedule: - - cron: '04 2 * * *' - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }} - cancel-in-progress: true - -jobs: - bench-server-baseline: - runs-on: Standard_NC4as_T4_v3 - env: - RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it - N_USERS: 8 - DURATION: 10m - - strategy: - matrix: - model: [phi-2] - ftype: [q4_0, q8_0, f16] - include: - - model: phi-2 - ftype: q4_0 - pr_comment_enabled: "true" - - if: | - inputs.gpu-series == 'Standard_NC4as_T4_v3' - || github.event_name == 'pull_request_target' - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - ref: ${{ 
github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - - - name: Install python env - id: pipenv - run: | - cd tools/server/bench - python3 -m venv venv - source venv/bin/activate - pip install -r requirements.txt - - - name: Prometheus - id: install_prometheus - run: | - wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz - tar xzf prometheus*.tar.gz --strip-components=1 - ./prometheus --config.file=tools/server/bench/prometheus.yml & - while ! nc -z localhost 9090; do - sleep 0.1 - done - - - name: Set up Go - uses: actions/setup-go@v5 - with: - go-version: '1.21' - - - name: Install k6 and xk6-sse - id: k6_installation - run: | - cd tools/server/bench - go install go.k6.io/xk6/cmd/xk6@latest - xk6 build master \ - --with github.com/phymbert/xk6-sse - - - name: Build - id: cmake_build - run: | - set -eux - cmake -B build \ - -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DLLAMA_CUBLAS=ON \ - -DCUDAToolkit_ROOT=/usr/local/cuda \ - -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \ - -DCMAKE_CUDA_ARCHITECTURES=75 \ - -DLLAMA_FATAL_WARNINGS=OFF \ - -DLLAMA_ALL_WARNINGS=OFF \ - -DCMAKE_BUILD_TYPE=Release; - cmake --build build --config Release -j $(nproc) --target llama-server - - - name: Download the dataset - id: download_dataset - run: | - cd tools/server/bench - wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json - - - name: Server bench - id: server_bench - env: - HEAD_REF: ${{ github.head_ref || github.ref_name }} - run: | - set -eux - - cd tools/server/bench - source venv/bin/activate - python bench.py \ - --runner-label ${{ env.RUNNER_LABEL }} \ - --name ${{ github.job }} \ - --branch $HEAD_REF \ - --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \ - --scenario script.js \ - --duration ${{ github.event.inputs.duration || env.DURATION }} \ - --hf-repo ggml-org/models \ - --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \ - --model-path-prefix /models \ - --parallel ${{ env.N_USERS }} \ - -ngl 33 \ - --batch-size 2048 \ - --ubatch-size 256 \ - --ctx-size 16384 \ - --n-prompts 1000 \ - --max-prompt-tokens 1024 \ - --max-tokens 2048 - - cat results.github.env >> $GITHUB_ENV - - # Remove dataset as we do not want it in the artefact - rm ShareGPT_V3_unfiltered_cleaned_split.json - - - uses: actions/upload-artifact@v4 - with: - name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }} - compression-level: 9 - path: | - tools/server/bench/*.jpg - tools/server/bench/*.json - tools/server/bench/*.log - - - name: Commit status - uses: Sibz/github-status-action@v1 - with: - authToken: ${{secrets.GITHUB_TOKEN}} - sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }} - context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }} - description: | - ${{ env.BENCH_RESULTS }} - state: 'success' - - - name: Upload benchmark images - uses: devicons/public-upload-to-imgur@v2.2.2 - continue-on-error: true # Important as it looks unstable: 503 - id: imgur_step - with: - client_id: ${{secrets.IMGUR_CLIENT_ID}} - path: | - tools/server/bench/prompt_tokens_seconds.jpg - tools/server/bench/predicted_tokens_seconds.jpg - tools/server/bench/kv_cache_usage_ratio.jpg - tools/server/bench/requests_processing.jpg - - - name: Extract 
mermaid - id: set_mermaid - run: | - set -eux - - cd tools/server/bench - PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid) - echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV - echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV - echo "EOF" >> $GITHUB_ENV - - PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid) - echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV - echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV - echo "EOF" >> $GITHUB_ENV - - KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid) - echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV - echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV - echo "EOF" >> $GITHUB_ENV - - REQUESTS_PROCESSING=$(cat requests_processing.mermaid) - echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV - echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV - echo "EOF" >> $GITHUB_ENV - - - name: Extract image url - id: extract_image_url - continue-on-error: true - run: | - set -eux - - echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV - echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV - echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV - echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV - - - name: Comment PR - uses: mshick/add-pr-comment@v2 - id: comment_pr - if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }} - with: - message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }} - message: | - <p align="center">
- - 📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀 - - </p> - - <details> - - <summary>Expand details for performance related PR only</summary> - - - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }} - - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }} - - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s - - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s - - ${{ env.BENCH_GRAPH_XLABEL }} - - - <p align="center"> - - <img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" /> - - <details> - - <summary>More</summary> - - ```mermaid - ${{ env.PROMPT_TOKENS_SECONDS }} - ``` - - </details> - - <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds" /> - - <details> - <summary>More</summary> - - ```mermaid - ${{ env.PREDICTED_TOKENS_SECONDS }} - ``` - - </details> - - </p> - - <details> - - <summary>Details</summary> - - <p align="center"> - - <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" /> - - <details> - <summary>More</summary> - - ```mermaid - ${{ env.KV_CACHE_USAGE_RATIO }} - ``` - - </details> - - <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing" /> - - <details> - <summary>More</summary> - - ```mermaid - ${{ env.REQUESTS_PROCESSING }} - ``` - - </details> - - </p> - </details> - </details>
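Aside on the `set_mermaid` step above: `$GITHUB_ENV` is parsed line by line, so a multiline value (such as a mermaid chart) has to be written using the `NAME<<DELIMITER` heredoc-style syntax that the step uses, which is why each chart takes three `echo`/`cat` lines. A minimal standalone sketch of the pattern, assuming a hypothetical variable `MY_CHART` and file `chart.mermaid` (neither name is from the workflow):

```bash
#!/usr/bin/env bash
# Export a multiline file as an environment variable for later workflow steps.
# GITHUB_ENV is read line by line; "NAME<<EOF ... EOF" delimits a multiline value.
set -eu

{
  echo "MY_CHART<<EOF"   # open the multiline block
  cat chart.mermaid      # the multiline payload
  echo "EOF"             # close the block
} >> "$GITHUB_ENV"
```

Grouping the three writes with `{ ... } >> "$GITHUB_ENV"` is equivalent to the separate `>>` redirections in the step above; the delimiter just needs to be a line that cannot appear inside the payload itself.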
diff --git a/.github/workflows/build-linux-cross.yml b/.github/workflows/build-linux-cross.yml deleted file mode 100644 index 7cfc82ba4e277..0000000000000 --- a/.github/workflows/build-linux-cross.yml +++ /dev/null @@ -1,346 +0,0 @@ -name: Build on Linux using cross-compiler -on: - workflow_dispatch: - workflow_call: - -jobs: - ubuntu-24-riscv64-cpu-cross: - runs-on: ubuntu-24.04 - - steps: - - uses: actions/checkout@v4 - - name: Setup Riscv - run: | - sudo dpkg --add-architecture riscv64 - - # Add arch-specific repositories for non-amd64 architectures - cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list - deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe - deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe - deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe - deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe - EOF - - sudo apt-get update || true ;# Prevent failure due to missing URLs. - - sudo apt-get install -y --no-install-recommends \ - build-essential \ - gcc-14-riscv64-linux-gnu \ - g++-14-riscv64-linux-gnu - - - name: Build - run: | - cmake -B build -DLLAMA_CURL=OFF \ - -DCMAKE_BUILD_TYPE=Release \ - -DGGML_OPENMP=OFF \ - -DLLAMA_BUILD_EXAMPLES=ON \ - -DLLAMA_BUILD_TOOLS=ON \ - -DLLAMA_BUILD_TESTS=OFF \ - -DCMAKE_SYSTEM_NAME=Linux \ - -DCMAKE_SYSTEM_PROCESSOR=riscv64 \ - -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \ - -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \ - -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ - -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \ - -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ - -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ - -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH - - cmake --build build --config Release -j $(nproc) - - ubuntu-24-riscv64-vulkan-cross: - runs-on: ubuntu-24.04 - - steps: - - uses: actions/checkout@v4 - - name: Setup Riscv - run: | - sudo dpkg --add-architecture riscv64 - - # Add arch-specific repositories for non-amd64 architectures - cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list - deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe - deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe - deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe - deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe - EOF - - sudo apt-get update || true ;# Prevent failure due to missing URLs. 
- - sudo apt-get install -y --no-install-recommends \ - build-essential \ - glslc \ - gcc-14-riscv64-linux-gnu \ - g++-14-riscv64-linux-gnu \ - libvulkan-dev:riscv64 - - - name: Build - run: | - cmake -B build -DLLAMA_CURL=OFF \ - -DCMAKE_BUILD_TYPE=Release \ - -DGGML_VULKAN=ON \ - -DGGML_OPENMP=OFF \ - -DLLAMA_BUILD_EXAMPLES=ON \ - -DLLAMA_BUILD_TOOLS=ON \ - -DLLAMA_BUILD_TESTS=OFF \ - -DCMAKE_SYSTEM_NAME=Linux \ - -DCMAKE_SYSTEM_PROCESSOR=riscv64 \ - -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \ - -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \ - -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ - -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \ - -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ - -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ - -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH - - cmake --build build --config Release -j $(nproc) - - ubuntu-24-arm64-vulkan-cross: - runs-on: ubuntu-24.04 - - steps: - - uses: actions/checkout@v4 - - name: Setup Arm64 - run: | - sudo dpkg --add-architecture arm64 - - # Add arch-specific repositories for non-amd64 architectures - cat << EOF | sudo tee /etc/apt/sources.list.d/arm64-ports.list - deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe - deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe - deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe - deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe - EOF - - sudo apt-get update || true ;# Prevent failure due to missing URLs. - - sudo apt-get install -y --no-install-recommends \ - build-essential \ - glslc \ - crossbuild-essential-arm64 \ - libvulkan-dev:arm64 - - - name: Build - run: | - cmake -B build -DLLAMA_CURL=OFF \ - -DCMAKE_BUILD_TYPE=Release \ - -DGGML_VULKAN=ON \ - -DGGML_OPENMP=OFF \ - -DLLAMA_BUILD_EXAMPLES=ON \ - -DLLAMA_BUILD_TOOLS=ON \ - -DLLAMA_BUILD_TESTS=OFF \ - -DCMAKE_SYSTEM_NAME=Linux \ - -DCMAKE_SYSTEM_PROCESSOR=aarch64 \ - -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc \ - -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ \ - -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ - -DCMAKE_FIND_ROOT_PATH=/usr/lib/aarch64-linux-gnu \ - -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ - -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ - -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH - - cmake --build build --config Release -j $(nproc) - - ubuntu-24-ppc64el-cpu-cross: - runs-on: ubuntu-24.04 - - steps: - - uses: actions/checkout@v4 - - name: Setup PowerPC64le - run: | - sudo dpkg --add-architecture ppc64el - - # Add arch-specific repositories for non-amd64 architectures - cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list - deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe - deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe - deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe - deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe - EOF - - sudo apt-get update || true ;# Prevent failure due to missing URLs. 
- - sudo apt-get install -y --no-install-recommends \ - build-essential \ - gcc-14-powerpc64le-linux-gnu \ - g++-14-powerpc64le-linux-gnu - - - name: Build - run: | - cmake -B build -DLLAMA_CURL=OFF \ - -DCMAKE_BUILD_TYPE=Release \ - -DGGML_OPENMP=OFF \ - -DLLAMA_BUILD_EXAMPLES=ON \ - -DLLAMA_BUILD_TOOLS=ON \ - -DLLAMA_BUILD_TESTS=OFF \ - -DCMAKE_SYSTEM_NAME=Linux \ - -DCMAKE_SYSTEM_PROCESSOR=ppc64 \ - -DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \ - -DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \ - -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ - -DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \ - -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ - -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ - -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH - - cmake --build build --config Release -j $(nproc) - - ubuntu-24-ppc64el-vulkan-cross: - runs-on: ubuntu-24.04 - - steps: - - uses: actions/checkout@v4 - - name: Setup PowerPC64le - run: | - sudo dpkg --add-architecture ppc64el - - # Add arch-specific repositories for non-amd64 architectures - cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list - deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe - deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe - deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe - deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe - EOF - - sudo apt-get update || true ;# Prevent failure due to missing URLs. - - sudo apt-get install -y --no-install-recommends \ - build-essential \ - glslc \ - gcc-14-powerpc64le-linux-gnu \ - g++-14-powerpc64le-linux-gnu \ - libvulkan-dev:ppc64el - - - name: Build - run: | - cmake -B build -DLLAMA_CURL=OFF \ - -DCMAKE_BUILD_TYPE=Release \ - -DGGML_VULKAN=ON \ - -DGGML_OPENMP=OFF \ - -DLLAMA_BUILD_EXAMPLES=ON \ - -DLLAMA_BUILD_TOOLS=ON \ - -DLLAMA_BUILD_TESTS=OFF \ - -DCMAKE_SYSTEM_NAME=Linux \ - -DCMAKE_SYSTEM_PROCESSOR=ppc64 \ - -DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \ - -DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \ - -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ - -DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \ - -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ - -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ - -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH - - cmake --build build --config Release -j $(nproc) - - debian-13-loongarch64-cpu-cross: - runs-on: ubuntu-24.04 - container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671 - - steps: - - uses: actions/checkout@v4 - - name: Setup LoongArch - run: | - rm -f /etc/apt/sources.list.d/* - cat << EOF | tee /etc/apt/sources.list.d/debian-ports.list - deb http://snapshot.debian.org/archive/debian/20250515T202920Z/ trixie main - EOF - ( echo 'quiet "true";'; \ - echo 'APT::Get::Assume-Yes "true";'; \ - echo 'APT::Install-Recommends "false";'; \ - echo 'Acquire::Check-Valid-Until "false";'; \ - echo 'Acquire::Retries "5";'; \ - ) > /etc/apt/apt.conf.d/99snapshot-repos - - apt-get update - apt-get install -y ca-certificates debian-ports-archive-keyring cmake git zip - dpkg --add-architecture loong64 - - # Add arch-specific repositories for non-amd64 architectures - cat << EOF | tee /etc/apt/sources.list.d/loong64-ports.list - deb [arch=loong64] http://snapshot.debian.org/archive/debian-ports/20250515T194251Z/ sid main - EOF - - apt-get update || true ;# Prevent failure due to missing URLs. 
- - apt-get install -y --no-install-recommends \ - build-essential \ - gcc-14-loongarch64-linux-gnu \ - g++-14-loongarch64-linux-gnu - - - name: Build - run: | - cmake -B build -DLLAMA_CURL=OFF \ - -DCMAKE_BUILD_TYPE=Release \ - -DGGML_OPENMP=OFF \ - -DLLAMA_BUILD_EXAMPLES=ON \ - -DLLAMA_BUILD_TOOLS=ON \ - -DLLAMA_BUILD_TESTS=OFF \ - -DCMAKE_SYSTEM_NAME=Linux \ - -DCMAKE_SYSTEM_PROCESSOR=loongarch64 \ - -DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \ - -DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \ - -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ - -DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \ - -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ - -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ - -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH - - cmake --build build --config Release -j $(nproc) - - debian-13-loongarch64-vulkan-cross: - runs-on: ubuntu-24.04 - container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671 - - steps: - - uses: actions/checkout@v4 - - name: Setup LoongArch - run: | - rm -f /etc/apt/sources.list.d/* - cat << EOF | tee /etc/apt/sources.list.d/debian-ports.list - deb http://snapshot.debian.org/archive/debian/20250515T202920Z/ trixie main - EOF - ( echo 'quiet "true";'; \ - echo 'APT::Get::Assume-Yes "true";'; \ - echo 'APT::Install-Recommends "false";'; \ - echo 'Acquire::Check-Valid-Until "false";'; \ - echo 'Acquire::Retries "5";'; \ - ) > /etc/apt/apt.conf.d/99snapshot-repos - - apt-get update - apt-get install -y ca-certificates debian-ports-archive-keyring cmake git zip - dpkg --add-architecture loong64 - - # Add arch-specific repositories for non-amd64 architectures - cat << EOF | tee /etc/apt/sources.list.d/loong64-ports.list - deb [arch=loong64] http://snapshot.debian.org/archive/debian-ports/20250515T194251Z/ sid main - EOF - - apt-get update || true ;# Prevent failure due to missing URLs. 
- - apt-get install -y --no-install-recommends \ - build-essential \ - glslc \ - gcc-14-loongarch64-linux-gnu \ - g++-14-loongarch64-linux-gnu \ - libvulkan-dev:loong64 - - - name: Build - run: | - cmake -B build -DLLAMA_CURL=OFF \ - -DCMAKE_BUILD_TYPE=Release \ - -DGGML_VULKAN=ON \ - -DGGML_OPENMP=OFF \ - -DLLAMA_BUILD_EXAMPLES=ON \ - -DLLAMA_BUILD_TOOLS=ON \ - -DLLAMA_BUILD_TESTS=OFF \ - -DCMAKE_SYSTEM_NAME=Linux \ - -DCMAKE_SYSTEM_PROCESSOR=loongarch64 \ - -DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \ - -DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \ - -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ - -DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \ - -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ - -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ - -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH - - cmake --build build --config Release -j $(nproc) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml deleted file mode 100644 index c4783a6df8882..0000000000000 --- a/.github/workflows/build.yml +++ /dev/null @@ -1,1080 +0,0 @@ -name: CI - -on: - workflow_dispatch: # allows manual triggering - push: - branches: - - master - paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp'] - pull_request: - types: [opened, synchronize, reopened] - paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp'] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -env: - GGML_NLOOP: 3 - GGML_N_THREADS: 1 - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 - -jobs: - macOS-latest-cmake-arm64: - runs-on: macos-14 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: macOS-latest-cmake-arm64 - evict-old-files: 1d - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - brew install curl - - - name: Build - id: cmake_build - run: | - sysctl -a - cmake -B build \ - -DCMAKE_BUILD_RPATH="@loader_path" \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DGGML_RPC=ON - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L 'main|curl' --verbose --timeout 900 - - macOS-latest-cmake-x64: - runs-on: macos-13 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: macOS-latest-cmake-x64 - evict-old-files: 1d - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - brew install curl - - - name: Build - id: cmake_build - run: | - sysctl -a - # Metal is disabled due to intermittent failures with Github runners not having a GPU: - # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 - cmake -B build \ - -DCMAKE_BUILD_RPATH="@loader_path" \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DGGML_METAL=OFF \ - -DGGML_RPC=ON - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose --timeout 900 - - 
ubuntu-cpu-cmake: - strategy: - matrix: - include: - - build: 'x64' - os: ubuntu-22.04 - - build: 'arm64' - os: ubuntu-22.04-arm - - runs-on: ${{ matrix.os }} - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-cpu-cmake - evict-old-files: 1d - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev - - - name: Build - id: cmake_build - run: | - cmake -B build \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DGGML_RPC=ON - cmake --build build --config Release -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L 'main|curl' --verbose --timeout 900 - - - name: Test llama2c conversion - id: llama2c_test - run: | - cd build - echo "Fetch tokenizer" - wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin - echo "Fetch llama2c model" - wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin - ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf - ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256 - - ubuntu-latest-cmake-sanitizer: - runs-on: ubuntu-latest - - continue-on-error: true - - strategy: - matrix: - sanitizer: [ADDRESS, THREAD, UNDEFINED] - build_type: [Debug] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-latest-cmake-sanitizer-${{ matrix.sanitizer }} - evict-old-files: 1d - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev - - - name: Build - id: cmake_build - if: ${{ matrix.sanitizer != 'THREAD' }} - run: | - cmake -B build \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) - - - name: Build (no OpenMP) - id: cmake_build_no_openmp - if: ${{ matrix.sanitizer == 'THREAD' }} - run: | - cmake -B build \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DGGML_OPENMP=OFF - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose --timeout 900 - - ubuntu-latest-llguidance: - runs-on: ubuntu-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev - - - name: Build - id: cmake_build - run: | - mkdir build - cd build - cmake .. \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DLLAMA_LLGUIDANCE=ON - cmake --build . 
--config Release -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose --timeout 900 - - ubuntu-latest-cmake-rpc: - runs-on: ubuntu-latest - - continue-on-error: true - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-latest-cmake-rpc - evict-old-files: 1d - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev - - - name: Build - id: cmake_build - run: | - cmake -B build \ - -DGGML_RPC=ON - cmake --build build --config Release -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose - - ubuntu-22-cmake-vulkan: - runs-on: ubuntu-22.04 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-22-cmake-vulkan - evict-old-files: 1d - - - name: Dependencies - id: depends - run: | - wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add - - sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list - sudo apt-get update -y - sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev - - - name: Build - id: cmake_build - run: | - cmake -B build \ - -DGGML_VULKAN=ON - cmake --build build --config Release -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - export GGML_VK_VISIBLE_DEVICES=0 - # This is using llvmpipe and runs slower than other backends - ctest -L main --verbose --timeout 3600 - - ubuntu-22-cmake-hip: - runs-on: ubuntu-22.04 - container: rocm/dev-ubuntu-22.04:6.0.2 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libcurl4-openssl-dev - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-22-cmake-hip - evict-old-files: 1d - - - name: Build with native CMake HIP support - id: cmake_build - run: | - cmake -B build -S . \ - -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \ - -DGGML_HIP_ROCWMMA_FATTN=ON \ - -DGGML_HIP=ON - cmake --build build --config Release -j $(nproc) - - - name: Build with legacy HIP support - id: cmake_build_legacy_hip - run: | - cmake -B build2 -S . \ - -DCMAKE_C_COMPILER=hipcc \ - -DCMAKE_CXX_COMPILER=hipcc \ - -DGGML_HIP_ROCWMMA_FATTN=ON \ - -DGGML_HIP=ON - cmake --build build2 --config Release -j $(nproc) - - ubuntu-22-cmake-musa: - runs-on: ubuntu-22.04 - container: mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - run: | - apt-get update - apt-get install -y build-essential git cmake libcurl4-openssl-dev - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-22-cmake-musa - evict-old-files: 1d - - - name: Build with native CMake MUSA support - id: cmake_build - run: | - cmake -B build -S . 
\ - -DGGML_MUSA=ON - cmake --build build --config Release -j $(nproc) - - ubuntu-22-cmake-sycl: - runs-on: ubuntu-22.04 - - continue-on-error: true - - steps: - - uses: actions/checkout@v4 - - - name: add oneAPI to apt - shell: bash - run: | - cd /tmp - wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main" - - - name: install oneAPI dpcpp compiler - shell: bash - run: | - sudo apt update - sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev - - - name: install oneAPI MKL library - shell: bash - run: | - sudo apt install intel-oneapi-mkl-devel - - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-22-cmake-sycl - evict-old-files: 1d - - - name: Build - id: cmake_build - run: | - source /opt/intel/oneapi/setvars.sh - cmake -B build \ - -DGGML_SYCL=ON \ - -DCMAKE_C_COMPILER=icx \ - -DCMAKE_CXX_COMPILER=icpx - cmake --build build --config Release -j $(nproc) - - ubuntu-22-cmake-sycl-fp16: - runs-on: ubuntu-22.04 - - continue-on-error: true - - steps: - - uses: actions/checkout@v4 - - - name: add oneAPI to apt - shell: bash - run: | - cd /tmp - wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main" - - - name: install oneAPI dpcpp compiler - shell: bash - run: | - sudo apt update - sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev - - - name: install oneAPI MKL library - shell: bash - run: | - sudo apt install intel-oneapi-mkl-devel - - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-22-cmake-sycl-fp16 - evict-old-files: 1d - - - name: Build - id: cmake_build - run: | - source /opt/intel/oneapi/setvars.sh - cmake -B build \ - -DGGML_SYCL=ON \ - -DCMAKE_C_COMPILER=icx \ - -DCMAKE_CXX_COMPILER=icpx \ - -DGGML_SYCL_F16=ON - cmake --build build --config Release -j $(nproc) - - build-linux-cross: - uses: ./.github/workflows/build-linux-cross.yml - - macOS-latest-cmake-ios: - runs-on: macos-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: macOS-latest-cmake-ios - evict-old-files: 1d - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - name: Build - id: cmake_build - run: | - sysctl -a - cmake -B build -G Xcode \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_BUILD_COMMON=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TOOLS=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_SYSTEM_NAME=iOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO - - macOS-latest-cmake-tvos: - runs-on: macos-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: macOS-latest-cmake-tvos - evict-old-files: 1d - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - 
name: Build - id: cmake_build - run: | - sysctl -a - cmake -B build -G Xcode \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_BUILD_COMMON=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TOOLS=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_SYSTEM_NAME=tvOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO - - macOS-latest-cmake-visionos: - runs-on: macos-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - name: Build - id: cmake_build - run: | - sysctl -a - cmake -B build -G Xcode \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_BUILD_COMMON=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TOOLS=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_SYSTEM_NAME=visionOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=1.0 \ - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO - - macOS-latest-swift: - runs-on: macos-latest - - strategy: - matrix: - destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS'] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: macOS-latest-swift - evict-old-files: 1d - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - name: Build llama.cpp with CMake - id: cmake_build - run: | - sysctl -a - cmake -B build -G Xcode \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_CURL=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TOOLS=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - - - name: xcodebuild for swift package - id: xcodebuild - run: | - ./build-xcframework.sh - - windows-msys2: - runs-on: windows-latest - - strategy: - fail-fast: false - matrix: - include: - - { sys: UCRT64, env: ucrt-x86_64, build: Release } - - { sys: CLANG64, env: clang-x86_64, build: Release } - - steps: - - name: Clone - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: windows-msys2 - variant: ccache - evict-old-files: 1d - - - name: Setup ${{ matrix.sys }} - uses: msys2/setup-msys2@v2 - with: - update: true - msystem: ${{matrix.sys}} - install: >- - base-devel - git - mingw-w64-${{matrix.env}}-toolchain - mingw-w64-${{matrix.env}}-cmake - mingw-w64-${{matrix.env}}-openblas - - - name: Build using CMake - shell: msys2 {0} - run: | - cmake -B build - cmake --build build --config ${{ matrix.build }} -j $(nproc) - - - name: Clean after building using CMake - shell: msys2 {0} - run: | - rm -rf build - - - name: Build using CMake w/ OpenBLAS - shell: msys2 {0} - run: | - cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS - cmake --build build --config ${{ matrix.build }} -j $(nproc) - - windows-latest-cmake: - runs-on: windows-latest - - env: - OPENBLAS_VERSION: 0.3.23 - SDE_VERSION: 9.33.0-2024-01-07 - VULKAN_VERSION: 1.4.309.0 - - strategy: - matrix: - include: - - build: 'cpu-x64 (static)' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake 
-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF' - - build: 'openblas-x64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' - - build: 'vulkan-x64' - defines: '-DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON' - - build: 'llvm-arm64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON' - - build: 'llvm-arm64-opencl-adreno' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON' - # - build: 'kompute-x64' - # defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON' - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: windows-latest-cmake-${{ matrix.build }} - variant: ccache - evict-old-files: 1d - - - name: Clone Kompute submodule - id: clone_kompute - if: ${{ matrix.build == 'kompute-x64' }} - run: | - git submodule update --init ggml/src/ggml-kompute/kompute - - - name: Download OpenBLAS - id: get_openblas - if: ${{ matrix.build == 'openblas-x64' }} - run: | - curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip" - curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE" - mkdir $env:RUNNER_TEMP/openblas - tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas - $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath) - $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim())) - $lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe') - & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll - - - name: Install Vulkan SDK - id: get_vulkan - if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }} - run: | - curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe" - & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install - Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}" - Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin" - - - name: Install Ninja - id: install_ninja - run: | - choco install ninja - - - name: Install OpenCL Headers and Libs - id: install_opencl - if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }} - run: | - git clone 
https://github.com/KhronosGroup/OpenCL-Headers - cd OpenCL-Headers - cmake -B build ` - -DBUILD_TESTING=OFF ` - -DOPENCL_HEADERS_BUILD_TESTING=OFF ` - -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF ` - -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" - cmake --build build --target install - git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader - cd OpenCL-ICD-Loader - cmake -B build-arm64-release ` - -A arm64 ` - -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" ` - -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" - cmake --build build-arm64-release --target install --config release - - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - - - name: Build - id: cmake_build - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} - run: | - cmake -S . -B build ${{ matrix.defines }} ` - -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" - cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} - cp $env:CURL_PATH/bin/libcurl-*.dll build/bin/Release - - - name: Add libopenblas.dll - id: add_libopenblas_dll - if: ${{ matrix.build == 'openblas-x64' }} - run: | - cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll - cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt - - - name: Test - id: cmake_test - if: ${{ matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' }} - run: | - cd build - ctest -L main -C Release --verbose --timeout 900 - - # TODO: disabled for now, consider adding tests for all CPU variants instead - # - name: Test (Intel SDE) - # id: cmake_test_sde - # if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation - # run: | - # curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz" - # # for some weird reason windows tar doesn't like sde tar.xz - # 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz - # 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar - # $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe) - # cd build - # $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1 - # & $sde -future -- ctest -L main -C Release --verbose --timeout 900 - - ubuntu-latest-cmake-cuda: - runs-on: ubuntu-latest - container: nvidia/cuda:12.6.2-devel-ubuntu24.04 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Install dependencies - env: - DEBIAN_FRONTEND: noninteractive - run: | - apt update - apt install -y cmake build-essential ninja-build libgomp1 git libcurl4-openssl-dev - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-latest-cmake-cuda - evict-old-files: 1d - - - name: Build with CMake - run: | - cmake -S . 
-B build -G Ninja \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_CUDA_ARCHITECTURES=89-real \ - -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DGGML_NATIVE=OFF \ - -DGGML_CUDA=ON - cmake --build build - - windows-2022-cmake-cuda: - runs-on: windows-2022 - - strategy: - matrix: - cuda: ['12.4'] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Install ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: windows-cuda-${{ matrix.cuda }} - variant: ccache - evict-old-files: 1d - - - name: Install Cuda Toolkit - uses: ./.github/actions/windows-setup-cuda - with: - cuda_version: ${{ matrix.cuda }} - - - name: Install Ninja - id: install_ninja - run: | - choco install ninja - - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - - - name: Build - id: cmake_build - shell: cmd - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} - run: | - call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 - cmake -S . -B build -G "Ninja Multi-Config" ^ - -DLLAMA_BUILD_SERVER=ON ^ - -DGGML_NATIVE=OFF ^ - -DGGML_BACKEND_DL=ON ^ - -DGGML_CPU_ALL_VARIANTS=ON ^ - -DGGML_CUDA=ON ^ - -DGGML_RPC=ON ^ - -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" - set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1 - cmake --build build --config Release -j %NINJA_JOBS% -t ggml - cmake --build build --config Release - - windows-latest-cmake-sycl: - runs-on: windows-latest - - defaults: - run: - shell: bash - - env: - WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7cd9bba0-7aab-4e30-b3ae-2221006a4a05/intel-oneapi-base-toolkit-2025.1.1.34_offline.exe - WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel - ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI" - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: windows-latest-cmake-sycl - variant: ccache - evict-old-files: 1d - - - name: Install - run: | - scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL - - # TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args - - - name: Build - id: cmake_build - run: examples/sycl/win-build-sycl.bat - - windows-latest-cmake-hip: - if: ${{ github.event.inputs.create_release != 'true' }} - runs-on: windows-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Clone rocWMMA repository - id: clone_rocwmma - run: | - git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1 - - - name: Install - id: depends - run: | - $ErrorActionPreference = "Stop" - write-host "Downloading AMD HIP SDK Installer" - Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" - write-host "Installing AMD HIP SDK" - Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait - write-host "Completed AMD HIP SDK installation" - - - name: Verify ROCm - id: verify - run: | - & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version - - - name: Install ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ${{ github.job }} - evict-old-files: 1d - - - name: libCURL - id: get_libcurl - uses: 
./.github/actions/windows-setup-curl - - - name: Build - id: cmake_build - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} - run: | - $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) - $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" - cmake -G "Unix Makefiles" -B build -S . ` - -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" ` - -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" ` - -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" ` - -DCMAKE_BUILD_TYPE=Release ` - -DGGML_HIP=ON ` - -DGGML_HIP_ROCWMMA_FATTN=ON ` - -DGGML_RPC=ON ` - -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" - cmake --build build -j ${env:NUMBER_OF_PROCESSORS} - - ios-xcode-build: - runs-on: macos-latest - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Build - id: cmake_build - run: | - sysctl -a - cmake -B build -G Xcode \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_CURL=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TOOLS=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_SYSTEM_NAME=iOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO - - - name: xcodebuild for swift package - id: xcodebuild - run: | - ./build-xcframework.sh - - - name: Build Xcode project - run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build - - android-build: - runs-on: ubuntu-latest - - steps: - - name: Clone - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: android-build - evict-old-files: 1d - - - name: Set up JDK - uses: actions/setup-java@v3 - with: - java-version: 17 - distribution: zulu - - - name: Setup Android SDK - uses: android-actions/setup-android@v3 - with: - log-accepted-android-sdk-licenses: false - - - name: Build - run: | - cd examples/llama.android - ./gradlew build --no-daemon - - openEuler-latest-cmake-cann: - if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }} - defaults: - run: - shell: bash -el {0} - strategy: - matrix: - arch: [x86, aarch64] - cann: - - '8.1.RC1.alpha001-910b-openeuler22.03-py3.10' - device: - - 'ascend910b3' - build: - - 'Release' - runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }} - container: ascendai/cann:${{ matrix.cann }} - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Dependencies - run: | - yum update -y - yum install -y git gcc gcc-c++ make cmake libcurl-devel - - - name: Build - run: | - export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH} - - cmake -S . 
-B build \ - -DCMAKE_BUILD_TYPE=${{ matrix.build }} \ - -DGGML_CANN=on \ - -DSOC_TYPE=${{ matrix.device }} - cmake --build build -j $(nproc) diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml deleted file mode 100644 index 276a217d45005..0000000000000 --- a/.github/workflows/close-issue.yml +++ /dev/null @@ -1,28 +0,0 @@ -name: Close inactive issues -on: - schedule: - - cron: "42 0 * * *" - -# Fine-grained permission -# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token -permissions: - issues: write - -jobs: - close-issues: - runs-on: ubuntu-latest - permissions: - issues: write - pull-requests: write - steps: - - uses: actions/stale@v5 - with: - exempt-issue-labels: "refactor,help wanted,good first issue,research,bug,roadmap" - days-before-issue-stale: 30 - days-before-issue-close: 14 - stale-issue-label: "stale" - close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale." - days-before-pr-stale: -1 - days-before-pr-close: -1 - operations-per-run: 10000 - repo-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml deleted file mode 100644 index 2067927be56ca..0000000000000 --- a/.github/workflows/docker.yml +++ /dev/null @@ -1,178 +0,0 @@ -# This workflow uses actions that are not certified by GitHub. -# They are provided by a third-party and are governed by -# separate terms of service, privacy policy, and support -# documentation. - -# GitHub recommends pinning actions to a commit SHA. -# To get a newer version, you will need to update the SHA. -# You can also reference a tag or branch, but the action may change without warning. 
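The comment block above recommends pinning third-party actions to a commit SHA. A minimal sketch of how to look up the SHA behind a tag, assuming only a local git client (the action and tag here are placeholder examples):

    # list tag refs; annotated tags also show a peeled ^{} line naming the commit
    git ls-remote --tags https://github.com/docker/build-push-action | grep v6
    # then pin in the workflow, keeping the tag as a trailing comment:
    #   uses: docker/build-push-action@<commit-sha>  # v6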
- -name: Publish Docker image - -on: - workflow_dispatch: # allows manual triggering - schedule: - # Rebuild daily rather than on every push because it is expensive - - cron: '12 4 * * *' - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -# Fine-grained permission -# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token -permissions: - packages: write - -jobs: - push_to_registry: - name: Push Docker image to the GitHub Container Registry - - runs-on: ubuntu-22.04 - env: - COMMIT_SHA: ${{ github.sha }} - strategy: - fail-fast: false - matrix: - config: - # Multi-stage build - # Note: the arm64 images are failing, which prevents the amd64 images from being built - # https://github.com/ggml-org/llama.cpp/issues/11888 - #- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false } - - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false } - - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false } - - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true } - - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true } - - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false } - # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete - #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true } - steps: - - name: Check out the repo - uses: actions/checkout@v4 - with: - fetch-depth: 0 # preserve git history, so we can determine the build number - - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - with: - image: tonistiigi/binfmt:qemu-v7.0.0-28 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Log in to the GitHub Container Registry - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case - REPO_NAME="${{ github.event.repository.name }}" - - # determine tag name postfix (build number, commit hash) - if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then - TAG_POSTFIX="-b${BUILD_NUMBER}" - else - SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-') - TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}" - fi - # list all tags possible - if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then - TYPE="" - else - TYPE="-${{ matrix.config.tag }}" - fi - PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:" - FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}" - LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}" - SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}" - echo 
"full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT - echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT - echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT - echo "full_output_tags=$FULLTAGS" # print out for debugging - echo "light_output_tags=$LIGHTTAGS" # print out for debugging - echo "server_output_tags=$SERVERTAGS" # print out for debugging - env: - GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }} - GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}' - - - name: Free Disk Space (Ubuntu) - if: ${{ matrix.config.free_disk_space == true }} - uses: ggml-org/free-disk-space@v1.3.1 - with: - # this might remove tools that are actually needed, - # if set to "true" but frees about 6 GB - tool-cache: false - - # all of these default to true, but feel free to set to - # "false" if necessary for your workflow - android: true - dotnet: true - haskell: true - large-packages: true - docker-images: true - swap-storage: true - - - name: Build and push Full Docker image (tagged + versioned) - if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }} - uses: docker/build-push-action@v6 - with: - context: . - push: true - platforms: ${{ matrix.config.platforms }} - # tag list is generated from step above - tags: ${{ steps.tag.outputs.full_output_tags }} - file: ${{ matrix.config.dockerfile }} - target: full - provenance: false - # using github experimental cache - cache-from: type=gha - cache-to: type=gha,mode=max - # return to this if the experimental github cache is having issues - #cache-to: type=local,dest=/tmp/.buildx-cache - #cache-from: type=local,src=/tmp/.buildx-cache - - - name: Build and push Light Docker image (tagged + versioned) - if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }} - uses: docker/build-push-action@v6 - with: - context: . - push: true - platforms: ${{ matrix.config.platforms }} - # tag list is generated from step above - tags: ${{ steps.tag.outputs.light_output_tags }} - file: ${{ matrix.config.dockerfile }} - target: light - provenance: false - # using github experimental cache - cache-from: type=gha - cache-to: type=gha,mode=max - # return to this if the experimental github cache is having issues - #cache-to: type=local,dest=/tmp/.buildx-cache - #cache-from: type=local,src=/tmp/.buildx-cache - - - name: Build and push Server Docker image (tagged + versioned) - if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }} - uses: docker/build-push-action@v6 - with: - context: . 
- push: true - platforms: ${{ matrix.config.platforms }} - # tag list is generated from step above - tags: ${{ steps.tag.outputs.server_output_tags }} - file: ${{ matrix.config.dockerfile }} - target: server - provenance: false - # using github experimental cache - cache-from: type=gha - cache-to: type=gha,mode=max - # return to this if the experimental github cache is having issues - #cache-to: type=local,dest=/tmp/.buildx-cache - #cache-from: type=local,src=/tmp/.buildx-cache diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml deleted file mode 100644 index f02b7c2194bcf..0000000000000 --- a/.github/workflows/editorconfig.yml +++ /dev/null @@ -1,29 +0,0 @@ -name: EditorConfig Checker - -on: - workflow_dispatch: # allows manual triggering - inputs: - create_release: - description: 'Create new release' - required: true - type: boolean - push: - branches: - - master - pull_request: - branches: - - master - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -jobs: - editorconfig: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: editorconfig-checker/action-editorconfig-checker@v2 - with: - version: v3.0.3 - - run: editorconfig-checker diff --git a/.github/workflows/gguf-publish.yml b/.github/workflows/gguf-publish.yml deleted file mode 100644 index 3ca4d30581074..0000000000000 --- a/.github/workflows/gguf-publish.yml +++ /dev/null @@ -1,44 +0,0 @@ -# This workflow will upload a Python Package using Twine when a GGUF release is created -# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries - -# See `gguf-py/README.md` for how to make a release. - -# This workflow uses actions that are not certified by GitHub. -# They are provided by a third-party and are governed by -# separate terms of service, privacy policy, and support -# documentation. 
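For context on the trigger below: the deploy job fires on pushed tags matching gguf-v*, so a gguf-py release is cut roughly as follows (a sketch; the version number is hypothetical and the authoritative steps live in gguf-py/README.md):

    git tag gguf-v0.10.0
    git push origin gguf-v0.10.0   # the tag push starts the deploy job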
- -name: Upload Python Package - -on: - workflow_dispatch: - push: - # Pattern matched against refs/tags - tags: - - 'gguf-v*' # Push events to every version tag - - -jobs: - deploy: - - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.9.x' - - name: Install dependencies - run: | - cd gguf-py - python -m pip install poetry - poetry install - - - name: Build package - run: cd gguf-py && poetry build - - name: Publish package - uses: pypa/gh-action-pypi-publish@release/v1 - with: - password: ${{ secrets.PYPI_API_TOKEN }} - packages-dir: gguf-py/dist diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml deleted file mode 100644 index 0b0f300aa402a..0000000000000 --- a/.github/workflows/labeler.yml +++ /dev/null @@ -1,17 +0,0 @@ -name: "Pull Request Labeler" -on: -- pull_request_target - -jobs: - labeler: - permissions: - contents: read - pull-requests: write - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - repository: "ggml-org/llama.cpp" - - uses: actions/labeler@v5 - with: - configuration-path: '.github/labeler.yml' diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml deleted file mode 100644 index 46e80aecd0a0c..0000000000000 --- a/.github/workflows/python-check-requirements.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: Python check requirements.txt - -on: - push: - paths: - - '.github/workflows/python-check-requirements.yml' - - 'scripts/check-requirements.sh' - - 'convert*.py' - - '**/requirements*.txt' - pull_request: - paths: - - '.github/workflows/python-check-requirements.yml' - - 'scripts/check-requirements.sh' - - 'convert*.py' - - '**/requirements*.txt' - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -jobs: - python-check-requirements: - runs-on: ubuntu-latest - name: check-requirements - steps: - - name: Check out source repository - uses: actions/checkout@v4 - - name: Set up Python environment - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - name: Run check-requirements.sh script - run: bash scripts/check-requirements.sh diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml deleted file mode 100644 index ddfdf73b8fce2..0000000000000 --- a/.github/workflows/python-lint.yml +++ /dev/null @@ -1,30 +0,0 @@ -name: flake8 Lint - -on: - push: - branches: - - master - paths: ['.github/workflows/python-lint.yml', '**/*.py'] - pull_request: - types: [opened, synchronize, reopened] - paths: ['.github/workflows/python-lint.yml', '**/*.py'] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -jobs: - flake8-lint: - runs-on: ubuntu-latest - name: Lint - steps: - - name: Check out source repository - uses: actions/checkout@v4 - - name: Set up Python environment - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - name: flake8 Lint - uses: py-actions/flake8@v2 - with: - plugins: "flake8-no-print" diff --git a/.github/workflows/python-type-check.yml b/.github/workflows/python-type-check.yml deleted file mode 100644 index 373bb601020b2..0000000000000 --- a/.github/workflows/python-type-check.yml +++ /dev/null @@ -1,40 +0,0 @@ -name: Python Type-Check - -on: - push: - paths: - - '.github/workflows/python-type-check.yml' - - 'pyrightconfig.json' - - '**.py' - - 
'**/requirements*.txt' - pull_request: - paths: - - '.github/workflows/python-type-check.yml' - - 'pyrightconfig.json' - - '**.py' - - '**/requirements*.txt' - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -jobs: - python-type-check: - runs-on: ubuntu-latest - name: pyright type-check - steps: - - name: Check out source repository - uses: actions/checkout@v4 - - name: Set up Python environment - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - name: Install Python dependencies - # TODO: use a venv - run: pip install -r requirements/requirements-all.txt - - name: Type-check with Pyright - uses: jakebailey/pyright-action@v2 - with: - version: 1.1.382 - level: warning - warnings: true diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml deleted file mode 100644 index 9874736cbd8de..0000000000000 --- a/.github/workflows/release.yml +++ /dev/null @@ -1,749 +0,0 @@ -name: Release - -on: - workflow_dispatch: # allows manual triggering - inputs: - create_release: - description: 'Create new release' - required: true - type: boolean - push: - branches: - - master - paths: ['.github/workflows/release.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp'] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -env: - BRANCH_NAME: ${{ github.head_ref || github.ref_name }} - CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON" - -jobs: - macOS-arm64: - runs-on: macos-14 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: macOS-latest-cmake-arm64 - evict-old-files: 1d - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - brew install curl - - - name: Build - id: cmake_build - run: | - sysctl -a - cmake -B build \ - -DCMAKE_BUILD_RPATH="@loader_path" \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DGGML_RPC=ON \ - ${{ env.CMAKE_ARGS }} - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - - - name: Determine tag name - id: tag - uses: ./.github/actions/get-tag-name - - - name: Pack artifacts - id: pack_artifacts - run: | - cp LICENSE ./build/bin/ - zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/* - - - name: Upload artifacts - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip - name: llama-bin-macos-arm64.zip - - macOS-x64: - runs-on: macos-13 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: macOS-latest-cmake-x64 - evict-old-files: 1d - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - brew install curl - - - name: Build - id: cmake_build - run: | - sysctl -a - # Metal is disabled due to intermittent failures with Github runners not having a GPU: - # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 - cmake -B build \ - -DCMAKE_BUILD_RPATH="@loader_path" \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DGGML_METAL=OFF \ - -DGGML_RPC=ON - cmake --build 
build --config Release -j $(sysctl -n hw.logicalcpu) - - - name: Determine tag name - id: tag - uses: ./.github/actions/get-tag-name - - - name: Pack artifacts - id: pack_artifacts - run: | - cp LICENSE ./build/bin/ - zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/* - - - name: Upload artifacts - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip - name: llama-bin-macos-x64.zip - - ubuntu-22-cpu: - strategy: - matrix: - include: - - build: 'x64' - os: ubuntu-22.04 - # GGML_BACKEND_DL and GGML_CPU_ALL_VARIANTS are not currently supported on arm - # - build: 'arm64' - # os: ubuntu-22.04-arm - - runs-on: ${{ matrix.os }} - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-cpu-cmake - evict-old-files: 1d - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev - - - name: Build - id: cmake_build - run: | - cmake -B build \ - -DGGML_BACKEND_DL=ON \ - -DGGML_NATIVE=OFF \ - -DGGML_CPU_ALL_VARIANTS=ON \ - -DLLAMA_FATAL_WARNINGS=ON \ - ${{ env.CMAKE_ARGS }} - cmake --build build --config Release -j $(nproc) - - - name: Determine tag name - id: tag - uses: ./.github/actions/get-tag-name - - - name: Pack artifacts - id: pack_artifacts - run: | - cp LICENSE ./build/bin/ - zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/* - - - name: Upload artifacts - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip - name: llama-bin-ubuntu-${{ matrix.build }}.zip - - ubuntu-22-vulkan: - runs-on: ubuntu-22.04 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-22-cmake-vulkan - evict-old-files: 1d - - - name: Dependencies - id: depends - run: | - wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add - - sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list - sudo apt-get update -y - sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev - - - name: Build - id: cmake_build - run: | - cmake -B build \ - -DGGML_BACKEND_DL=ON \ - -DGGML_NATIVE=OFF \ - -DGGML_CPU_ALL_VARIANTS=ON \ - -DGGML_VULKAN=ON \ - ${{ env.CMAKE_ARGS }} - cmake --build build --config Release -j $(nproc) - - - name: Determine tag name - id: tag - uses: ./.github/actions/get-tag-name - - - name: Pack artifacts - id: pack_artifacts - run: | - cp LICENSE ./build/bin/ - zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/* - - - name: Upload artifacts - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip - name: llama-bin-ubuntu-vulkan-x64.zip - - windows-cpu: - runs-on: windows-latest - - strategy: - matrix: - include: - - arch: 'x64' - - arch: 'arm64' - - steps: - - name: Clone - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: windows-latest-cmake-cpu-${{ matrix.arch }} - variant: ccache - evict-old-files: 1d - - - name: Install Ninja - run: | - choco install ninja - - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - 
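# Aside on the ubuntu-22-cpu job above: with -DGGML_BACKEND_DL=ON and
# -DGGML_CPU_ALL_VARIANTS=ON the build emits one dynamically loadable
# libggml-cpu-*.so per supported x86-64 microarchitecture and ggml picks the
# best match at runtime. A quick way to inspect a packed artifact
# (the file name is illustrative):
#   unzip -l llama-b5000-bin-ubuntu-x64.zip | grep ggml-cpu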
with: - architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }} - - - name: Build - shell: cmd - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} - run: | - call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch }} - cmake -S . -B build -G "Ninja Multi-Config" ^ - -D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^ - -DGGML_NATIVE=OFF ^ - -DGGML_BACKEND_DL=ON ^ - -DGGML_CPU_ALL_VARIANTS=${{ matrix.arch == 'x64' && 'ON' || 'OFF' }} ^ - -DGGML_OPENMP=ON ^ - -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^ - ${{ env.CMAKE_ARGS }} - cmake --build build --config Release - - - name: Pack artifacts - id: pack_artifacts - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} - run: | - Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\ - Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.42.34433\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\ - 7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\* - - - name: Upload artifacts - uses: actions/upload-artifact@v4 - with: - path: llama-bin-win-cpu-${{ matrix.arch }}.zip - name: llama-bin-win-cpu-${{ matrix.arch }}.zip - - windows: - runs-on: windows-latest - - env: - OPENBLAS_VERSION: 0.3.23 - VULKAN_VERSION: 1.4.309.0 - - strategy: - matrix: - include: - - backend: 'vulkan' - arch: 'x64' - defines: '-DGGML_VULKAN=ON' - target: 'ggml-vulkan' - - backend: 'opencl-adreno' - arch: 'arm64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON' - target: 'ggml-opencl' - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: windows-latest-cmake-${{ matrix.backend }}-${{ matrix.arch }} - variant: ccache - evict-old-files: 1d - - - name: Install Vulkan SDK - id: get_vulkan - if: ${{ matrix.backend == 'vulkan' }} - run: | - curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe" - & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install - Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}" - Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin" - - - name: Install Ninja - id: install_ninja - run: | - choco install ninja - - - name: Install OpenCL Headers and Libs - id: install_opencl - if: ${{ matrix.backend == 'opencl-adreno' && matrix.arch == 'arm64' }} - run: | - git clone https://github.com/KhronosGroup/OpenCL-Headers - cd OpenCL-Headers - cmake -B build ` - -DBUILD_TESTING=OFF ` - -DOPENCL_HEADERS_BUILD_TESTING=OFF ` - -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF ` - -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" - cmake --build build --target install - git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader - cd OpenCL-ICD-Loader - cmake -B build-arm64-release ` - -A arm64 ` - -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" ` - -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" - cmake --build build-arm64-release --target install --config release - - - name: 
Build - id: cmake_build - run: | - cmake -S . -B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_CURL=OFF - cmake --build build --config Release --target ${{ matrix.target }} - - - name: Pack artifacts - id: pack_artifacts - run: | - 7z a llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll - - - name: Upload artifacts - uses: actions/upload-artifact@v4 - with: - path: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip - name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip - - windows-cuda: - runs-on: windows-2022 - - strategy: - matrix: - cuda: ['12.4'] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Install ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: windows-cuda-${{ matrix.cuda }} - variant: ccache - evict-old-files: 1d - - - name: Install Cuda Toolkit - uses: ./.github/actions/windows-setup-cuda - with: - cuda_version: ${{ matrix.cuda }} - - - name: Install Ninja - id: install_ninja - run: | - choco install ninja - - - name: Build - id: cmake_build - shell: cmd - run: | - call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64 - cmake -S . -B build -G "Ninja Multi-Config" ^ - -DGGML_BACKEND_DL=ON ^ - -DGGML_NATIVE=OFF ^ - -DGGML_CPU=OFF ^ - -DGGML_CUDA=ON ^ - -DLLAMA_CURL=OFF - set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1 - cmake --build build --config Release -j %NINJA_JOBS% --target ggml-cuda - - - name: Pack artifacts - id: pack_artifacts - run: | - 7z a llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll - - - name: Upload artifacts - uses: actions/upload-artifact@v4 - with: - path: llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip - name: llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip - - - name: Copy and pack Cuda runtime - run: | - echo "Cuda install location: ${{ env.CUDA_PATH }}" - $dst='.\build\bin\cudart\' - robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll - robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll - 7z a cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip $dst\* - - - name: Upload Cuda runtime - uses: actions/upload-artifact@v4 - with: - path: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip - name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip - - windows-sycl: - runs-on: windows-latest - - defaults: - run: - shell: bash - - env: - WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7cd9bba0-7aab-4e30-b3ae-2221006a4a05/intel-oneapi-base-toolkit-2025.1.1.34_offline.exe - WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel - ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI" - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: windows-latest-cmake-sycl - variant: ccache - evict-old-files: 1d - - - name: Install - run: | - scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL - - - name: Build - id: cmake_build - shell: cmd - run: | - call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force - cmake -G "Ninja" -B build ^ - -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^ - -DCMAKE_BUILD_TYPE=Release ^ - -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^ - -DGGML_CPU=OFF -DGGML_SYCL=ON ^ - -DLLAMA_CURL=OFF - cmake --build build --target 
ggml-sycl -j - - - name: Build the release package - id: pack_artifacts - run: | - echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin" - - cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin - - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin - - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin - - cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin - - echo "cp oneAPI running time dll files to ./build/bin done" - 7z a llama-bin-win-sycl-x64.zip ./build/bin/* - - - name: Upload the release package - uses: actions/upload-artifact@v4 - with: - path: llama-bin-win-sycl-x64.zip - name: llama-bin-win-sycl-x64.zip - - windows-hip: - runs-on: windows-latest - - strategy: - matrix: - include: - - name: "radeon" - gpu_targets: "gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032" - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Clone rocWMMA repository - id: clone_rocwmma - run: | - git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: windows-latest-cmake-hip-${{ matrix.name }}-x64 - evict-old-files: 1d - - - name: Install - id: depends - run: | - $ErrorActionPreference = "Stop" - write-host "Downloading AMD HIP SDK Installer" - Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" - write-host "Installing AMD HIP SDK" - Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait - write-host "Completed AMD HIP SDK installation" - - - name: Verify ROCm - id: verify - run: | - & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version - - - name: Build - id: cmake_build - run: | - $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) - $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" - cmake -G "Unix Makefiles" -B build -S . 
` - -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" ` - -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" ` - -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/ -Wno-ignored-attributes -Wno-nested-anon-types" ` - -DCMAKE_BUILD_TYPE=Release ` - -DGGML_BACKEND_DL=ON ` - -DGGML_NATIVE=OFF ` - -DGGML_CPU=OFF ` - -DAMDGPU_TARGETS="${{ matrix.gpu_targets }}" ` - -DGGML_HIP_ROCWMMA_FATTN=ON ` - -DGGML_HIP=ON ` - -DLLAMA_CURL=OFF - cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS} - md "build\bin\rocblas\library\" - cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\" - cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\" - cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\" - - - name: Pack artifacts - id: pack_artifacts - run: | - 7z a llama-bin-win-hip-${{ matrix.name }}-x64.zip .\build\bin\* - - - name: Upload artifacts - uses: actions/upload-artifact@v4 - with: - path: llama-bin-win-hip-${{ matrix.name }}-x64.zip - name: llama-bin-win-hip-${{ matrix.name }}-x64.zip - - ios-xcode-build: - runs-on: macos-latest - - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Build - id: cmake_build - run: | - sysctl -a - cmake -B build -G Xcode \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_CURL=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TOOLS=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_SYSTEM_NAME=iOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO - - - name: xcodebuild for swift package - id: xcodebuild - run: | - ./build-xcframework.sh - - - name: Build Xcode project - run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build - - - name: Determine tag name - id: tag - uses: ./.github/actions/get-tag-name - - - name: Pack artifacts - id: pack_artifacts - run: | - zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework - - - name: Upload artifacts - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-xcframework.zip - name: llama-${{ steps.tag.outputs.name }}-xcframework - - release: - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - - # Fine-grained permission - # https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token - permissions: - contents: write # for creating release - - runs-on: ubuntu-latest - - needs: - - windows - - windows-cpu - - windows-cuda - - windows-sycl - - windows-hip - - ubuntu-22-cpu - - ubuntu-22-vulkan - - macOS-arm64 - - macOS-x64 - - ios-xcode-build - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Determine tag name - id: tag - uses: ./.github/actions/get-tag-name - - - name: Download artifacts - id: download-artifact - uses: actions/download-artifact@v4 - with: - path: ./artifact - merge-multiple: true - - - name: Move artifacts - id: move_artifacts - run: | - mkdir -p release - - echo "Adding CPU backend files to existing zips..." 
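# Effect of the loop below, shown on hypothetical artifact names: the contents
# of llama-bin-win-cpu-x64.zip (the CPU backend DLLs) are appended to every
# other llama-bin-win-*-x64.zip so that each backend zip is self-contained;
# every zip is then renamed to llama-<tag>-bin-win-*.zip and moved to release/.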
- for arch in x64 arm64; do - cpu_zip="artifact/llama-bin-win-cpu-${arch}.zip" - temp_dir=$(mktemp -d) - echo "Extracting CPU backend for $arch..." - unzip "$cpu_zip" -d "$temp_dir" - - echo "Adding CPU files to $arch zips..." - for target_zip in artifact/llama-bin-win-*-${arch}.zip; do - if [[ "$target_zip" == "$cpu_zip" ]]; then - continue - fi - echo "Adding CPU backend to $(basename "$target_zip")" - realpath_target_zip=$(realpath "$target_zip") - (cd "$temp_dir" && zip -r "$realpath_target_zip" .) - done - - rm -rf "$temp_dir" - done - - echo "Renaming and moving zips to release..." - for zip_file in artifact/llama-bin-win-*.zip; do - base_name=$(basename "$zip_file" .zip) - zip_name="llama-${{ steps.tag.outputs.name }}-${base_name#llama-}.zip" - echo "Moving $zip_file to release/$zip_name" - mv "$zip_file" "release/$zip_name" - done - - echo "Moving other artifacts..." - mv -v artifact/*.zip release - - - name: Create release - id: create_release - uses: ggml-org/action-create-release@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - tag_name: ${{ steps.tag.outputs.name }} - - - name: Upload release - id: upload_release - uses: actions/github-script@v3 - with: - github-token: ${{secrets.GITHUB_TOKEN}} - script: | - const path = require('path'); - const fs = require('fs'); - const release_id = '${{ steps.create_release.outputs.id }}'; - for (let file of await fs.readdirSync('./release')) { - if (path.extname(file) === '.zip') { - console.log('uploadReleaseAsset', file); - await github.repos.uploadReleaseAsset({ - owner: context.repo.owner, - repo: context.repo.repo, - release_id: release_id, - name: file, - data: await fs.readFileSync(`./release/${file}`) - }); - } - } diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml deleted file mode 100644 index f6da488576937..0000000000000 --- a/.github/workflows/server.yml +++ /dev/null @@ -1,237 +0,0 @@ -# Server build and tests -name: Server - -on: - workflow_dispatch: # allows manual triggering - inputs: - sha: - description: 'Commit SHA1 to build' - required: false - type: string - slow_tests: - description: 'Run slow tests' - required: true - type: boolean - push: - branches: - - master - paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*'] - pull_request: - types: [opened, synchronize, reopened] - paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*'] - -env: - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 - LLAMA_LOG_VERBOSITY: 10 - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - server: - runs-on: ubuntu-latest - - strategy: - matrix: - sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken - build_type: [RelWithDebInfo] - include: - - build_type: Release - sanitizer: "" - fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken - - steps: - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get -y install \ - build-essential \ - xxd \ - git \ - cmake \ - curl \ - wget \ - language-pack-en \ - libcurl4-openssl-dev - - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name 
}} - - - name: Python setup - id: setup_python - uses: actions/setup-python@v5 - with: - python-version: '3.11' - - - name: Tests dependencies - id: test_dependencies - run: | - pip install -r tools/server/tests/requirements.txt - - # Setup nodejs (to be used for verifying bundled index.html) - - uses: actions/setup-node@v4 - with: - node-version: '22.11.0' - - - name: WebUI - Install dependencies - id: webui_lint - run: | - cd tools/server/webui - npm ci - - - name: WebUI - Check code format - id: webui_format - run: | - git config --global --add safe.directory $(realpath .) - cd tools/server/webui - git status - - npm run format - git status - modified_files="$(git status -s)" - echo "Modified files: ${modified_files}" - if [ -n "${modified_files}" ]; then - echo "Files do not follow coding style. To fix: npm run format" - echo "${modified_files}" - exit 1 - fi - - - name: Verify bundled index.html - id: verify_server_index_html - run: | - git config --global --add safe.directory $(realpath .) - cd tools/server/webui - git status - - npm run build - git status - modified_files="$(git status -s)" - echo "Modified files: ${modified_files}" - if [ -n "${modified_files}" ]; then - echo "Repository is dirty or server/webui is not built as expected" - echo "Hint: You may need to follow Web UI build guide in server/README.md" - echo "${modified_files}" - exit 1 - fi - - - name: Build (no OpenMP) - id: cmake_build_no_openmp - if: ${{ matrix.sanitizer == 'THREAD' }} - run: | - cmake -B build \ - -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ - -DGGML_OPENMP=OFF ; - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server - - - name: Build (sanitizers) - id: cmake_build_sanitizers - if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }} - run: | - cmake -B build \ - -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ; - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server - - - name: Build (sanitizers) - id: cmake_build - if: ${{ matrix.sanitizer == '' }} - run: | - cmake -B build \ - -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ; - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server - - - name: Tests - id: server_integration_tests - if: ${{ matrix.sanitizer == '' }} - env: - GITHUB_ACTIONS: "true" - run: | - cd tools/server/tests - ./tests.sh - - - name: Tests (sanitizers) - id: server_integration_tests_sanitizers - if: ${{ matrix.sanitizer != '' }} - run: | - cd tools/server/tests - LLAMA_SANITIZE=1 ./tests.sh - - - name: Slow tests - id: server_integration_tests_slow - if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} - run: | - cd tools/server/tests - SLOW_TESTS=1 ./tests.sh - - - server-windows: - runs-on: windows-2022 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - - - name: Build - id: cmake_build - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} - run: | - cmake -B build 
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" - cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server - - - name: Python setup - id: setup_python - uses: actions/setup-python@v5 - with: - python-version: '3.11' - - - name: Tests dependencies - id: test_dependencies - run: | - pip install -r tools/server/tests/requirements.txt - - - name: Copy Libcurl - id: prepare_libcurl - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} - run: | - cp $env:CURL_PATH/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll - - - name: Tests - id: server_integration_tests - if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }} - run: | - cd tools/server/tests - $env:PYTHONIOENCODING = ":replace" - pytest -v -x -m "not slow" - - - name: Slow tests - id: server_integration_tests_slow - if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} - run: | - cd tools/server/tests - $env:SLOW_TESTS = "1" - pytest -v -x diff --git a/.github/workflows/winget.yml b/.github/workflows/winget.yml deleted file mode 100644 index 5c286155951e5..0000000000000 --- a/.github/workflows/winget.yml +++ /dev/null @@ -1,42 +0,0 @@ -name: Update Winget Package - -on: - workflow_dispatch: # allows manual triggering - schedule: - - cron: '28 5 * * *' # Update every day at 5:28 UTC - -jobs: - update: - name: Update Winget Package - runs-on: ubuntu-latest - - steps: - - name: Install cargo binstall - uses: cargo-bins/cargo-binstall@268643a6b5ea099f5718ee5cd3ff7dc89a5eb49b - - - name: Install komac - run: | - cargo binstall komac@2.11.2 -y - - - name: Find latest release - id: find_latest_release - uses: actions/github-script@v6 - with: - script: | - const { data: releases } = await github.rest.repos.listReleases({ - owner: context.repo.owner, - repo: context.repo.repo, - }); - console.log("Latest release:", releases[0].tag_name); - return releases[0].tag_name; - - - name: Update manifest - env: - VERSION: ${{ steps.find_latest_release.outputs.result }} - run: | - echo "Updating manifest..." 
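- # komac rewrites the winget manifest for the new version and opens the submission PR (--submit) using the token below.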
- komac update --version ${{ env.VERSION }} \ - --urls "https://github.com/ggml-org/llama.cpp/releases/download/${{ env.VERSION }}/llama-${{ env.VERSION }}-bin-win-vulkan-x64.zip" \ - --token ${{ secrets.WINGET_GITHUB_TOKEN }} \ - --submit \ - ggml.llamacpp diff --git a/CMakePresets.json b/CMakePresets.json index e9844701304fc..0228fc71edfc6 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -30,6 +30,8 @@ { "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } }, { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } }, { "name": "vulkan", "hidden": true, "cacheVariables": { "GGML_VULKAN": "ON" } }, + { "name": "remoting_frontend", "hidden": true, "cacheVariables": { "GGML_REMOTING_FRONTEND": "ON" } }, + { "name": "remoting_backend", "hidden": true, "cacheVariables": { "GGML_REMOTING_BACKEND": "ON" } }, { "name": "x64-windows-llvm", "hidden": true, diff --git a/Makefile b/Makefile index ac442aec095d6..128c684c5edd5 100644 --- a/Makefile +++ b/Makefile @@ -716,6 +716,16 @@ ggml/src/ggml-cuda/ggml-cuda.o: \ $(NVCC_COMPILE) endif # GGML_CUDA +ifdef GGML_REMOTING_FRONTEND + MK_CPPFLAGS += -DGGML_USE_REMOTINGFRONTEND + OBJ_GGML_EXT += ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.o +endif + +ifdef GGML_REMOTING_BACKEND + MK_CPPFLAGS += -DGGML_USE_REMOTINGBACKEND + OBJ_GGML_EXT += ggml/src/ggml-remotingbackend/ggml-remoting-backend.o +endif + ifdef GGML_VULKAN MK_CPPFLAGS += -DGGML_USE_VULKAN MK_LDFLAGS += $(shell pkg-config --libs vulkan) @@ -755,6 +765,12 @@ _ggml_vk_shader_deps = $(echo $(_ggml_vk_input_dir)/*.comp) ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source) $(CXX) $(CXXFLAGS) $(shell pkg-config --cflags vulkan) -c $< -o $@ +ggml/src/ggml-remotingfrontend/frontend.o: ggml/src/ggml-remotingfrontend/frontend.cpp + $(CXX) $(CXXFLAGS) -c $< -o $@ + +ggml/src/ggml-remotingbackend/backend.o: ggml/src/ggml-remotingbackend/backend.cpp + $(CXX) $(CXXFLAGS) -c $< -o $@ + $(_ggml_vk_header): $(_ggml_vk_source) $(_ggml_vk_source): $(_ggml_vk_shader_deps) vulkan-shaders-gen diff --git a/build-xcframework.sh b/build-xcframework.sh deleted file mode 100755 index a08419a801b47..0000000000000 --- a/build-xcframework.sh +++ /dev/null @@ -1,541 +0,0 @@ -#!/bin/bash -# -# Options -IOS_MIN_OS_VERSION=16.4 -MACOS_MIN_OS_VERSION=13.3 -VISIONOS_MIN_OS_VERSION=1.0 -TVOS_MIN_OS_VERSION=16.4 - -BUILD_SHARED_LIBS=OFF -LLAMA_BUILD_EXAMPLES=OFF -LLAMA_BUILD_TOOLS=OFF -LLAMA_BUILD_TESTS=OFF -LLAMA_BUILD_SERVER=OFF -GGML_METAL=ON -GGML_METAL_EMBED_LIBRARY=ON -GGML_BLAS_DEFAULT=ON -GGML_METAL_USE_BF16=ON -GGML_OPENMP=OFF - -COMMON_C_FLAGS="-Wno-macro-redefined -Wno-shorten-64-to-32 -Wno-unused-command-line-argument -g" -COMMON_CXX_FLAGS="-Wno-macro-redefined -Wno-shorten-64-to-32 -Wno-unused-command-line-argument -g" - -# Common options for all builds -COMMON_CMAKE_ARGS=( - -DCMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_REQUIRED=NO - -DCMAKE_XCODE_ATTRIBUTE_CODE_SIGN_IDENTITY="" - -DCMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_ALLOWED=NO - -DCMAKE_XCODE_ATTRIBUTE_DEBUG_INFORMATION_FORMAT="dwarf-with-dsym" - -DCMAKE_XCODE_ATTRIBUTE_GCC_GENERATE_DEBUGGING_SYMBOLS=YES - -DCMAKE_XCODE_ATTRIBUTE_COPY_PHASE_STRIP=NO - -DCMAKE_XCODE_ATTRIBUTE_STRIP_INSTALLED_PRODUCT=NO - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} - -DLLAMA_BUILD_EXAMPLES=${LLAMA_BUILD_EXAMPLES} - -DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS} - -DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS} - 
-DLLAMA_BUILD_SERVER=${LLAMA_BUILD_SERVER} - -DGGML_METAL_EMBED_LIBRARY=${GGML_METAL_EMBED_LIBRARY} - -DGGML_BLAS_DEFAULT=${GGML_BLAS_DEFAULT} - -DGGML_METAL=${GGML_METAL} - -DGGML_METAL_USE_BF16=${GGML_METAL_USE_BF16} - -DGGML_NATIVE=OFF - -DGGML_OPENMP=${GGML_OPENMP} -) - -XCODE_VERSION=$(xcodebuild -version 2>/dev/null | head -n1 | awk '{ print $2 }') -MAJOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f1) -MINOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f2) -echo "Detected Xcode version: $XCODE_VERSION" - -check_required_tool() { - local tool=$1 - local install_message=$2 - - if ! command -v $tool &> /dev/null; then - echo "Error: $tool is required but not found." - echo "$install_message" - exit 1 - fi -} -echo "Checking for required tools..." -check_required_tool "cmake" "Please install CMake 3.28.0 or later (brew install cmake)" -check_required_tool "xcodebuild" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)" -check_required_tool "libtool" "Please install libtool which should be available with Xcode Command Line Tools (CLT). Make sure Xcode CLT is installed (xcode-select --install)" -check_required_tool "dsymutil" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)" - -set -e - -## Clean up previous builds -rm -rf build-apple -rm -rf build-ios-sim -rm -rf build-ios-device -rm -rf build-macos -rm -rf build-visionos -rm -rf build-visionos-sim -rm -rf build-tvos-sim -rm -rf build-tvos-device - -# Setup the xcframework build directory structure -setup_framework_structure() { - local build_dir=$1 - local min_os_version=$2 - local platform=$3 # "ios", "macos", "visionos", or "tvos" - local framework_name="llama" - - echo "Creating ${platform}-style framework structure for ${build_dir}" - - if [[ "$platform" == "macos" ]]; then - # macOS versioned structure uses versioned directories - mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Headers - mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Modules - mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Resources - - # Create symbolic links - ln -sf A ${build_dir}/framework/${framework_name}.framework/Versions/Current - ln -sf Versions/Current/Headers ${build_dir}/framework/${framework_name}.framework/Headers - ln -sf Versions/Current/Modules ${build_dir}/framework/${framework_name}.framework/Modules - ln -sf Versions/Current/Resources ${build_dir}/framework/${framework_name}.framework/Resources - ln -sf Versions/Current/${framework_name} ${build_dir}/framework/${framework_name}.framework/${framework_name} - - # Set header and module paths - local header_path=${build_dir}/framework/${framework_name}.framework/Versions/A/Headers/ - local module_path=${build_dir}/framework/${framework_name}.framework/Versions/A/Modules/ - else - # iOS/VisionOS/tvOS use a flat structure - mkdir -p ${build_dir}/framework/${framework_name}.framework/Headers - mkdir -p ${build_dir}/framework/${framework_name}.framework/Modules - - # Remove any existing structure to ensure clean build - rm -rf ${build_dir}/framework/${framework_name}.framework/Versions - - # Set header and module paths - local header_path=${build_dir}/framework/${framework_name}.framework/Headers/ - local module_path=${build_dir}/framework/${framework_name}.framework/Modules/ - fi - - # Copy all required headers (common for all platforms) - cp include/llama.h ${header_path} - cp ggml/include/ggml.h ${header_path} - cp ggml/include/ggml-opt.h ${header_path} - cp ggml/include/ggml-alloc.h 
${header_path} - cp ggml/include/ggml-backend.h ${header_path} - cp ggml/include/ggml-metal.h ${header_path} - cp ggml/include/ggml-cpu.h ${header_path} - cp ggml/include/ggml-blas.h ${header_path} - cp ggml/include/gguf.h ${header_path} - - # Create module map (common for all platforms) - cat > ${module_path}module.modulemap << EOF -framework module llama { - header "llama.h" - header "ggml.h" - header "ggml-alloc.h" - header "ggml-backend.h" - header "ggml-metal.h" - header "ggml-cpu.h" - header "ggml-blas.h" - header "gguf.h" - - link "c++" - link framework "Accelerate" - link framework "Metal" - link framework "Foundation" - - export * -} -EOF - - # Platform-specific settings for Info.plist - local platform_name="" - local sdk_name="" - local supported_platform="" - - case "$platform" in - "ios") - platform_name="iphoneos" - sdk_name="iphoneos${min_os_version}" - supported_platform="iPhoneOS" - local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist" - local device_family=' UIDeviceFamily - - 1 - 2 - ' - ;; - "macos") - platform_name="macosx" - sdk_name="macosx${min_os_version}" - supported_platform="MacOSX" - local plist_path="${build_dir}/framework/${framework_name}.framework/Versions/A/Resources/Info.plist" - local device_family="" - ;; - "visionos") - platform_name="xros" - sdk_name="xros${min_os_version}" - supported_platform="XRPlatform" - local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist" - local device_family="" - ;; - "tvos") - platform_name="appletvos" - sdk_name="appletvos${min_os_version}" - supported_platform="AppleTVOS" - local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist" - local device_family=' UIDeviceFamily - - 3 - ' - ;; - esac - - # Create Info.plist - cat > ${plist_path} << EOF - - - - - CFBundleDevelopmentRegion - en - CFBundleExecutable - llama - CFBundleIdentifier - org.ggml.llama - CFBundleInfoDictionaryVersion - 6.0 - CFBundleName - llama - CFBundlePackageType - FMWK - CFBundleShortVersionString - 1.0 - CFBundleVersion - 1 - MinimumOSVersion - ${min_os_version} - CFBundleSupportedPlatforms - - ${supported_platform} - ${device_family} - DTPlatformName - ${platform_name} - DTSDKName - ${sdk_name} - - -EOF -} - -# Create dynamic libraries from static libraries. 
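-# Usage: combine_static_libraries <build_dir> <release_dir> <platform> <is_simulator>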
-combine_static_libraries() { - local build_dir="$1" - local release_dir="$2" - local platform="$3" # "ios", "macos", "visionos", or "tvos" - local is_simulator="$4" - local base_dir="$(pwd)" - local framework_name="llama" - - # Determine output path based on platform - local output_lib="" - if [[ "$platform" == "macos" ]]; then - # macOS uses versioned structure - output_lib="${build_dir}/framework/${framework_name}.framework/Versions/A/${framework_name}" - else - # iOS, visionOS, and tvOS use a directory flat structure - output_lib="${build_dir}/framework/${framework_name}.framework/${framework_name}" - fi - - local libs=( - "${base_dir}/${build_dir}/src/${release_dir}/libllama.a" - "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml.a" - "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-base.a" - "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-cpu.a" - "${base_dir}/${build_dir}/ggml/src/ggml-metal/${release_dir}/libggml-metal.a" - "${base_dir}/${build_dir}/ggml/src/ggml-blas/${release_dir}/libggml-blas.a" - ) - - # Create temporary directory for processing - local temp_dir="${base_dir}/${build_dir}/temp" - mkdir -p "${temp_dir}" - - # Since we have multiple architectures libtool will find object files that do not - # match the target architecture. We suppress these warnings. - libtool -static -o "${temp_dir}/combined.a" "${libs[@]}" 2> /dev/null - - # Determine SDK, architectures, and install_name based on platform and simulator flag. - local sdk="" - local archs="" - local min_version_flag="" - local install_name="" - - case "$platform" in - "ios") - if [[ "$is_simulator" == "true" ]]; then - sdk="iphonesimulator" - archs="arm64 x86_64" - min_version_flag="-mios-simulator-version-min=${IOS_MIN_OS_VERSION}" - else - sdk="iphoneos" - archs="arm64" - min_version_flag="-mios-version-min=${IOS_MIN_OS_VERSION}" - fi - install_name="@rpath/llama.framework/llama" - ;; - "macos") - sdk="macosx" - archs="arm64 x86_64" - min_version_flag="-mmacosx-version-min=${MACOS_MIN_OS_VERSION}" - install_name="@rpath/llama.framework/Versions/Current/llama" - ;; - "visionos") - if [[ "$is_simulator" == "true" ]]; then - sdk="xrsimulator" - archs="arm64 x86_64" - min_version_flag="-mtargetos=xros${VISIONOS_MIN_OS_VERSION}-simulator" - else - sdk="xros" - archs="arm64" - min_version_flag="-mtargetos=xros${VISIONOS_MIN_OS_VERSION}" - fi - # Use flat structure for visionOS, same as iOS - install_name="@rpath/llama.framework/llama" - ;; - "tvos") - if [[ "$is_simulator" == "true" ]]; then - sdk="appletvsimulator" - archs="arm64 x86_64" - min_version_flag="-mtvos-simulator-version-min=${TVOS_MIN_OS_VERSION}" - else - sdk="appletvos" - archs="arm64" - min_version_flag="-mtvos-version-min=${TVOS_MIN_OS_VERSION}" - fi - install_name="@rpath/llama.framework/llama" - ;; - esac - - # Build architecture flags - local arch_flags="" - for arch in $archs; do - arch_flags+=" -arch $arch" - done - - # Create dynamic library - echo "Creating dynamic library for ${platform}." 
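- # -Wl,-force_load keeps every object file from the combined archive, even those with no referenced symbols.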
- xcrun -sdk $sdk clang++ -dynamiclib \ - -isysroot $(xcrun --sdk $sdk --show-sdk-path) \ - $arch_flags \ - $min_version_flag \ - -Wl,-force_load,"${temp_dir}/combined.a" \ - -framework Foundation -framework Metal -framework Accelerate \ - -install_name "$install_name" \ - -o "${base_dir}/${output_lib}" - - # Platform-specific post-processing for device builds - if [[ "$is_simulator" == "false" ]]; then - if command -v xcrun vtool &>/dev/null; then - case "$platform" in - "ios") - echo "Marking binary as a framework binary for iOS..." - xcrun vtool -set-build-version ios ${IOS_MIN_OS_VERSION} ${IOS_MIN_OS_VERSION} -replace \ - -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}" - ;; - "visionos") - echo "Marking binary as a framework binary for visionOS..." - if [[ "$MAJOR_VERSION" -gt 16 ]] || [[ "$MAJOR_VERSION" -eq 16 && "$MINOR_VERSION" -gt 2 ]]; then - echo "Xcode version greater than 16.2, using visionOS." - VISION_OS_BUILD_VERSION="visionos" - else - echo "Xcode version less than or equal to 16.2, using xros." - VISION_OS_BUILD_VERSION="xros" - fi - xcrun vtool -set-build-version ${VISION_OS_BUILD_VERSION} ${VISIONOS_MIN_OS_VERSION} ${VISIONOS_MIN_OS_VERSION} -replace \ - -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}" - ;; - "tvos") - echo "Marking binary as a framework binary for tvOS..." - xcrun vtool -set-build-version tvos ${TVOS_MIN_OS_VERSION} ${TVOS_MIN_OS_VERSION} -replace \ - -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}" - ;; - esac - else - echo "Warning: vtool not found. Binary may not pass App Store validation." - fi - fi - - echo "Creating properly formatted dSYM..." - # Create a separate directory for dSYMs for all platforms - mkdir -p "${base_dir}/${build_dir}/dSYMs" - - # iOS and visionOS style dSYM (flat structure) - if [[ "$platform" == "ios" || "$platform" == "visionos" || "$platform" == "tvos" ]]; then - # Generate dSYM in the dSYMs directory - xcrun dsymutil "${base_dir}/${output_lib}" -o "${base_dir}/${build_dir}/dSYMs/llama.dSYM" - - # Create a copy of the binary that will be stripped - cp "${base_dir}/${output_lib}" "${temp_dir}/binary_to_strip" - - # Strip debug symbols from the copy - xcrun strip -S "${temp_dir}/binary_to_strip" -o "${temp_dir}/stripped_lib" - - # Replace the original with the stripped version - mv "${temp_dir}/stripped_lib" "${base_dir}/${output_lib}" - else - # macOS style dSYM - # First strip debug info to a separate file - xcrun strip -S "${base_dir}/${output_lib}" -o "${temp_dir}/stripped_lib" - - # Generate dSYM in the dSYMs directory - xcrun dsymutil "${base_dir}/${output_lib}" -o "${base_dir}/${build_dir}/dSYMs/llama.dSYM" - - # Replace original binary with stripped version - mv "${temp_dir}/stripped_lib" "${base_dir}/${output_lib}" - fi - - # Remove any automatically generated dSYM files in the framework structure as they will - # otherwise case Invalid Bundle Structure validation errors. - if [ -d "${base_dir}/${output_lib}.dSYM" ]; then - echo "Removing generated dSYM file in framework structure: ${base_dir}/${output_lib}.dSYM" - rm -rf "${base_dir}/${output_lib}.dSYM" - fi - - # Clean up - rm -rf "${temp_dir}" -} - -echo "Building for iOS simulator..." 
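-# One build tree per platform/SDK pair; the trees are merged into a single llama.xcframework at the end of the script.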
-cmake -B build-ios-sim -G Xcode \ - "${COMMON_CMAKE_ARGS[@]}" \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=${IOS_MIN_OS_VERSION} \ - -DIOS=ON \ - -DCMAKE_SYSTEM_NAME=iOS \ - -DCMAKE_OSX_SYSROOT=iphonesimulator \ - -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \ - -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphonesimulator \ - -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ - -S . -cmake --build build-ios-sim --config Release -- -quiet - -echo "Building for iOS devices..." -cmake -B build-ios-device -G Xcode \ - "${COMMON_CMAKE_ARGS[@]}" \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=${IOS_MIN_OS_VERSION} \ - -DCMAKE_OSX_SYSROOT=iphoneos \ - -DCMAKE_OSX_ARCHITECTURES="arm64" \ - -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphoneos \ - -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ - -S . -cmake --build build-ios-device --config Release -- -quiet - -echo "Building for macOS..." -cmake -B build-macos -G Xcode \ - "${COMMON_CMAKE_ARGS[@]}" \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=${MACOS_MIN_OS_VERSION} \ - -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \ - -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ - -S . -cmake --build build-macos --config Release -- -quiet - -echo "Building for visionOS..." -cmake -B build-visionos -G Xcode \ - "${COMMON_CMAKE_ARGS[@]}" \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=${VISIONOS_MIN_OS_VERSION} \ - -DCMAKE_OSX_ARCHITECTURES="arm64" \ - -DCMAKE_SYSTEM_NAME=visionOS \ - -DCMAKE_OSX_SYSROOT=xros \ - -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \ - -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ - -S . -cmake --build build-visionos --config Release -- -quiet - -echo "Building for visionOS simulator..." -cmake -B build-visionos-sim -G Xcode \ - "${COMMON_CMAKE_ARGS[@]}" \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=${VISIONOS_MIN_OS_VERSION} \ - -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \ - -DCMAKE_SYSTEM_NAME=visionOS \ - -DCMAKE_OSX_SYSROOT=xrsimulator \ - -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \ - -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ - -S . -cmake --build build-visionos-sim --config Release -- -quiet - -# Add tvOS builds (might need the same u_int definitions as watchOS and visionOS) -echo "Building for tvOS simulator..." -cmake -B build-tvos-sim -G Xcode \ - "${COMMON_CMAKE_ARGS[@]}" \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=${TVOS_MIN_OS_VERSION} \ - -DCMAKE_SYSTEM_NAME=tvOS \ - -DCMAKE_OSX_SYSROOT=appletvsimulator \ - -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \ - -DGGML_METAL=ON \ - -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvsimulator \ - -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ - -S . -cmake --build build-tvos-sim --config Release -- -quiet - -echo "Building for tvOS devices..." -cmake -B build-tvos-device -G Xcode \ - "${COMMON_CMAKE_ARGS[@]}" \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=${TVOS_MIN_OS_VERSION} \ - -DCMAKE_SYSTEM_NAME=tvOS \ - -DCMAKE_OSX_SYSROOT=appletvos \ - -DCMAKE_OSX_ARCHITECTURES="arm64" \ - -DGGML_METAL=ON \ - -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvos \ - -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ - -S . 
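-# the trailing "-- -quiet" is forwarded to xcodebuild to keep the build log short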
-cmake --build build-tvos-device --config Release -- -quiet - -# Setup frameworks and copy binaries and headers -echo "Setting up framework structures..." -setup_framework_structure "build-ios-sim" ${IOS_MIN_OS_VERSION} "ios" -setup_framework_structure "build-ios-device" ${IOS_MIN_OS_VERSION} "ios" -setup_framework_structure "build-macos" ${MACOS_MIN_OS_VERSION} "macos" -setup_framework_structure "build-visionos" ${VISIONOS_MIN_OS_VERSION} "visionos" -setup_framework_structure "build-visionos-sim" ${VISIONOS_MIN_OS_VERSION} "visionos" -setup_framework_structure "build-tvos-sim" ${TVOS_MIN_OS_VERSION} "tvos" -setup_framework_structure "build-tvos-device" ${TVOS_MIN_OS_VERSION} "tvos" - -# Create dynamic libraries from static libraries -echo "Creating dynamic libraries from static libraries..." -combine_static_libraries "build-ios-sim" "Release-iphonesimulator" "ios" "true" -combine_static_libraries "build-ios-device" "Release-iphoneos" "ios" "false" -combine_static_libraries "build-macos" "Release" "macos" "false" -combine_static_libraries "build-visionos" "Release-xros" "visionos" "false" -combine_static_libraries "build-visionos-sim" "Release-xrsimulator" "visionos" "true" -combine_static_libraries "build-tvos-sim" "Release-appletvsimulator" "tvos" "true" -combine_static_libraries "build-tvos-device" "Release-appletvos" "tvos" "false" - -# Create XCFramework with correct debug symbols paths -echo "Creating XCFramework..." -xcodebuild -create-xcframework \ - -framework $(pwd)/build-ios-sim/framework/llama.framework \ - -debug-symbols $(pwd)/build-ios-sim/dSYMs/llama.dSYM \ - -framework $(pwd)/build-ios-device/framework/llama.framework \ - -debug-symbols $(pwd)/build-ios-device/dSYMs/llama.dSYM \ - -framework $(pwd)/build-macos/framework/llama.framework \ - -debug-symbols $(pwd)/build-macos/dSYMS/llama.dSYM \ - -framework $(pwd)/build-visionos/framework/llama.framework \ - -debug-symbols $(pwd)/build-visionos/dSYMs/llama.dSYM \ - -framework $(pwd)/build-visionos-sim/framework/llama.framework \ - -debug-symbols $(pwd)/build-visionos-sim/dSYMs/llama.dSYM \ - -framework $(pwd)/build-tvos-device/framework/llama.framework \ - -debug-symbols $(pwd)/build-tvos-device/dSYMs/llama.dSYM \ - -framework $(pwd)/build-tvos-sim/framework/llama.framework \ - -debug-symbols $(pwd)/build-tvos-sim/dSYMs/llama.dSYM \ - -output $(pwd)/build-apple/llama.xcframework diff --git a/build.backend.sh b/build.backend.sh new file mode 100755 index 0000000000000..2904c4a15c73f --- /dev/null +++ b/build.backend.sh @@ -0,0 +1,36 @@ +# force isatty-->true, so that $0 |& head -50 has colors ... +rm -f READY_backend FAILED_backend + +echo "int isatty(int fd) { return 1; }" | gcc -O2 -fpic -shared -ldl -o /tmp/isatty.so -xc - +export LD_PRELOAD=/tmp/isatty.so + +if [[ "${PERF_MODE:-}" ]]; then + FLAVOR="-prod" +else + FLAVOR="" +fi + +export SDKROOT=$(xcrun --sdk macosx --show-sdk-path) + +if [[ "$FLAVOR" == "-prod" ]]; then + cat < +#undef GGML_LOG_DEBUG +#define GGML_LOG_DEBUG(...) + #undef MIN #undef MAX #define MIN(a, b) ((a) < (b) ? 
(a) : (b)) @@ -1076,8 +1079,6 @@ @implementation GGMLMetalClass GGML_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ return NULL; \ } \ - } else { \ - GGML_LOG_WARN("%s: skipping %-40s (not supported)\n", __func__, "kernel_"#name); \ } const bool has_simdgroup_mm = ctx_dev->has_simdgroup_mm; @@ -5139,9 +5140,53 @@ static bool ggml_metal_encode_node( return true; } +long long timer_start; +long long timer_total; +long long timer_count; + +static inline void start_timer(void) { + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); // CLOCK_MONOTONIC would be preferable for elapsed time + timer_start = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec; +} + +static inline void stop_timer(void) { + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); // CLOCK_MONOTONIC would be preferable for elapsed time + long long timer_end = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec; + + timer_total += (timer_end - timer_start); + timer_count += 1; +} + +static void show_timer(void) { + double ms = timer_total / 1.0e6; + double itl = ms/timer_count; + double speed = 1/itl * 1000; + + printf("METAL compute_graph: [%9.0f] ms for %lld invocations | ITL %.2f ms | throughput = %.2f t/s\n", ms, timer_count, itl, speed); + + timer_start = 0; + timer_total = 1; // non-zero, so that the signal/atexit hooks are not registered twice + timer_count = 0; +} + +static void show_timer_signal(int sig) { + GGML_UNUSED(sig); + show_timer(); +} + static enum ggml_status ggml_metal_graph_compute( ggml_backend_t backend, struct ggml_cgraph * gf) { + + if (timer_total == 0) { + signal(SIGUSR1, show_timer_signal); // kill -USR1 $(cat /tmp/krunkit.pid) + atexit(show_timer); + } + + start_timer(); + struct ggml_backend_metal_context * ctx = backend->context; struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; @@ -5269,6 +5314,8 @@ static enum ggml_status ggml_metal_graph_compute( } } + stop_timer(); + return GGML_STATUS_SUCCESS; } diff --git a/ggml/src/ggml-remotingbackend/CMakeLists.txt b/ggml/src/ggml-remotingbackend/CMakeLists.txt new file mode 100644 index 0000000000000..7e374d395f68c --- /dev/null +++ b/ggml/src/ggml-remotingbackend/CMakeLists.txt @@ -0,0 +1,20 @@ +cmake_minimum_required(VERSION 3.19) +cmake_policy(SET CMP0114 NEW) + +message(STATUS "Enable API Remoting backend") + +ggml_add_backend_library(ggml-remotingbackend + backend.cpp + backend-dispatched.cpp + backend-dispatched-backend.cpp + backend-dispatched-device.cpp + backend-dispatched-buffer.cpp + backend-dispatched-buffer-type.cpp + backend-utils.cpp + shared/api_remoting.h + shared/apir_backend.h + shared/venus_cs.h + venus_cs_ggml-rpc-back.cpp + ) + +target_compile_options(ggml-remotingbackend PRIVATE -std=c++20) diff --git a/ggml/src/ggml-remotingbackend/backend-convert.h b/ggml/src/ggml-remotingbackend/backend-convert.h new file mode 100644 index 0000000000000..b45c2784160ac --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-convert.h @@ -0,0 +1,15 @@ +#include "shared/apir_backend.h" + +#define BUFFER_TO_HOST_HANDLE(name) ggml_buffer_to_apir_handle(name) + +static inline apir_buffer_host_handle_t +ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) { + // in the backend, the buffer handle is the buffer pointer + return (apir_buffer_host_handle_t) buffer; +} + +static inline apir_buffer_type_host_handle_t +ggml_buffer_type_to_apir_handle(ggml_backend_buffer_type_t buft) { + // in the backend, the buffer-type handle is the buffer-type pointer + return (apir_buffer_type_host_handle_t) buft; +} diff --git
a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp new file mode 100644 index 0000000000000..f15f39c7f92d8 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp @@ -0,0 +1,57 @@ +#include +#include "backend-internal.h" +#include "backend-dispatched.h" + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" + +#include "shared/apir_backend.h" + +struct timer_data graph_compute_timer = {0, 0, 0, "compute_timer"}; + +uint32_t +backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(enc); + + start_timer(&graph_compute_timer); + + uint32_t shmem_res_id; + vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); + + const void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id); + if (!shmem_data) { + FATAL("Couldn't get the shmem addr from virgl :/"); + } + size_t cgraph_size; + vn_decode_size_t(dec, &cgraph_size); + + struct vn_cs_decoder secondary_dec = vn_cs_new_decoder((const char *) shmem_data, cgraph_size); + + ggml_cgraph *cgraph = vn_decode_ggml_cgraph(&secondary_dec, cgraph_size); + + ggml_status status; +#if APIR_BACKEND_CHECK_SUPPORTS_OP == 1 + for (int idx = 0; idx < cgraph->n_nodes; idx++) { + ggml_tensor *op = ggml_graph_node(cgraph, idx); + if (dev->iface.supports_op(dev, op)) { + continue; + } + ERROR("Graph node %d (%s) not supported by the backend :/", idx, ggml_op_desc(op)); + + status = GGML_STATUS_ABORTED; + vn_encode_ggml_status(enc, &status); + + stop_timer(&graph_compute_timer); + return 0; + } +#endif + status = bck->iface.graph_compute(bck, cgraph); + + vn_encode_ggml_status(enc, &status); + + stop_timer(&graph_compute_timer); + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp new file mode 100644 index 0000000000000..f925d1e066fc0 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp @@ -0,0 +1,81 @@ +#include +#include "backend-internal.h" +#include "backend-dispatched.h" + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" + +uint32_t +backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + ggml_backend_buffer_type_t buft; + buft = vn_decode_ggml_buffer_type(dec); + + const char *string = buft->iface.get_name(buft); + + const size_t string_size = strlen(string) + 1; + vn_encode_array_size(enc, string_size); + vn_encode_char_array(enc, string, string_size); + + return 0; +} + +uint32_t +backend_buffer_type_get_alignment(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + ggml_backend_buffer_type_t buft; + buft = vn_decode_ggml_buffer_type(dec); + + size_t value = buft->iface.get_alignment(buft); + vn_encode_size_t(enc, &value); + + return 0; +} + +uint32_t +backend_buffer_type_get_max_size(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + ggml_backend_buffer_type_t buft; + buft = vn_decode_ggml_buffer_type(dec); + + size_t value = buft->iface.get_max_size(buft); + vn_encode_size_t(enc, &value); + + return 0; +} + +uint32_t +backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + ggml_backend_buffer_type_t buft; + buft = 
vn_decode_ggml_buffer_type(dec); + + bool is_host = buft->iface.is_host(buft); + vn_encode_bool_t(enc, &is_host); + + return 0; +} + +uint32_t +backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + + ggml_backend_buffer_type_t buft; + buft = vn_decode_ggml_buffer_type(dec); + + size_t size; + vn_decode_size_t(dec, &size); + + ggml_backend_buffer_t buffer; + + buffer = buft->iface.alloc_buffer(buft, size); + + vn_encode_ggml_buffer(enc, buffer); + + if (buffer) { + track_backend_buffer(buffer); + } + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp new file mode 100644 index 0000000000000..fc1ccaef6748d --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp @@ -0,0 +1,143 @@ +#include +#include "backend-internal.h" +#include "backend-dispatched.h" + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" + +struct timer_data get_tensor_timer = {0, 0, 0, "get_tensor"}; +struct timer_data set_tensor_timer = {0, 0, 0, "set_tensor"}; + +uint32_t +backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + ggml_backend_buffer_t buffer; + buffer = vn_decode_ggml_buffer(dec); + + uintptr_t base = (uintptr_t) buffer->iface.get_base(buffer); + vn_encode_uintptr_t(enc, &base); + + return 0; +} + +uint32_t +backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(enc); + + start_timer(&set_tensor_timer); + + ggml_backend_buffer_t buffer; + buffer = vn_decode_ggml_buffer(dec); + + ggml_tensor *tensor; + // safe to remove the const qualifier here + tensor = (ggml_tensor *) (uintptr_t) vn_decode_ggml_tensor(dec); + + uint32_t shmem_res_id; + vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); + + size_t offset; + vn_decode_size_t(dec, &offset); + + size_t size; + vn_decode_size_t(dec, &size); + + void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id); + + if (!shmem_data) { + FATAL("Couldn't get the shmem addr from virgl :/"); + } + +#if 0 + INFO("Calling (%p)->set_tensor(tensor=%p, data=%p, offset=%lu, size=%lu", + buffer, tensor, shmem_data, offset, size); +#endif +#if 0 + void **addr = (void **)(uintptr_t) shmem_data; + for (int i = 0; i <= 10; i++) { + INFO("%s: %p | %llx", __func__, addr, *addr); + addr++; + } + INFO("\n"); +#endif + + buffer->iface.set_tensor(buffer, tensor, shmem_data, offset, size); + + stop_timer(&set_tensor_timer); + + return 0; +} + +uint32_t +backend_buffer_get_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(enc); + + start_timer(&get_tensor_timer); + + ggml_backend_buffer_t buffer; + buffer = vn_decode_ggml_buffer(dec); + + + const ggml_tensor *tensor; + // safe to remove the const qualifier here + tensor = vn_decode_ggml_tensor(dec); + + uint32_t shmem_res_id; + vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); + + size_t offset; + vn_decode_size_t(dec, &offset); + + size_t size; + vn_decode_size_t(dec, &size); + + void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id); + if (!shmem_data) { + FATAL("Couldn't get the shmem addr from virgl :/"); + } + + UNUSED(buffer); + UNUSED(tensor); + buffer->iface.get_tensor(buffer, tensor, shmem_data, offset, size); + + 
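+ // the requested bytes were written straight into the guest-visible shared page, so there is nothing to encode in the reply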
stop_timer(&get_tensor_timer); + + return 0; +} + +uint32_t +backend_buffer_clear(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(enc); + + ggml_backend_buffer_t buffer; + buffer = vn_decode_ggml_buffer(dec); + + uint8_t value; + vn_decode_uint8_t(dec, &value); + + buffer->iface.clear(buffer, value); + + return 0; +} + +uint32_t +backend_buffer_free_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(enc); + + ggml_backend_buffer_t buffer; + buffer = vn_decode_ggml_buffer(dec); + + if (!untrack_backend_buffer(buffer)) { + WARNING("%s: unknown buffer %p", __func__, (void *) buffer); + return 1; + } + + buffer->iface.free_buffer(buffer); + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp new file mode 100644 index 0000000000000..473e9d2db7089 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp @@ -0,0 +1,142 @@ +#include +#include "backend-internal.h" +#include "backend-dispatched.h" + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" + +uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(dec); + + int32_t dev_count = reg->iface.get_device_count(reg); + vn_encode_int32_t(enc, &dev_count); + + return 0; +} + +uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(dec); + + const char *string = dev->iface.get_name(dev); + + const size_t string_size = strlen(string) + 1; + vn_encode_array_size(enc, string_size); + vn_encode_char_array(enc, string, string_size); + + return 0; +} + +uint32_t +backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(dec); + + const char *string = dev->iface.get_description(dev); + + const size_t string_size = strlen(string) + 1; + vn_encode_array_size(enc, string_size); + vn_encode_char_array(enc, string, string_size); + + return 0; +} + +uint32_t +backend_device_get_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(dec); + + uint32_t type = dev->iface.get_type(dev); + vn_encode_uint32_t(enc, &type); + + return 0; +} + +uint32_t +backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(dec); + + size_t free, total; + dev->iface.get_memory(dev, &free, &total); + + vn_encode_size_t(enc, &free); + vn_encode_size_t(enc, &total); + + return 0; +} + +uint32_t +backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + + const ggml_tensor *op = vn_decode_ggml_tensor_inplace(dec); + + bool supports_op = dev->iface.supports_op(dev, op); + + vn_encode_bool_t(enc, &supports_op); + + return 0; +} + +uint32_t +backend_device_get_buffer_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(dec); + + ggml_backend_buffer_type_t bufft = dev->iface.get_buffer_type(dev); + + vn_encode_ggml_buffer_type(enc, bufft); + + return 0; +} + +uint32_t +backend_device_get_props(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct
virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(dec); + + struct ggml_backend_dev_props props; + dev->iface.get_props(dev, &props); + + vn_encode_bool_t(enc, &props.caps.async); + vn_encode_bool_t(enc, &props.caps.host_buffer); + vn_encode_bool_t(enc, &props.caps.buffer_from_host_ptr); + vn_encode_bool_t(enc, &props.caps.events); + + return 0; +} + +uint32_t +backend_device_buffer_from_ptr(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(dec); + + uint32_t shmem_res_id; + vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); + + void *shmem_ptr = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id); + if (!shmem_ptr) { + FATAL("Couldn't get the shmem addr from virgl :/"); + } + + size_t size; + vn_decode_size_t(dec, &size); + size_t max_tensor_size; + vn_decode_size_t(dec, &max_tensor_size); + + ggml_backend_buffer_t buffer; + buffer = dev->iface.buffer_from_host_ptr(dev, shmem_ptr, size, max_tensor_size); + + vn_encode_ggml_buffer(enc, buffer); + vn_encode_ggml_buffer_type(enc, buffer->buft); + + if (buffer) { + track_backend_buffer(buffer); + } + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp new file mode 100644 index 0000000000000..d90424a3d714f --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp @@ -0,0 +1,47 @@ +#include +#include "backend-internal.h" +#include "backend-dispatched.h" + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" + +#include "ggml-metal.h" + +ggml_backend_reg_t reg = NULL; +ggml_backend_dev_t dev = NULL; +ggml_backend_t bck = NULL; + +long long timer_start = 0; +long long timer_total = 0; +long long timer_count = 0; + +uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_backend_init_fct_p) { + if (reg != NULL) { + FATAL("%s: already initialized :/", __func__); + } + ggml_backend_reg_t (* ggml_backend_reg_fct)(void) = (ggml_backend_reg_t (*)()) ggml_backend_reg_fct_p; + + reg = ggml_backend_reg_fct(); + if (reg == NULL) { + FATAL("%s: backend registration failed :/", __func__); + } + + if (reg->iface.get_device_count(reg)) { + dev = reg->iface.get_device(reg, 0); + } + + ggml_backend_t (* ggml_backend_fct)(int) = (ggml_backend_t (*)(int)) ggml_backend_init_fct_p; + + bck = ggml_backend_fct(0); + if (!bck) { + ERROR("%s: backend initialization failed :/", __func__); + return APIR_BACKEND_INITIALIZE_BACKEND_FAILED; + } + + size_t free, total; + dev->iface.get_memory(dev, &free, &total); + WARNING("%s: free memory: %ld MB\n", __func__, (size_t) free/1024/1024); + + return APIR_BACKEND_INITIALIZE_SUCCESSS; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h new file mode 100644 index 0000000000000..3c164b532ac95 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -0,0 +1,111 @@ +#pragma once + +#include +#include + +#include + +#include "backend-utils.h" +#include "backend-convert.h" +#include "shared/apir_backend.h" +#include "shared/venus_cs.h" +#include "shared/venus_cs_ggml.h" + +uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_backend_init_fct_p); + +typedef uint32_t (*backend_dispatch_t)(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); + +/* *** */ + +uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); + 
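+/* Each dispatched handler below follows the same three-step pattern:
+ * decode its arguments from the guest's Venus stream, run the matching
+ * ggml iface call on the host, then encode the reply. Illustrative recap
+ * (mirrors backend_buffer_type_get_alignment):
+ *
+ *   ggml_backend_buffer_type_t buft = vn_decode_ggml_buffer_type(dec); // 1. decode
+ *   size_t value = buft->iface.get_alignment(buft);                    // 2. execute
+ *   vn_encode_size_t(enc, &value);                                     // 3. encode
+ */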
+/* device */ +uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_get_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_get_buffer_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_get_props(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_buffer_from_ptr(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); + +/* buffer-type */ +uint32_t backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_type_get_alignment(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_type_get_max_size(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); + +/* buffer */ +uint32_t backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_get_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_clear(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_free_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); + +/* backend */ +uint32_t backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); + +static inline const char *backend_dispatch_command_name(ApirBackendCommandType type) +{ + switch (type) { + /* device */ + case APIR_COMMAND_TYPE_DEVICE_GET_COUNT: return "backend_get_device_count"; + case APIR_COMMAND_TYPE_DEVICE_GET_NAME: return "backend_get_device_name"; + case APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION: return "backend_get_device_description"; + case APIR_COMMAND_TYPE_DEVICE_GET_TYPE: return "backend_device_get_type"; + case APIR_COMMAND_TYPE_DEVICE_GET_MEMORY: return "backend_get_device_memory"; + case APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP: return "backend_device_supports_op"; + case APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE: return "backend_get_buffer_type"; + case APIR_COMMAND_TYPE_DEVICE_GET_PROPS: return "backend_get_props"; + case APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR: return "backend_buffer_from_ptr"; + + /* buffer-type */ + case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME: return "backend_buffer_type_get_name"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT: return "backend_buffer_type_get_alignment"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE: return 
"backend_buffer_type_get_max_size"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST: return "backend_buffer_type_is_host"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER: return "backend_buffer_type_alloc_buffer"; + + /* buffer */ + case APIR_COMMAND_TYPE_BUFFER_GET_BASE: return "backend_buffer_get_base"; + case APIR_COMMAND_TYPE_BUFFER_SET_TENSOR: return "backend_buffer_set_tensor"; + case APIR_COMMAND_TYPE_BUFFER_GET_TENSOR: return "backend_buffer_get_tensor"; + case APIR_COMMAND_TYPE_BUFFER_CLEAR: return "backend_buffer_clear"; + case APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER: return "backend_buffer_free_buffer"; + + /* backend */ + case APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE: return "backend_graph_compute"; + default: return "unknown"; + } +} + +static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATCH_TABLE_COUNT] = { + /* device */ + [APIR_COMMAND_TYPE_DEVICE_GET_COUNT] = backend_reg_get_device_count, + [APIR_COMMAND_TYPE_DEVICE_GET_NAME] = backend_device_get_name, + [APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION] = backend_device_get_description, + [APIR_COMMAND_TYPE_DEVICE_GET_TYPE] = backend_device_get_type, + [APIR_COMMAND_TYPE_DEVICE_GET_MEMORY] = backend_device_get_memory, + [APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP] = backend_device_supports_op, + [APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE] = backend_device_get_buffer_type, + [APIR_COMMAND_TYPE_DEVICE_GET_PROPS] = backend_device_get_props, + [APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR] = backend_device_buffer_from_ptr, + + /* buffer-type */ + [APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME] = backend_buffer_type_get_name, + [APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT] = backend_buffer_type_get_alignment, + [APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE] = backend_buffer_type_get_max_size, + [APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST] = backend_buffer_type_is_host, + [APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER] = backend_buffer_type_alloc_buffer, + + /* buffer */ + [APIR_COMMAND_TYPE_BUFFER_GET_BASE] = backend_buffer_get_base, + [APIR_COMMAND_TYPE_BUFFER_SET_TENSOR] = backend_buffer_set_tensor, + [APIR_COMMAND_TYPE_BUFFER_GET_TENSOR] = backend_buffer_get_tensor, + [APIR_COMMAND_TYPE_BUFFER_CLEAR] = backend_buffer_clear, + [APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER] = backend_buffer_free_buffer, + + /* backend */ + [APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE] = backend_graph_compute, +}; diff --git a/ggml/src/ggml-remotingbackend/backend-internal.h b/ggml/src/ggml-remotingbackend/backend-internal.h new file mode 100644 index 0000000000000..41bc42dbc0e36 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-internal.h @@ -0,0 +1,29 @@ +#include +#include +#include + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" + +extern ggml_backend_reg_t reg; +extern ggml_backend_dev_t dev; +extern ggml_backend_t bck; + +#define NOT_IMPLEMENTED \ + do { \ + static bool first = true; \ + if (first) { \ + printf("\nWARN: ###\nWARN: ### reached unimplemented function %s\nWARN: ###\n\n", __func__); \ + first = false; \ + } \ + } while(0) + +extern "C" { + uint32_t apir_backend_initialize(); + void apir_backend_deinit(void); + uint32_t apir_backend_dispatcher(uint32_t cmd_type, struct virgl_apir_context *ctx, + char *dec_cur, const char *dec_end, + char *enc_cur, const char *enc_end, + char **enc_cur_after); +} diff --git a/ggml/src/ggml-remotingbackend/backend-utils.cpp b/ggml/src/ggml-remotingbackend/backend-utils.cpp new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git 
a/ggml/src/ggml-remotingbackend/backend-utils.h b/ggml/src/ggml-remotingbackend/backend-utils.h new file mode 100644 index 0000000000000..b032061a96947 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-utils.h @@ -0,0 +1,53 @@ +#pragma once + +#include +#include +#include + +#include + +#define UNUSED GGML_UNUSED + +inline void +INFO(const char *format, ...) { + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); +} + +inline void +WARNING(const char *format, ...) { + fprintf(stderr, "WARNING: "); + + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); +} + +inline void +ERROR(const char *format, ...) { + fprintf(stderr, "ERROR: "); + + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); +} + +inline void +FATAL(const char *format, ...) { + fprintf(stderr, "FATAL: "); + + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); + if (format) + assert(false); +} diff --git a/ggml/src/ggml-remotingbackend/backend.cpp b/ggml/src/ggml-remotingbackend/backend.cpp new file mode 100644 index 0000000000000..95dee556cff3f --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend.cpp @@ -0,0 +1,134 @@ +#include +#include + +#include + +#include "backend-utils.h" +#include "backend-internal.h" +#include "backend-dispatched.h" + +#include "shared/apir_backend.h" +#include "shared/venus_cs.h" + +#define GGML_BACKEND_LIBRARY_PATH_ENV "APIR_LLAMA_CPP_GGML_LIBRARY_PATH" +#define GGML_BACKEND_LIBRARY_REG_ENV "APIR_LLAMA_CPP_GGML_LIBRARY_REG" +#define GGML_BACKEND_LIBRARY_INIT_ENV "APIR_LLAMA_CPP_GGML_LIBRARY_INIT" + + +static void *backend_library_handle = NULL; + +extern "C" { + void apir_backend_deinit(void) { + auto buffers = get_track_backend_buffers(); + for (const auto& buffer: buffers) { + untrack_backend_buffer(buffer); + buffer->iface.free_buffer(buffer); + } + + size_t free, total; + dev->iface.get_memory(dev, &free, &total); + WARNING("%s: free memory: %ld MB\n", __func__, (size_t) free/1024/1024); + + show_timer(&graph_compute_timer); + show_timer(&set_tensor_timer); + show_timer(&get_tensor_timer); + /* *** */ + + if (backend_library_handle) { + INFO("%s: The GGML backend library was loaded. 
Unloading it.", __func__); + dlclose(backend_library_handle); + } + + INFO("%s: bye-bye", __func__); + } + + uint32_t apir_backend_initialize() { + const char* dlsym_error; + + const char* library_name = getenv(GGML_BACKEND_LIBRARY_PATH_ENV); + const char* library_reg = getenv(GGML_BACKEND_LIBRARY_REG_ENV); + const char* library_init = getenv(GGML_BACKEND_LIBRARY_INIT_ENV); + + INFO("%s: loading %s (%s|%s)", __func__, library_name, library_reg, library_init); + + if (!library_name) { + ERROR("Cannot open library: env var '%s' not defined\n", GGML_BACKEND_LIBRARY_PATH_ENV); + + return APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY; + } + + backend_library_handle = dlopen(library_name, RTLD_LAZY); + + if (!backend_library_handle) { + ERROR("Cannot open library: %s\n", dlerror()); + + return APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY; + } + + if (!library_reg) { + ERROR("Cannot register library: env var '%s' not defined\n", GGML_BACKEND_LIBRARY_REG_ENV); + + return APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY; + } + + void *ggml_backend_reg_fct = dlsym(backend_library_handle, library_reg); + dlsym_error = dlerror(); + if (dlsym_error) { + ERROR("Cannot load symbol: %s\n", dlsym_error); + + return APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS; + } + + if (!library_init) { + ERROR("Cannot initialize library: env var '%s' not defined\n", GGML_BACKEND_LIBRARY_INIT_ENV); + + return APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY; + } + + void *ggml_backend_init_fct = dlsym(backend_library_handle, library_init); + dlsym_error = dlerror(); + if (dlsym_error) { + ERROR("Cannot load symbol: %s\n", dlsym_error); + + return APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS; + } + + return backend_dispatch_initialize(ggml_backend_reg_fct, ggml_backend_init_fct); + } + + uint32_t apir_backend_dispatcher(uint32_t cmd_type, struct virgl_apir_context *ctx, + char *dec_cur, const char *dec_end, + char *enc_cur, const char *enc_end, + char **enc_cur_after) { + struct vn_cs_encoder _enc = { + .cur = enc_cur, + .end = enc_end, + }; + struct vn_cs_encoder *enc = &_enc; + + struct vn_cs_decoder _dec = { + .cur = dec_cur, + .end = dec_end, + }; + struct vn_cs_decoder *dec = &_dec; + + + if (cmd_type >= APIR_BACKEND_DISPATCH_TABLE_COUNT) { + ERROR("Received an invalid dispatch index (%d >= %d)\n", + cmd_type, APIR_BACKEND_DISPATCH_TABLE_COUNT); + return APIR_BACKEND_FORWARD_INDEX_INVALID; + } + +#if 0 + static long long count = 0; + INFO("[%lld] Calling %s", count, backend_dispatch_command_name((ApirBackendCommandType) cmd_type)); + count += 1; +#endif + backend_dispatch_t forward_fct = apir_backend_dispatch_table[cmd_type]; + uint32_t ret = forward_fct(enc, dec, ctx); + + *enc_cur_after = enc->cur; + + return ret; + } +} diff --git a/ggml/src/ggml-remotingbackend/shared/api_remoting.h b/ggml/src/ggml-remotingbackend/shared/api_remoting.h new file mode 100644 index 0000000000000..6e594a8ae4ab8 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/shared/api_remoting.h @@ -0,0 +1,12 @@ +#define VIRGL_APIR_COMMAND_TYPE_LoadLibrary 255 +#define VIRGL_APIR_COMMAND_TYPE_Forward 256 + + +static inline const char *api_remoting_command_name(int32_t type) +{ + switch (type) { + case VIRGL_APIR_COMMAND_TYPE_LoadLibrary: return "LoadLibrary"; + case VIRGL_APIR_COMMAND_TYPE_Forward: return "Forward"; + default: return "unknown"; + } +} diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h new file mode 100644 index 0000000000000..80e5961ff04b5 --- /dev/null +++
diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h
new file mode 100644
index 0000000000000..80e5961ff04b5
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h
@@ -0,0 +1,108 @@
+#pragma once
+
+#include <stdint.h>
+#include <time.h>
+
+#define APIR_BACKEND_INITIALIZE_SUCCESSS 0
+#define APIR_BACKEND_INITIALIZE_CANNOT_OPEN_BACKEND_LIBRARY 1
+#define APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY 2
+#define APIR_BACKEND_INITIALIZE_MISSING_BACKEND_SYMBOLS 3
+#define APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS 4
+#define APIR_BACKEND_INITIALIZE_BACKEND_FAILED 5
+
+#define APIR_BACKEND_FORWARD_INDEX_INVALID 6
+
+// 1 is fast, 0 avoids micro-benchmark crashes
+#define APIR_DEVICE_SUPPORTS_OP_ALWAYS_TRUE 1
+
+// 0 is fast, 1 avoids crashing the backend if an unsupported tensor is received
+#define APIR_BACKEND_CHECK_SUPPORTS_OP 0
+
+typedef uintptr_t apir_buffer_type_host_handle_t;
+typedef uintptr_t apir_buffer_host_handle_t;
+
+typedef struct {
+  apir_buffer_host_handle_t host_handle;
+
+  struct vn_renderer_shmem *shmem;
+  apir_buffer_type_host_handle_t buft_host_handle;
+} apir_buffer_context_t;
+
+struct vn_dispatch_context;
+struct virgl_apir_context;
+
+typedef enum ApirBackendCommandType {
+  /* device */
+  APIR_COMMAND_TYPE_DEVICE_GET_COUNT = 0,
+  APIR_COMMAND_TYPE_DEVICE_GET_NAME = 1,
+  APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION = 2,
+  APIR_COMMAND_TYPE_DEVICE_GET_TYPE = 3,
+  APIR_COMMAND_TYPE_DEVICE_GET_MEMORY = 4,
+  APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP = 5,
+  APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE = 6,
+  APIR_COMMAND_TYPE_DEVICE_GET_PROPS = 7,
+  APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR = 8,
+
+  /* buffer-type */
+  APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME = 9,
+  APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT = 10,
+  APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE = 11,
+  APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST = 12,
+  APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER = 13,
+
+  /* buffer */
+  APIR_COMMAND_TYPE_BUFFER_GET_BASE = 14,
+  APIR_COMMAND_TYPE_BUFFER_SET_TENSOR = 15,
+  APIR_COMMAND_TYPE_BUFFER_GET_TENSOR = 16,
+  APIR_COMMAND_TYPE_BUFFER_CLEAR = 17,
+  APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER = 18,
+
+  /* backend */
+  APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE = 19,
+
+  // last command_type index + 1
+  APIR_BACKEND_DISPATCH_TABLE_COUNT = 20,
+} ApirBackendCommandType;
+
+
+struct virgl_apir_callbacks {
+  void *(*get_shmem_ptr)(struct vn_dispatch_context *ctx, uint32_t res_id);
+};
+
+struct virgl_apir_context {
+  struct vn_dispatch_context *virgl_ctx;
+
+  struct virgl_apir_callbacks iface;
+};
+
+struct timer_data {
+  long long start;
+  long long total;
+  long long count;
+  const char *name;
+};
+
+extern struct timer_data graph_compute_timer;
+extern struct timer_data get_tensor_timer;
+extern struct timer_data set_tensor_timer;
+
+static inline void start_timer(struct timer_data *timer) {
+  struct timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts); // monotonic clock, suitable for measuring elapsed time
+  timer->start = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
+}
+
+static inline void stop_timer(struct timer_data *timer) {
+  struct timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts); // monotonic clock, suitable for measuring elapsed time
+  long long timer_end = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
+
+  timer->total += (timer_end - timer->start);
+  timer->count += 1;
+}
+
+static inline void show_timer(struct timer_data *timer) {
+  double ms = timer->total / 1e6;
+  double itl = ms / timer->count;
+  double speed = 1 / itl * 1000;
+
+  INFO("%14s [%9.0f] ms for %4lld invocations | ITL %2.2f ms | throughput = %4.2f t/s",
+       timer->name, ms, timer->count, itl, speed);
+}
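A minimal usage sketch for the timer helpers above (hypothetical call site, not part of the patch): bracket the hot path with start_timer()/stop_timer(), then report the aggregate at shutdown with show_timer():

    static struct timer_data my_timer = {0, 0, 0, "my_op"};

    void my_op(void) {
        start_timer(&my_timer);
        // ... forwarded work ...
        stop_timer(&my_timer);
    }

    // at exit: prints total ms, invocation count, per-call latency (ITL) and throughput
    // show_timer(&my_timer);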
diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h
new file mode 100644
index 0000000000000..e67c99a46b5b6
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h
@@ -0,0 +1,554 @@
+#pragma once
+
+#include <cassert>
+#include <cstring>
+
+// needs UNUSED to be defined
+// needs FATAL to be defined
+
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+
+struct vn_cs_encoder {
+  char* cur;
+  const char *start;
+  const char* end;
+};
+
+struct vn_cs_decoder {
+  const char* cur;
+  const char* end;
+};
+
+/*
+ * new encoder and decoder
+ */
+
+static struct vn_cs_decoder
+vn_cs_new_decoder(const char *ptr, size_t size) {
+  struct vn_cs_decoder dec = {
+    .cur = ptr,
+    .end = ptr + size,
+  };
+
+  return dec;
+}
+
+static struct vn_cs_encoder
+vn_cs_new_encoder(char *ptr, size_t size) {
+  struct vn_cs_encoder enc = {
+    .cur = ptr,
+    .start = ptr,
+    .end = ptr + size,
+  };
+
+  return enc;
+}
+
+/*
+ * decoder peek
+ */
+
+static inline bool
+vn_cs_decoder_peek_internal(const struct vn_cs_decoder *dec,
+                            size_t size,
+                            void *val,
+                            size_t val_size)
+{
+  assert(val_size <= size);
+
+  if (unlikely(size > (size_t) (dec->end - dec->cur))) {
+    FATAL("READING TOO MUCH FROM THE DECODER :/");
+    //vn_cs_decoder_set_fatal(dec);
+    memset(val, 0, val_size);
+    return false;
+  }
+
+  /* we should not rely on the compiler to optimize away memcpy... */
+  memcpy(val, dec->cur, val_size);
+  return true;
+}
+
+static inline void
+vn_cs_decoder_peek(const struct vn_cs_decoder *dec,
+                   size_t size,
+                   void *val,
+                   size_t val_size)
+{
+  vn_cs_decoder_peek_internal(dec, size, val, val_size);
+}
+
+static inline const void *
+vn_cs_decoder_use_inplace(struct vn_cs_decoder *dec,
+                          size_t size)
+{
+  if (unlikely(size > (size_t) (dec->end - dec->cur))) {
+    FATAL("READING TOO MUCH FROM THE DECODER :/");
+  }
+  const void *addr = dec->cur;
+  dec->cur += size;
+
+  return addr;
+}
+
+/*
+ * read/write
+ */
+
+static inline void
+vn_cs_decoder_read(struct vn_cs_decoder *dec,
+                   size_t size,
+                   void *val,
+                   size_t val_size)
+{
+  if (vn_cs_decoder_peek_internal(dec, size, val, val_size))
+    dec->cur += size;
+}
+
+static inline char *
+vn_cs_encoder_write(struct vn_cs_encoder *enc,
+                    size_t size,
+                    const void *val,
+                    size_t val_size)
+{
+  assert(val_size <= size);
+  assert(size <= ((size_t) (enc->end - enc->cur)));
+
+  char *write_addr = enc->cur;
+  /* we should not rely on the compiler to optimize away memcpy... 
*/ + memcpy(write_addr, val, val_size); + enc->cur += size; + + return write_addr; +} + +/* + * encode/decode + */ + +static inline void +vn_decode(struct vn_cs_decoder *dec, size_t size, void *data, size_t data_size) +{ + assert(size % 4 == 0); + vn_cs_decoder_read(dec, size, data, data_size); +} + +static inline void +vn_encode(struct vn_cs_encoder *enc, size_t size, const void *data, size_t data_size) +{ + assert(size % 4 == 0); + /* TODO check if the generated code is optimal */ + vn_cs_encoder_write(enc, size, data, data_size); +} + +/* + * typed encode/decode + */ + +/* uint8_t */ + +static inline void +vn_encode_uint8_t(struct vn_cs_encoder *enc, const uint8_t *val) +{ + vn_encode(enc, sizeof(int), val, sizeof(*val)); +} + +static inline void +vn_decode_uint8_t(struct vn_cs_decoder *dec, uint8_t *val) +{ + vn_decode(dec, sizeof(int), val, sizeof(*val)); +} + +/* uint64_t */ + +static inline size_t +vn_sizeof_uint64_t(const uint64_t *val) +{ + assert(sizeof(*val) == 8); +#ifdef NDEBUG + UNUSED(val); +#endif + return 8; +} + +static inline void +vn_encode_uint64_t(struct vn_cs_encoder *enc, const uint64_t *val) +{ + vn_encode(enc, 8, val, sizeof(*val)); +} + +static inline void +vn_decode_uint64_t(struct vn_cs_decoder *dec, uint64_t *val) +{ + vn_decode(dec, 8, val, sizeof(*val)); +} + +static inline size_t +vn_sizeof_uint64_t_array(const uint64_t *val, uint32_t count) +{ + assert(sizeof(*val) == 8); + const size_t size = sizeof(*val) * count; + assert(size >= count); + return size; +} + +static inline void +vn_encode_uint64_t_array(struct vn_cs_encoder *enc, const uint64_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_encode(enc, size, val, size); +} + +static inline void +vn_decode_uint64_t_array(struct vn_cs_decoder *dec, uint64_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_decode(dec, size, val, size); +} + +static inline const uint64_t * +vn_decode_uint64_t_array_inplace(struct vn_cs_decoder *dec, uint32_t count) +{ + return (uint64_t *)(uintptr_t) vn_cs_decoder_use_inplace(dec, count * sizeof(uint64_t)); +} + +/* int32_t */ + +static inline size_t +vn_sizeof_int32_t(const int32_t *val) +{ + assert(sizeof(*val) == 4); +#ifdef NDEBUG + UNUSED(val); +#endif + return 4; +} + +static inline void +vn_encode_int32_t(struct vn_cs_encoder *enc, const int32_t *val) +{ + vn_encode(enc, 4, val, sizeof(*val)); +} + +static inline void +vn_decode_int32_t(struct vn_cs_decoder *dec, int32_t *val) +{ + vn_decode(dec, 4, val, sizeof(*val)); +} + +static inline size_t +vn_sizeof_int32_t_array(const int32_t *val, uint32_t count) +{ + assert(sizeof(*val) == 4); + const size_t size = sizeof(*val) * count; + assert(size >= count); + return size; +} + +static inline void +vn_encode_int32_t_array(struct vn_cs_encoder *enc, const int32_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_encode(enc, size, val, size); +} + +static inline void +vn_decode_int32_t_array(struct vn_cs_decoder *dec, int32_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_decode(dec, size, val, size); +} + +/* array size (uint64_t) */ + +static inline size_t +vn_sizeof_array_size(uint64_t size) +{ + return vn_sizeof_uint64_t(&size); +} + +static inline void +vn_encode_array_size(struct vn_cs_encoder *enc, uint64_t size) +{ + vn_encode_uint64_t(enc, &size); +} + +static inline uint64_t +vn_decode_array_size(struct 
vn_cs_decoder *dec, uint64_t expected_size) +{ + uint64_t size; + vn_decode_uint64_t(dec, &size); + if (size != expected_size) { + FATAL("ENCODER IS FULL :/"); + //vn_cs_decoder_set_fatal(dec); + size = 0; + } + return size; +} + +static inline uint64_t +vn_decode_array_size_unchecked(struct vn_cs_decoder *dec) +{ + uint64_t size; + vn_decode_uint64_t(dec, &size); + return size; +} + +static inline uint64_t +vn_peek_array_size(struct vn_cs_decoder *dec) +{ + uint64_t size; + vn_cs_decoder_peek(dec, sizeof(size), &size, sizeof(size)); + return size; +} + +/* non-array pointer */ + +static inline size_t +vn_sizeof_simple_pointer(const void *val) +{ + return vn_sizeof_array_size(val ? 1 : 0); +} + +static inline bool +vn_encode_simple_pointer(struct vn_cs_encoder *enc, const void *val) +{ + vn_encode_array_size(enc, val ? 1 : 0); + return val; +} + +static inline bool +vn_decode_simple_pointer(struct vn_cs_decoder *dec) +{ + return vn_decode_array_size_unchecked(dec); +} + +/* uint32_t */ + +static inline size_t +vn_sizeof_uint32_t(const uint32_t *val) +{ + assert(sizeof(*val) == 4); +#ifdef NDEBUG + UNUSED(val); +#endif + return 4; +} + +static inline void +vn_encode_uint32_t(struct vn_cs_encoder *enc, const uint32_t *val) +{ + vn_encode(enc, 4, val, sizeof(*val)); +} + +static inline void +vn_decode_uint32_t(struct vn_cs_decoder *dec, uint32_t *val) +{ + vn_decode(dec, 4, val, sizeof(*val)); +} + +static inline size_t +vn_sizeof_uint32_t_array(const uint32_t *val, uint32_t count) +{ + assert(sizeof(*val) == 4); + const size_t size = sizeof(*val) * count; + assert(size >= count); + return size; +} + +static inline void +vn_encode_uint32_t_array(struct vn_cs_encoder *enc, const uint32_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_encode(enc, size, val, size); +} + +static inline void +vn_decode_uint32_t_array(struct vn_cs_decoder *dec, uint32_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_decode(dec, size, val, size); +} + +/* size_t */ + +static inline size_t +vn_sizeof_size_t(const size_t *val) +{ + return sizeof(*val); +} + +static inline void +vn_encode_size_t(struct vn_cs_encoder *enc, const size_t *val) +{ + const uint64_t tmp = *val; + vn_encode_uint64_t(enc, &tmp); +} + +static inline void +vn_decode_size_t(struct vn_cs_decoder *dec, size_t *val) +{ + uint64_t tmp; + vn_decode_uint64_t(dec, &tmp); + *val = tmp; +} + +static inline size_t +vn_sizeof_size_t_array(const size_t *val, uint32_t count) +{ + return vn_sizeof_size_t(val) * count; +} + +static inline void +vn_encode_size_t_array(struct vn_cs_encoder *enc, const size_t *val, uint32_t count) +{ + if (sizeof(size_t) == sizeof(uint64_t)) { + vn_encode_uint64_t_array(enc, (const uint64_t *)val, count); + } else { + for (uint32_t i = 0; i < count; i++) + vn_encode_size_t(enc, &val[i]); + } +} + +static inline void +vn_decode_size_t_array(struct vn_cs_decoder *dec, size_t *val, uint32_t count) +{ + if (sizeof(size_t) == sizeof(uint64_t)) { + vn_decode_uint64_t_array(dec, (uint64_t *)val, count); + } else { + for (uint32_t i = 0; i < count; i++) + vn_decode_size_t(dec, &val[i]); + } +} + +/* opaque blob */ + +static inline size_t +vn_sizeof_blob_array(const void *val, size_t size) +{ + UNUSED(val); + return (size + 3) & ~3; +} + +static inline void +vn_encode_blob_array(struct vn_cs_encoder *enc, const void *val, size_t size) +{ + vn_encode(enc, (size + 3) & ~3, val, size); +} + +static inline void +vn_decode_blob_array(struct 
vn_cs_decoder *dec, void *val, size_t size)
+{
+  vn_decode(dec, (size + 3) & ~3, val, size);
+}
+
+/* string */
+
+static inline size_t
+vn_sizeof_char_array(const char *val, size_t size)
+{
+  return vn_sizeof_blob_array(val, size);
+}
+
+static inline void
+vn_encode_char_array(struct vn_cs_encoder *enc, const char *val, size_t size)
+{
+  assert(size && strlen(val) < size);
+  vn_encode_blob_array(enc, val, size);
+}
+
+static inline void
+vn_decode_char_array(struct vn_cs_decoder *dec, char *val, size_t size)
+{
+  vn_decode_blob_array(dec, val, size);
+  if (size)
+    val[size - 1] = '\0';
+  else {
+    //vn_cs_decoder_set_fatal(dec);
+    FATAL("Couldn't decode the blob array");
+  }
+}
+
+/* (temp) buffer allocation */
+
+static inline void *
+vkr_cs_decoder_alloc_array(struct vkr_cs_decoder *dec, size_t size, size_t count)
+{
+  UNUSED(dec);
+  size_t alloc_size;
+  if (unlikely(__builtin_mul_overflow(size, count, &alloc_size))) {
+    FATAL("overflow in array allocation of %zu * %zu bytes", size, count);
+    return NULL;
+  }
+
+  return malloc(alloc_size);
+}
+
+static inline void *
+vn_cs_decoder_alloc_array(struct vn_cs_decoder *dec, size_t size, size_t count)
+{
+  struct vkr_cs_decoder *d = (struct vkr_cs_decoder *)dec;
+  return vkr_cs_decoder_alloc_array(d, size, count);
+}
+
+/* bool */
+
+static inline void
+vn_encode_bool_t(struct vn_cs_encoder *enc, const bool *val)
+{
+  vn_encode(enc, sizeof(int), val, sizeof(bool));
+}
+
+static inline void
+vn_decode_bool_t(struct vn_cs_decoder *dec, bool *val)
+{
+  vn_decode(dec, sizeof(int), val, sizeof(bool));
+}
+
+/* apir_buffer_type_host_handle_t */
+
+static inline void
+vn_encode_apir_buffer_type_host_handle_t(struct vn_cs_encoder *enc, const apir_buffer_type_host_handle_t *val)
+{
+  vn_encode(enc, sizeof(apir_buffer_type_host_handle_t), val, sizeof(apir_buffer_type_host_handle_t));
+}
+
+static inline void
+vn_decode_apir_buffer_type_host_handle_t(struct vn_cs_decoder *dec, apir_buffer_type_host_handle_t *val)
+{
+  vn_decode(dec, sizeof(apir_buffer_type_host_handle_t), val, sizeof(apir_buffer_type_host_handle_t));
+}
+
+/* apir_buffer_host_handle_t */
+
+static inline void
+vn_encode_apir_buffer_host_handle_t(struct vn_cs_encoder *enc, const apir_buffer_host_handle_t *val)
+{
+  vn_encode(enc, sizeof(apir_buffer_host_handle_t), val, sizeof(apir_buffer_host_handle_t));
+}
+
+static inline void
+vn_decode_apir_buffer_host_handle_t(struct vn_cs_decoder *dec, apir_buffer_host_handle_t *val)
+{
+  vn_decode(dec, sizeof(apir_buffer_host_handle_t), val, sizeof(apir_buffer_host_handle_t));
+}
+
+/* uintptr_t */
+
+static inline void
+vn_encode_uintptr_t(struct vn_cs_encoder *enc, const uintptr_t *val)
+{
+  vn_encode(enc, sizeof(*val), val, sizeof(*val));
+}
+
+static inline void
+vn_decode_uintptr_t(struct vn_cs_decoder *dec, uintptr_t *val)
+{
+  vn_decode(dec, sizeof(*val), val, sizeof(*val));
+}
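A round-trip sketch of the venus_cs helpers above (hypothetical buffer size, not part of the patch). The encoder and decoder simply walk a caller-provided byte range, so encode and decode calls must be paired in the same order on both sides:

    char buf[64];
    struct vn_cs_encoder enc = vn_cs_new_encoder(buf, sizeof(buf));
    uint32_t count = 2;
    uint64_t handle = 0xdeadbeef;
    vn_encode_uint32_t(&enc, &count);
    vn_encode_uint64_t(&enc, &handle);

    struct vn_cs_decoder dec = vn_cs_new_decoder(buf, sizeof(buf));
    uint32_t count_out;
    uint64_t handle_out;
    vn_decode_uint32_t(&dec, &count_out);  // must mirror the encode order
    vn_decode_uint64_t(&dec, &handle_out);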
diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.cpp b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.cpp
new file mode 100644
index 0000000000000..196cd70958745
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.cpp
@@ -0,0 +1,167 @@
+#include <cstdio>
+#include <cstring>
+#include <unordered_set>
+#include <vector>
+
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+#include "venus_cs_ggml-rpc.h"
+
+std::unordered_set<ggml_backend_buffer_t> backend_buffers;
+
+void
+track_backend_buffer(ggml_backend_buffer_t buffer) {
+  backend_buffers.insert(buffer);
+}
+
+rpc_tensor
+serialize_tensor(const ggml_tensor * tensor) {
+  rpc_tensor result;
+  result.id = reinterpret_cast<uint64_t>(tensor);
+  result.type = tensor->type;
+  if (tensor->buffer) {
+    ggml_backend_buffer_t buffer = tensor->buffer;
+
+    result.buffer = BUFFER_TO_HANDLE(buffer);
+  } else {
+    result.buffer = 0;
+  }
+  for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) {
+    result.ne[i] = tensor->ne[i];
+    result.nb[i] = tensor->nb[i];
+  }
+  result.op = tensor->op;
+  for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
+    result.op_params[i] = tensor->op_params[i];
+  }
+  result.flags = tensor->flags;
+  for (uint32_t i = 0; i < GGML_MAX_SRC; i++) {
+    result.src[i] = reinterpret_cast<uint64_t>(tensor->src[i]);
+  }
+  result.view_src = reinterpret_cast<uint64_t>(tensor->view_src);
+  result.view_offs = tensor->view_offs;
+  result.data = reinterpret_cast<uint64_t>(tensor->data);
+  snprintf(result.name, GGML_MAX_NAME, "%s", tensor->name);
+  return result;
+}
+
+ggml_tensor *
+deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor) {
+  ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type,
+                                            tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
+  for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) {
+    result->nb[i] = tensor->nb[i];
+  }
+  result->buffer = reinterpret_cast<ggml_backend_buffer_t>(tensor->buffer);
+  if (result->buffer && backend_buffers.find(result->buffer) == backend_buffers.end()) {
+    printf("WARNING: BUFFER NOT FOUND | %p\n", (void *)result->buffer);
+    result->buffer = nullptr;
+  }
+
+  if (result->buffer) {
+    // require that the tensor data does not go beyond the buffer end
+    uint64_t tensor_size = (uint64_t) ggml_nbytes(result);
+    uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer);
+    uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer);
+    GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow
+    GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size);
+  }
+
+  result->op = (ggml_op) tensor->op;
+  for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
+    result->op_params[i] = tensor->op_params[i];
+  }
+  result->flags = tensor->flags;
+  result->data = reinterpret_cast<void *>(tensor->data);
+  ggml_set_name(result, tensor->name);
+  return result;
+}
+
+void
+add_tensor(ggml_tensor * tensor, std::vector<rpc_tensor> & tensors, std::unordered_set<ggml_tensor*> & visited) {
+  if (tensor == nullptr) {
+    return;
+  }
+  if (visited.find(tensor) != visited.end()) {
+    return;
+  }
+  visited.insert(tensor);
+  for (int i = 0; i < GGML_MAX_SRC; i++) {
+    add_tensor(tensor->src[i], tensors, visited);
+  }
+  add_tensor(tensor->view_src, tensors, visited);
+  tensors.push_back(serialize_tensor(tensor));
+}
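To illustrate the traversal (hypothetical graph built elsewhere with the usual ggml API): for a single node out = ggml_add(ctx, a, b), add_tensor() recurses into the sources first, so the collected array lists sources before their consumer, and the visited set deduplicates tensors shared by several nodes:

    std::vector<rpc_tensor> tensors;
    std::unordered_set<ggml_tensor*> visited;
    add_tensor(out, tensors, visited);
    // tensors now holds the serialized copies of a, b, out (in that order)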
+void
+serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & output) {
+  uint32_t n_nodes = cgraph->n_nodes;
+  std::vector<rpc_tensor> tensors;
+  std::unordered_set<ggml_tensor*> visited;
+  for (uint32_t i = 0; i < n_nodes; i++) {
+    add_tensor(cgraph->nodes[i], tensors, visited);
+  }
+  // serialization format:
+  // | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t)) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) |
+  uint32_t n_tensors = tensors.size();
+  int output_size = sizeof(uint32_t) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t) + n_tensors * sizeof(rpc_tensor);
+  output.resize(output_size, 0);
+  memcpy(output.data(), &n_nodes, sizeof(n_nodes));
+  for (uint32_t i = 0; i < n_nodes; i++) {
+    memcpy(output.data() + sizeof(n_nodes) + i * sizeof(uint64_t), &cgraph->nodes[i], sizeof(uint64_t));
+  }
+  uint32_t * out_ntensors = (uint32_t *)(output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t));
+  *out_ntensors = n_tensors;
+  rpc_tensor * out_tensors = (rpc_tensor *)(output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t));
+  memcpy(out_tensors, tensors.data(), n_tensors * sizeof(rpc_tensor));
+}
+
+ggml_tensor *
+create_node(uint64_t id,
+            struct ggml_context * ctx,
+            const std::unordered_map<uint64_t, const rpc_tensor*> & tensor_ptrs,
+            std::unordered_map<uint64_t, ggml_tensor*> & tensor_map) {
+  if (id == 0) {
+    return nullptr;
+  }
+  if (tensor_map.find(id) != tensor_map.end()) {
+    return tensor_map[id];
+  }
+  const rpc_tensor * tensor = tensor_ptrs.at(id);
+  struct ggml_tensor * result = deserialize_tensor(ctx, tensor);
+  if (result == nullptr) {
+    return nullptr;
+  }
+  tensor_map[id] = result;
+  for (int i = 0; i < GGML_MAX_SRC; i++) {
+    result->src[i] = create_node(tensor->src[i], ctx, tensor_ptrs, tensor_map);
+  }
+  result->view_src = create_node(tensor->view_src, ctx, tensor_ptrs, tensor_map);
+  result->view_offs = tensor->view_offs;
+  return result;
+}
+
+ggml_cgraph *
+deserialize_graph(uint32_t n_nodes, uint32_t n_tensors, const rpc_tensor * tensors, const uint64_t * nodes) {
+  size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);
+  struct ggml_init_params params = {
+    /*.mem_size   =*/ buf_size,
+    /*.mem_buffer =*/ NULL,
+    /*.no_alloc   =*/ true,
+  };
+  struct ggml_context * ctx = ggml_init(params);
+  struct ggml_cgraph * graph = ggml_new_graph_custom(ctx, n_nodes, false);
+  graph->n_nodes = n_nodes;
+  std::unordered_map<uint64_t, const rpc_tensor*> tensor_ptrs;
+  for (uint32_t i = 0; i < n_tensors; i++) {
+    tensor_ptrs[tensors[i].id] = &tensors[i];
+  }
+  std::unordered_map<uint64_t, ggml_tensor*> tensor_map;
+  for (uint32_t i = 0; i < n_nodes; i++) {
+    int64_t id;
+    memcpy(&id, &nodes[i], sizeof(id));
+    graph->nodes[i] = create_node(id, ctx, tensor_ptrs, tensor_map);
+  }
+
+  return graph;
+}
diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.h
new file mode 100644
index 0000000000000..96402287af7fc
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.h
@@ -0,0 +1,45 @@
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+// ggml_tensor is serialized into rpc_tensor
+struct rpc_tensor {
+  uint64_t id;
+  uint32_t type;
+  uint64_t buffer;
+  uint32_t ne[GGML_MAX_DIMS];
+  uint32_t nb[GGML_MAX_DIMS];
+  uint32_t op;
+  int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
+  int32_t flags;
+  uint64_t src[GGML_MAX_SRC];
+  uint64_t view_src;
+  uint64_t view_offs;
+  uint64_t data;
+  char name[GGML_MAX_NAME];
+
+  char padding[4];
+};
+
+/* frontend */
+
+rpc_tensor serialize_tensor(const ggml_tensor * tensor);
+
+void serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & output);
+
+/* backend */
+
+void track_backend_buffer(ggml_backend_buffer_t buffer);
+bool untrack_backend_buffer(ggml_backend_buffer_t buffer);
+std::unordered_set<ggml_backend_buffer_t> get_track_backend_buffers();
+
+void add_tensor(ggml_tensor * tensor, std::vector<rpc_tensor> & tensors, std::unordered_set<ggml_tensor*> & visited);
+
+ggml_tensor *deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor);
+
+ggml_tensor *create_node(uint64_t id,
+                         struct ggml_context * ctx,
+                         const std::unordered_map<uint64_t, const rpc_tensor*> & tensor_ptrs,
+                         std::unordered_map<uint64_t, ggml_tensor*> & tensor_map);
+
+ggml_cgraph *deserialize_graph(uint32_t n_nodes, uint32_t n_tensors, const rpc_tensor * tensors, const uint64_t * nodes);
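A worked size computation under the serialization format documented above (illustrative numbers): a graph with one node whose traversal collects three tensors produces

    // | n_nodes | node ids        | n_tensors | rpc_tensor records     |
    // | 4 bytes | 1 * 8 = 8 bytes | 4 bytes   | 3 * sizeof(rpc_tensor) |
    size_t payload_size = 4 + 1 * 8 + 4 + 3 * sizeof(rpc_tensor);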
diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h
new file mode 100644
index 0000000000000..71c9b3f3ed820
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h
@@ -0,0 +1,236 @@
+// needs the ggml-backend-impl.h definition
+// needs venus_cs.h definition
+
+#include "venus_cs_ggml-rpc.h"
+
+// needs
+// ggml_buffer_to_apir_host_handle(ggml_backend_buffer_t buffer);
+
+static inline void
+vn_encode_ggml_buffer_handle(struct vn_cs_encoder *enc, const apir_buffer_host_handle_t *handle);
+
+static inline ggml_backend_buffer_t
+vn_decode_ggml_buffer(struct vn_cs_decoder *dec);
+
+/* rpc_tensor */
+
+static inline void
+vn_encode_rpc_tensor(struct vn_cs_encoder *enc, const rpc_tensor *rpc_tensor) {
+  size_t rpc_tensor_size = sizeof(*rpc_tensor);
+  vn_encode(enc, rpc_tensor_size, rpc_tensor, rpc_tensor_size);
+}
+
+static inline rpc_tensor *
+vn_decode_rpc_tensor_inplace(struct vn_cs_decoder *dec) {
+  size_t rpc_tensor_size = sizeof(rpc_tensor);
+
+  return (rpc_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, rpc_tensor_size);
+}
+
+static inline rpc_tensor *
+vn_decode_rpc_tensor_array_inplace(struct vn_cs_decoder *dec, uint32_t n_tensors) {
+  size_t rpc_tensor_size = sizeof(rpc_tensor) * n_tensors;
+
+  return (rpc_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, rpc_tensor_size);
+}
+
+/* ggml_tensor */
+
+static inline void
+vn_encode_ggml_tensor(struct vn_cs_encoder *enc, const ggml_tensor *tensor) {
+  rpc_tensor serialized = serialize_tensor(tensor);
+
+  vn_encode_rpc_tensor(enc, &serialized);
+}
+
+static inline const ggml_tensor *
+vn_decode_ggml_tensor(struct vn_cs_decoder *dec) {
+  const rpc_tensor *rpc_tensor = vn_decode_rpc_tensor_inplace(dec);
+  struct ggml_init_params params {
+    /*.mem_size   =*/ ggml_tensor_overhead(),
+    /*.mem_buffer =*/ NULL,
+    /*.no_alloc   =*/ true,
+  };
+  struct ggml_context * ctx = ggml_init(params);
+
+  const ggml_tensor *tensor = deserialize_tensor(ctx, rpc_tensor);
+
+  return tensor;
+}
+
+/* *** ggml_backend_buffer_type_t *** */
+
+// ggml_backend_buffer_type_t is a POINTER (to a struct).
+// Only the host pointer is shared between the host and guest.
+// The guest stores it in `buft->context`.
+// The host simply writes the pointer address into the buffer variable.
+
+
+static inline void
+vn_encode_ggml_buffer_type(struct vn_cs_encoder *enc, ggml_backend_buffer_type_t buft) {
+  apir_buffer_type_host_handle_t handle = ggml_buffer_type_to_apir_handle(buft);
+  vn_cs_encoder_write(enc, sizeof(handle), &handle, sizeof(handle));
+}
+
+static inline ggml_backend_buffer_type_t
+vn_decode_ggml_buffer_type(struct vn_cs_decoder *dec) {
+  apir_buffer_type_host_handle_t handle;
+
+  vn_cs_decoder_read(dec, sizeof(handle), &handle, sizeof(handle));
+
+  return (ggml_backend_buffer_type_t) handle;
+}
+
+static inline apir_buffer_type_host_handle_t
+vn_decode_apir_buffer_type_host_handle(struct vn_cs_decoder *dec) {
+  apir_buffer_type_host_handle_t handle;
+
+  vn_cs_decoder_read(dec, sizeof(handle), &handle, sizeof(handle));
+
+  return handle;
+}
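The pointer-as-handle convention can be spelled out with a short sketch (assuming the guest stores the handle in buft->context, as described above): no lookup table is involved, the handle is the host's own pointer transported as a uintptr_t-sized integer:

    // guest side: ggml_buffer_type_to_apir_handle() reads the handle out of buft->context
    apir_buffer_type_host_handle_t handle = ggml_buffer_type_to_apir_handle(buft);

    // host side: casting the integer back yields the host's ggml_backend_buffer_type_t
    ggml_backend_buffer_type_t host_buft = (ggml_backend_buffer_type_t) handle;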
+
+/* *** ggml_backend_buffer_t *** */
+
+// ggml_backend_buffer_t is a POINTER.
+// same logic as for ggml_backend_buffer_type_t
+
+static inline void
+vn_encode_ggml_buffer(struct vn_cs_encoder *enc, const ggml_backend_buffer_t buffer) {
+  apir_buffer_host_handle_t handle = BUFFER_TO_HOST_HANDLE(buffer);
+  vn_cs_encoder_write(enc, sizeof(handle), &handle, sizeof(handle));
+}
+
+static inline ggml_backend_buffer_t
+vn_decode_ggml_buffer(struct vn_cs_decoder *dec) {
+  ggml_backend_buffer_t buffer;
+  size_t buffer_ptr_size = sizeof(buffer);
+
+  vn_cs_decoder_read(dec, buffer_ptr_size, &buffer, buffer_ptr_size);
+
+  return buffer;
+}
+
+/* enum ggml_status */
+
+static inline void
+vn_encode_ggml_status(struct vn_cs_encoder *enc, const enum ggml_status *status) {
+  vn_cs_encoder_write(enc, sizeof(*status), status, sizeof(*status));
+}
+
+static inline void
+vn_decode_ggml_status(struct vn_cs_decoder *dec, enum ggml_status *status) {
+  vn_cs_decoder_read(dec, sizeof(*status), status, sizeof(*status));
+}
+
+/* vn_renderer_shmem */
+
+static inline void
+vn_encode_virtgpu_shmem_res_id(struct vn_cs_encoder *enc, uint32_t shmem_res_id) {
+  vn_encode_uint32_t(enc, &shmem_res_id);
+}
+
+static inline void
+vn_decode_virtgpu_shmem_res_id(struct vn_cs_decoder *dec, uint32_t *shmem_res_id) {
+  vn_decode_uint32_t(dec, shmem_res_id);
+}
+
+/* ggml_cgraph */
+
+static inline size_t
+vn_serialize_ggml_cgraph(ggml_cgraph *cgraph, std::vector<uint8_t> & cgraph_data) {
+  serialize_graph(cgraph, cgraph_data);
+
+  return cgraph_data.size();
+}
+
+static inline void
+vn_encode_cgraph_data(struct vn_cs_encoder *enc, std::vector<uint8_t> & cgraph_data) {
+  size_t cgraph_size = cgraph_data.size();
+
+  vn_encode(enc, cgraph_size, cgraph_data.data(), cgraph_size);
+}
+
+static inline ggml_cgraph *
+vn_decode_ggml_cgraph(struct vn_cs_decoder *dec, size_t cgraph_size) {
+  UNUSED(cgraph_size);
+
+  uint32_t n_nodes;
+  vn_decode_uint32_t(dec, &n_nodes);
+  const uint64_t * nodes = vn_decode_uint64_t_array_inplace(dec, n_nodes);
+
+  uint32_t n_tensors;
+  vn_decode_uint32_t(dec, &n_tensors);
+  const rpc_tensor *tensors = vn_decode_rpc_tensor_array_inplace(dec, n_tensors);
+
+  return deserialize_graph(n_nodes, n_tensors, tensors, nodes);
+}
+
+static inline void
+vn_encode_ggml_buffer_handle(struct vn_cs_encoder *enc, const apir_buffer_host_handle_t *handle) {
+  vn_cs_encoder_write(enc, sizeof(*handle), handle, sizeof(*handle));
+}
+
+static inline void
+vn_encode_ggml_tensor_inline(struct vn_cs_encoder *enc, const ggml_tensor *tensor) {
+  size_t tensor_size = sizeof(*tensor);
+
+  if (tensor->extra) {
+    FATAL("Cannot pass tensors with extra");
+  }
+
+  if (tensor->src[0] && tensor->buffer) {
+    static int first = 1;
+    if (first) {
+      // not sure if the buffer needs to be updated inside the src tensors or not
+      WARNING("Cannot pass tensors with src and buffer");
+      first = 0;
+    }
+  }
+
+  vn_cs_encoder_write(enc, tensor_size, tensor, tensor_size);
+
+  // tensor->data is a pointer inside the device buffer. No need to touch it.
+  // tensor->buffer is a pointer to a buffer. Encode the buffer handle in sequence.
+  // (could also make a copy of the tensor, and update locally.)
+
+  if (tensor->buffer) {
+    apir_buffer_host_handle_t buffer_handle = ggml_buffer_to_apir_handle(tensor->buffer);
+    vn_encode_ggml_buffer_handle(enc, &buffer_handle);
+  }
+
+  if (tensor->view_src) {
+    vn_cs_encoder_write(enc, tensor_size, tensor->view_src, tensor_size);
+  }
+
+  for (int i = 0; tensor->src[i]; i++) {
+    const ggml_tensor *tensor_src = tensor->src[i];
+    vn_cs_encoder_write(enc, tensor_size, tensor_src, tensor_size);
+  }
+}
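For reference, the wire layout produced by vn_encode_ggml_tensor_inline() above is, in order (a summary of the code, not a separate specification):

    // | sizeof(ggml_tensor): the tensor itself                         |
    // | sizeof(apir_buffer_host_handle_t): only if tensor->buffer      |
    // | sizeof(ggml_tensor): only if tensor->view_src                  |
    // | sizeof(ggml_tensor) per src, until the first NULL src pointer  |

The decoder below consumes the same sequence in place and patches the buffer, view_src, and src pointers as it goes.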
+
+static inline const ggml_tensor *
+vn_decode_ggml_tensor_inplace(struct vn_cs_decoder *dec) {
+
+  // it is safe to remove the `const` qualifier here, we *do* want to
+  // modify the shared memory data to fix the `src` pointers.
+  ggml_tensor *tensor = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor));
+
+  // tensor->data is a pointer inside the device buffer. No need to touch it.
+  // tensor->buffer is a pointer to a buffer. Decode the buffer handle encoded in sequence.
+  if (tensor->buffer) {
+    tensor->buffer = vn_decode_ggml_buffer(dec);
+  }
+
+  if (tensor->view_src) {
+    ggml_tensor *tensor_view_src = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor));
+    tensor->view_src = tensor_view_src;
+  }
+
+  for (int i = 0; tensor->src[i]; i++) {
+    ggml_tensor *tensor_src = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor));
+    tensor->src[i] = tensor_src; // overwrite op->src[i] pointer with the actual location of the src tensor
+  }
+
+  return tensor;
+}
diff --git a/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp b/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp
new file mode 100644
index 0000000000000..30ae511aa95e8
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp
@@ -0,0 +1,118 @@
+#include <cstdio>
+#include <cstring>
+#include <unordered_set>
+#include <vector>
+
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+#include "shared/venus_cs_ggml-rpc.h"
+
+std::unordered_set<ggml_backend_buffer_t> backend_buffers;
+
+void
+track_backend_buffer(ggml_backend_buffer_t buffer) {
+  backend_buffers.insert(buffer);
+}
+
+bool
+untrack_backend_buffer(ggml_backend_buffer_t buffer) {
+  auto it = backend_buffers.find(buffer);
+  if (it == backend_buffers.end()) {
+    return false;
+  }
+
+  backend_buffers.erase(it);
+  return true;
+}
+
+std::unordered_set<ggml_backend_buffer_t>
+get_track_backend_buffers() {
+  return backend_buffers;
+}
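The backend-side deserialize_tensor() below rebases tensor->data, which the frontend serialized as an offset into its buffer; a worked example with made-up addresses:

    uint64_t offset       = 0x1000;            // rpc_tensor.data as sent by the guest
    uint64_t buffer_start = 0x7f0000000000;    // host base address of the backend buffer
    uint64_t tensor_data  = offset + buffer_start;  // 0x7f0000001000, then bounds-checked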
+ggml_tensor *
+deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor) {
+  ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type,
+                                            tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
+  for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) {
+    result->nb[i] = tensor->nb[i];
+  }
+  result->buffer = reinterpret_cast<ggml_backend_buffer_t>(tensor->buffer);
+  if (result->buffer && backend_buffers.find(result->buffer) == backend_buffers.end()) {
+    printf("WARNING: BUFFER NOT FOUND | %p\n", (void *)result->buffer);
+    result->buffer = nullptr;
+  }
+
+  uint64_t tensor_data = tensor->data;
+  if (result->buffer) {
+    // require that the tensor data does not go beyond the buffer end
+    uint64_t tensor_size = (uint64_t) ggml_nbytes(result);
+    uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer);
+    uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer);
+
+    // tensor->data is serialized as an offset to the buffer base address
+    tensor_data += buffer_start;
+
+    GGML_ASSERT(tensor_data + tensor_size >= tensor_data); // check for overflow
+    GGML_ASSERT(tensor_data >= buffer_start && tensor_data + tensor_size <= buffer_start + buffer_size);
+  }
+
+  result->op = (ggml_op) tensor->op;
+  for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
+    result->op_params[i] = tensor->op_params[i];
+  }
+  result->flags = tensor->flags;
+  result->data = reinterpret_cast<void *>(tensor_data);
+  ggml_set_name(result, tensor->name);
+  return result;
+}
+
+ggml_tensor *
+create_node(uint64_t id,
+            struct ggml_context * ctx,
+            const std::unordered_map<uint64_t, const rpc_tensor*> & tensor_ptrs,
+            std::unordered_map<uint64_t, ggml_tensor*> & tensor_map) {
+  if (id == 0) {
+    return nullptr;
+  }
+  if (tensor_map.find(id) != tensor_map.end()) {
+    return tensor_map[id];
+  }
+  const rpc_tensor * tensor = tensor_ptrs.at(id);
+  struct ggml_tensor * result = deserialize_tensor(ctx, tensor);
+  if (result == nullptr) {
+    return nullptr;
+  }
+  tensor_map[id] = result;
+  for (int i = 0; i < GGML_MAX_SRC; i++) {
+    result->src[i] = create_node(tensor->src[i], ctx, tensor_ptrs, tensor_map);
+  }
+  result->view_src = create_node(tensor->view_src, ctx, tensor_ptrs, tensor_map);
+  result->view_offs = tensor->view_offs;
+  return result;
+}
+
+ggml_cgraph *
+deserialize_graph(uint32_t n_nodes, uint32_t n_tensors, const rpc_tensor * tensors, const uint64_t * nodes) {
+  size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);
+  struct ggml_init_params params = {
+    /*.mem_size   =*/ buf_size,
+    /*.mem_buffer =*/ NULL,
+    /*.no_alloc   =*/ true,
+  };
+  struct ggml_context * ctx = ggml_init(params);
+  struct ggml_cgraph * graph = ggml_new_graph_custom(ctx, n_nodes, false);
+  graph->n_nodes = n_nodes;
+  std::unordered_map<uint64_t, const rpc_tensor*> tensor_ptrs;
+  for (uint32_t i = 0; i < n_tensors; i++) {
+    tensor_ptrs[tensors[i].id] = &tensors[i];
+  }
+  std::unordered_map<uint64_t, ggml_tensor*> tensor_map;
+  for (uint32_t i = 0; i < n_nodes; i++) {
+    int64_t id;
+    memcpy(&id, &nodes[i], sizeof(id));
+    graph->nodes[i] = create_node(id, ctx, tensor_ptrs, tensor_map);
+  }
+
+  return graph;
+}
diff --git a/ggml/src/ggml-remotingfrontend/CMakeLists.txt b/ggml/src/ggml-remotingfrontend/CMakeLists.txt
new file mode 100644
index 0000000000000..f3f3dea652cf9
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/CMakeLists.txt
@@ -0,0 +1,32 @@
+cmake_minimum_required(VERSION 3.19)
+cmake_policy(SET CMP0114 NEW)
+
+message(STATUS "Enable API Remoting frontend")
+
+ggml_add_backend_library(ggml-remotingfrontend
+                         ggml-backend-buffer.cpp
+                         ggml-backend.cpp
+                         ggml-backend-device.cpp
+                         ggml-backend-reg.cpp
+                         ggml-backend-buffer-type.cpp
+                         ggml-backend-host-buffer-type.cpp
+                         virtgpu.cpp
+                         virtgpu-shm.cpp
+                         virtgpu-utils.cpp
+                         virtgpu-forward-device.cpp
+                         virtgpu-forward-buffer-type.cpp
+                         virtgpu-forward-buffer.cpp
+                         virtgpu-forward-backend.cpp
+                         virtgpu-forward-impl.h
+                         ../../include/ggml-remoting-frontend.h
+                         venus_cs_ggml-rpc-front.cpp
+                         )
+
+# dnf install -y libdrm-devel
+target_link_libraries(ggml-remotingfrontend PUBLIC drm)
+target_include_directories(ggml-remotingfrontend PUBLIC /usr/include/libdrm/)
+target_include_directories(ggml-remotingfrontend PUBLIC ./include)
+
+target_include_directories(ggml-remotingfrontend PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
+
+target_compile_options(ggml-remotingfrontend PRIVATE -std=c++20)
diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp
new file mode 100644
index 0000000000000..b655b8018f80d
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp
@@ -0,0 +1,98 @@
+#include "ggml-remoting.h"
+
+#define BUFT_TO_GPU(name) \
+  ((struct ggml_backend_remoting_device_context *) (name)->device->context)->gpu
+ +static ggml_backend_buffer_t +ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + IMPLEMENTED_ONCE; + struct virtgpu *gpu = BUFT_TO_GPU(buft); + + struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) malloc(sizeof(*context)); + if (!context) { + FATAL("Couldn't allocate the buffer context ..."); + } + + context->gpu = gpu; + + const int USE_FROM_PTR = true; + + if (USE_FROM_PTR) { + context->apir_context = apir_device_buffer_from_ptr(gpu, size, size); + context->base = context->apir_context.shmem->mmap_ptr; + context->is_from_ptr = true; + } else { + context->apir_context = apir_buffer_type_alloc_buffer(gpu, buft, size); + context->is_from_ptr = false; + context->base = NULL; + } + context->is_host_buffer = false; + + ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) context, size); + + return buffer; +} + +static const char * +ggml_backend_remoting_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + IMPLEMENTED_ONCE; + + struct virtgpu *gpu = BUFT_TO_GPU(buft); + + return apir_buffer_type_get_name(gpu, buft); +} + +static size_t +ggml_backend_remoting_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + IMPLEMENTED_ONCE; + struct virtgpu *gpu = BUFT_TO_GPU(buft); + + static size_t align = 0; + + if (align == 0) { + align = apir_buffer_type_get_alignment(gpu, buft); + } + + return align; +} + +static size_t +ggml_backend_remoting_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + IMPLEMENTED_ONCE; + struct virtgpu *gpu = BUFT_TO_GPU(buft); + + static size_t max_size = 0; + if (max_size == 0) { + max_size = apir_buffer_type_get_max_size(gpu, buft); + } + + return max_size; +} + +static bool +ggml_backend_remoting_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + IMPLEMENTED; + struct virtgpu *gpu = BUFT_TO_GPU(buft); + + return apir_buffer_type_is_host(gpu, buft); +} + +const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = { + /* .get_name = */ ggml_backend_remoting_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_remoting_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_remoting_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_remoting_buffer_type_get_max_size, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .is_host = */ NULL, +}; + +const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_from_ptr_type_interface = { + /* .get_name = */ ggml_backend_remoting_buffer_type_get_name, + /* .alloc_buffer = */ NULL, + /* .get_alignment = */ ggml_backend_remoting_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_remoting_buffer_type_get_max_size, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .is_host = */ NULL, +}; + +/****************************************************************************************/ diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp new file mode 100644 index 0000000000000..e720efcf47c69 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -0,0 +1,167 @@ +#include "ggml-remoting.h" + +#define BUFFER_TO_GPU(name) \ + ((struct ggml_backend_remoting_buffer_context *) (name)->context)->gpu + +struct timer_data get_tensor_timer = {0, 0, 0, "get_tensor"}; +struct timer_data set_tensor_timer = {0, 0, 0, "set_tensor"}; + +struct timer_data get_tensor_from_ptr_timer = {0, 0, 0, 
"get_tensor_from_ptr"}; +struct timer_data set_tensor_from_ptr_timer = {0, 0, 0, "set_tensor_from_ptr"}; + +static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) { + IMPLEMENTED_ONCE; + + struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) buffer->context; + if (context->base) { + return context->base; + } + + context->base = apir_buffer_get_base(BUFFER_TO_GPU(buffer), + BUFFER_TO_APIR_CONTEXT(buffer)); + + return context->base; +} + +static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { + NOT_IMPLEMENTED; + + STOP_HERE; + + UNUSED(buffer); + UNUSED(tensor); + UNUSED(value); + UNUSED(offset); + UNUSED(size); +} + +static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + IMPLEMENTED_ONCE; + + start_timer(&set_tensor_timer); + + struct virtgpu *gpu = BUFFER_TO_GPU(buffer); +#if 0 + INFO("%s: data=%p, offset=%lu, size=%lu\n", __func__, data, offset, size); +#endif +#if 0 + void **addr = (void **)(uintptr_t)data; + for (int i = 0; i <= 10; i++) { + INFO("%s: %p | %llx", __func__, addr, *addr); + addr++; + } + INFO("\n"); +#endif + struct ggml_backend_remoting_buffer_context *context = BUFFER_TO_GGML_CONTEXT(buffer); + if (context->is_from_ptr) { + memcpy((char *)tensor->data + offset, data, size); + } else { + apir_buffer_set_tensor(gpu, BUFFER_TO_APIR_CONTEXT(buffer), tensor, data, offset, size); + } + + stop_timer(&set_tensor_timer); + + return; +} + +static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + IMPLEMENTED_ONCE; + + start_timer(&get_tensor_timer); + + struct virtgpu *gpu = BUFFER_TO_GPU(buffer); + struct ggml_backend_remoting_buffer_context *context = BUFFER_TO_GGML_CONTEXT(buffer); + if (context->is_from_ptr) { + memcpy(data, (const char *)tensor->data + offset, size); + } else { + apir_buffer_get_tensor(gpu, BUFFER_TO_APIR_CONTEXT(buffer), tensor, data, offset, size); + } + + stop_timer(&get_tensor_timer); +} + +static void ggml_backend_remoting_buffer_set_tensor_from_ptr(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + IMPLEMENTED_ONCE; + + start_timer(&set_tensor_from_ptr_timer); + + UNUSED(buffer); + + memcpy((char *)tensor->data + offset, data, size); + + stop_timer(&set_tensor_from_ptr_timer); + + return; +} + +static void ggml_backend_remoting_buffer_get_tensor_from_ptr(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + IMPLEMENTED_ONCE; + + UNUSED(buffer); + + start_timer(&get_tensor_from_ptr_timer); + + memcpy(data, (const char *)tensor->data + offset, size); + + stop_timer(&get_tensor_from_ptr_timer); +} + +static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { + NOT_IMPLEMENTED; + + STOP_HERE; + + return true; + + UNUSED(buffer); + UNUSED(src); + UNUSED(dst); +} + +static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + IMPLEMENTED_ONCE; + + struct virtgpu *gpu = BUFFER_TO_GPU(buffer); + + apir_buffer_clear(gpu, BUFFER_TO_APIR_CONTEXT(buffer), value); + + return; +} + +static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffer) { + UNUSED(buffer); + + 
IMPLEMENTED_ONCE; + + struct virtgpu *gpu = BUFFER_TO_GPU(buffer); + + apir_buffer_free_buffer(gpu, BUFFER_TO_APIR_CONTEXT(buffer)); + + struct ggml_backend_remoting_buffer_context *context = BUFFER_TO_GGML_CONTEXT(buffer); + free(context); + buffer->context = NULL; +} + +const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { + /* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer, + /* .get_base = */ ggml_backend_remoting_buffer_get_base, + /* .init_tensor = */ NULL, + /* .memset_tensor = */ ggml_backend_remoting_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_remoting_buffer_cpy_tensor, + /* .clear = */ ggml_backend_remoting_buffer_clear, + /* .reset = */ NULL, +}; + +const ggml_backend_buffer_i ggml_backend_remoting_buffer_from_ptr_interface = { + /* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer, + /* .get_base = */ ggml_backend_remoting_buffer_get_base, + /* .init_tensor = */ NULL, + /* .memset_tensor = */ ggml_backend_remoting_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor_from_ptr, + /* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor_from_ptr, + /* .cpy_tensor = */ ggml_backend_remoting_buffer_cpy_tensor, + /* .clear = */ ggml_backend_remoting_buffer_clear, + /* .reset = */ NULL, +}; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp new file mode 100644 index 0000000000000..6f498d0edc2e4 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -0,0 +1,201 @@ +#include "ggml-remoting.h" + +static const char * +ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) { + IMPLEMENTED_ONCE; + + struct virtgpu *gpu = DEV_TO_GPU(dev); + + return apir_device_get_name(gpu); +} + +static const char * +ggml_backend_remoting_device_get_description(ggml_backend_dev_t dev) { + IMPLEMENTED; + + struct virtgpu *gpu = DEV_TO_GPU(dev); + + return apir_device_get_description(gpu); +} + +static enum ggml_backend_dev_type +ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) { + IMPLEMENTED_ONCE; + struct virtgpu *gpu = DEV_TO_GPU(dev); + + static enum ggml_backend_dev_type type; + static bool has_type = false; + if (!has_type) { + has_type = true; + type = (enum ggml_backend_dev_type) apir_device_get_type(gpu); + } + + return type; +} + +static void +ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + IMPLEMENTED; + + struct virtgpu *gpu = DEV_TO_GPU(dev); + + return apir_device_get_memory(gpu, free, total); +} + +static bool +ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { + struct virtgpu *gpu = DEV_TO_GPU(dev); + + return apir_device_supports_op(gpu, op); +} + +static bool +ggml_backend_remoting_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + //IMPLEMENTED_ONCE; + +#if 1 + bool supported = buft->device == dev; + if (!supported) { + //WARNING("%s: unsupported buffer type (%s). 
Double check.", __func__, buft->iface.get_name(buft)); + } + + return supported; +#else + UNUSED(dev); + UNUSED(buft); + + return true; +#endif +} + +static bool +ggml_backend_remoting_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { + //IMPLEMENTED_ONCE; + + UNUSED(dev); + UNUSED(op); + + // related to supports_buft, need to confirm + + return false; // same as ggml-metal +} + +static void +ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { + IMPLEMENTED; + + props->name = ggml_backend_remoting_device_get_name(dev); + props->description = ggml_backend_remoting_device_get_description(dev); + props->type = ggml_backend_remoting_device_get_type(dev); + ggml_backend_remoting_device_get_memory(dev, &props->memory_free, &props->memory_total); + +#if 0 + struct virtgpu *gpu = DEV_TO_GPU(dev); + apir_device_get_props(gpu, + &props->caps.async, + &props->caps.host_buffer, + &props->caps.buffer_from_host_ptr, + &props->caps.events + ); +#else + // ignore the actual backend answers and set it as we provide it in + // the API Remoting frontend + props->caps.async = false; + props->caps.host_buffer = false; + props->caps.buffer_from_host_ptr = false; + props->caps.events = false; +#endif + + INFO("%s: async=%d, host_buffer=%d!, buffer_from_host_ptr=%d!, events=%d", + __func__, props->caps.async, props->caps.host_buffer, + props->caps.buffer_from_host_ptr, props->caps.events); +} + +ggml_backend_buffer_type_t +ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { + IMPLEMENTED_ONCE; + + struct virtgpu *gpu = DEV_TO_GPU(dev); + + apir_buffer_type_host_handle_t ctx = apir_device_get_buffer_type(gpu); + + static struct ggml_backend_buffer_type buft { + /* .iface = */ ggml_backend_remoting_buffer_type_interface, + /* .device = */ dev, + /* .context = */ (void *) ctx, + }; + + return &buft; +} + +static ggml_backend_buffer_type_t +ggml_backend_remoting_device_get_buffer_from_ptr_type(ggml_backend_dev_t dev) { + IMPLEMENTED_ONCE; + + struct virtgpu *gpu = DEV_TO_GPU(dev); + + apir_buffer_type_host_handle_t ctx = apir_device_get_buffer_type(gpu); + + static struct ggml_backend_buffer_type buft { + /* .iface = */ ggml_backend_remoting_buffer_from_ptr_type_interface, + /* .device = */ dev, + /* .context = */ (void *) ctx, + }; + + return &buft; +} + +static ggml_backend_buffer_t +ggml_backend_remoting_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { + + struct virtgpu *gpu = DEV_TO_GPU(dev); + + struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) malloc(sizeof(*context)); + if (!context) { + FATAL("Couldn't allocate the buffer context ..."); + } + + context->gpu = gpu; + context->apir_context = apir_device_buffer_from_ptr(gpu, size, max_tensor_size); + context->base = ptr; + context->is_from_ptr = true; + + ggml_backend_buffer_t buffer = ggml_backend_buffer_init(ggml_backend_remoting_device_get_buffer_from_ptr_type(dev), ggml_backend_remoting_buffer_from_ptr_interface, (void *) context, size); + + INFO("#"); + INFO("# %s(%p, %llx) --> %p", __func__, ptr, size, buffer); + INFO("#\n"); + + return buffer; +} + +static ggml_backend_buffer_type_t +ggml_backend_remoting_device_get_host_buffer_type(ggml_backend_dev_t dev) { + IMPLEMENTED_ONCE; + + static struct ggml_backend_buffer_type host_bufft = { + /* .iface = */ ggml_backend_remoting_host_buffer_type_interface, + /* .device = */ dev, + /* .context = */ nullptr, + }; + + return 
&host_bufft; +} + +const struct ggml_backend_device_i ggml_backend_remoting_device_interface = { + /* .get_name = */ ggml_backend_remoting_device_get_name, + /* .get_description = */ ggml_backend_remoting_device_get_description, + /* .get_memory = */ ggml_backend_remoting_device_get_memory, + /* .get_type = */ ggml_backend_remoting_device_get_type, + /* .get_props = */ ggml_backend_remoting_device_get_props, + /* .init_backend = */ ggml_backend_remoting_device_init, + /* .get_buffer_type = */ ggml_backend_remoting_device_get_buffer_type, + /* .get_host_buffer_type = */ NULL, + /* .buffer_from_host_ptr = */ ggml_backend_remoting_device_buffer_from_ptr, + /* .supports_op = */ ggml_backend_remoting_device_supports_op, + /* .supports_buft = */ ggml_backend_remoting_device_supports_buft, + /* .offload_op = */ ggml_backend_remoting_device_offload_op, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, +}; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp new file mode 100644 index 0000000000000..c09c80d6472f5 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp @@ -0,0 +1,110 @@ +#include "ggml-remoting.h" + +#define BUFT_TO_GPU(name) \ + ((struct ggml_backend_remoting_device_context *) (name)->device->context)->gpu + +extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface; + +static void +ggml_backend_remoting_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { + BEING_IMPLEMENTED; + + void *ptr = buffer->context; + + if (ptr == nullptr) { + return; + } + struct ggml_backend_remoting_device_context *device_ctx = GET_DEVICE_CONTEXT(); + + struct vn_renderer_shmem *shmem = nullptr; + size_t index; + + for (size_t i = 0; i < device_ctx->shared_memory.size(); i++) { + const uint8_t* addr = (const uint8_t*) std::get<0>(device_ctx->shared_memory[i]) /* ptr */; + const uint8_t* endr = addr + std::get<1>(device_ctx->shared_memory[i]) /* size */; + if (ptr >= addr && ptr < endr) { + shmem = std::get<2>(device_ctx->shared_memory[i]) /* shmem */; + index = i; + break; + } + } + + if (shmem == nullptr) { + WARNING("failed to free host shared memory: memory not in map\n"); + return; + } + + virtgpu_shmem_destroy(device_ctx->gpu, shmem->shmem); + + device_ctx->shared_memory.erase(device_ctx->shared_memory.begin() + index); +} + +static ggml_backend_buffer_t +ggml_backend_remoting_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + IMPLEMENTED; + + struct virtgpu *gpu = BUFT_TO_GPU(buft); + + struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) malloc(sizeof(*context)); + if (!context) { + FATAL("Couldn't allocate the buffer context ..."); + } + + context->gpu = gpu; + context->apir_context = apir_device_buffer_from_ptr(gpu, size, size); + context->base = context->apir_context.shmem->mmap_ptr; + context->is_host_buffer = true; + + ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) context, size); + INFO("##"); + INFO("## %s(%llx) --> %p <======================", __func__, size, buffer); + INFO("##\n"); + + return buffer; +} + +static const char * +ggml_backend_remoting_host_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + UNUSED(buft); + + IMPLEMENTED_ONCE; + + return "GUEST host buffer"; +} + +static size_t 
+ggml_backend_remoting_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+  UNUSED(buft);
+
+  IMPLEMENTED_ONCE;
+
+  return 64; // not 100% sure ...
+}
+
+static bool
+ggml_backend_remoting_host_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+  UNUSED(buft);
+
+  IMPLEMENTED_ONCE;
+
+  return true;
+}
+
+static size_t
+ggml_backend_remoting_host_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
+  UNUSED(buft);
+
+  IMPLEMENTED;
+  STOP_HERE;
+
+  return SIZE_MAX;
+}
+
+const ggml_backend_buffer_type_i ggml_backend_remoting_host_buffer_type_interface = {
+  /* .get_name       = */ ggml_backend_remoting_host_buffer_type_get_name,
+  /* .alloc_buffer   = */ ggml_backend_remoting_host_buffer_type_alloc_buffer,
+  /* .get_alignment  = */ ggml_backend_remoting_host_buffer_type_get_alignment,
+  /* .get_max_size   = */ ggml_backend_remoting_host_buffer_type_get_max_size,
+  /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
+  /* .is_host        = */ ggml_backend_remoting_host_buffer_type_is_host,
+};
diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp
new file mode 100644
index 0000000000000..d0132370d9f91
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp
@@ -0,0 +1,148 @@
+#include <mutex>
+#include <vector>
+
+#include "ggml-remoting.h"
+
+static struct virtgpu *apir_initialize() {
+  static struct virtgpu *apir_gpu_instance = NULL;
+  static bool apir_initialized = false;
+
+  if (apir_initialized) {
+    return apir_gpu_instance;
+  }
+  apir_initialized = true;
+
+  apir_gpu_instance = create_virtgpu();
+  if (!apir_gpu_instance) {
+    FATAL("failed to initialize the virtgpu :/");
+    return NULL;
+  }
+
+  return apir_gpu_instance;
+}
+
+static int ggml_backend_remoting_get_device_count() {
+  IMPLEMENTED;
+
+  struct virtgpu *gpu = apir_initialize();
+  if (!gpu) {
+    WARNING("apir_initialize failed :/");
+    return 0;
+  }
+
+  return apir_device_get_count(gpu);
+}
+
+static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) {
+  UNUSED(reg);
+
+  IMPLEMENTED;
+
+  return ggml_backend_remoting_get_device_count();
+}
+
+static std::vector<ggml_backend_dev_t> devices;
+
+ggml_backend_dev_t ggml_backend_remoting_get_device(size_t device) {
+  GGML_ASSERT(device < devices.size());
+  return devices[device];
+}
+
+static void ggml_backend_remoting_reg_init_devices(ggml_backend_reg_t reg) {
+  IMPLEMENTED;
+
+  if (devices.size() > 0) {
+    INFO("%s: already initialized", __func__);
+    return;
+  }
+
+  struct virtgpu *gpu = apir_initialize();
+  if (!gpu) {
+    FATAL("apir_initialize failed :/");
+    return;
+  }
+
+  static bool initialized = false;
+
+  {
+    static std::mutex mutex;
+    std::lock_guard lock(mutex);
+    if (!initialized) {
+
+      for (int i = 0; i < ggml_backend_remoting_get_device_count(); i++) {
+        ggml_backend_remoting_device_context *ctx = new ggml_backend_remoting_device_context;
+        char desc[256] = "API Remoting device";
+
+        ctx->device = i;
+        ctx->name = GGML_REMOTING_FRONTEND_NAME + std::to_string(i);
+        ctx->description = desc;
+        ctx->gpu = gpu;
+
+        devices.push_back(new ggml_backend_device {
+          /* .iface   = */ ggml_backend_remoting_device_interface,
+          /* .reg     = */ reg,
+          /* .context = */ ctx,
+        });
+      }
+      initialized = true;
+    }
+  }
+}
+
+static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) {
+  UNUSED(reg);
+
+  IMPLEMENTED;
+
+  return ggml_backend_remoting_get_device(device);
+}
diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp
new file mode 100644
index 0000000000000..14f95ec88ff02
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp
@@ -0,0 +1,70 @@
+#include "ggml-remoting.h"
+
+static const char * ggml_backend_remoting_get_name(ggml_backend_t backend) {
+    UNUSED(backend);
+
+    //IMPLEMENTED_ONCE;
+
+    return "API Remoting backend";
+}
+
+static void ggml_backend_remoting_free(ggml_backend_t backend) {
+    IMPLEMENTED;
+
+    delete backend;
+}
+
+struct timer_data graph_compute_timer = {0, 0, 0, "compute_timer"};
+
+static ggml_status ggml_backend_remoting_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    struct virtgpu *gpu = DEV_TO_GPU(backend->device);
+
+    IMPLEMENTED_ONCE;
+
+    start_timer(&graph_compute_timer);
+
+    ggml_status status = apir_backend_graph_compute(gpu, cgraph);
+
+    stop_timer(&graph_compute_timer);
+
+    return status;
+}
+
+static ggml_backend_i ggml_backend_remoting_interface = {
+    /* .get_name           = */ ggml_backend_remoting_get_name,
+    /* .free               = */ ggml_backend_remoting_free,
+    /* .set_tensor_async   = */ NULL, // ggml_backend_remoting_set_tensor_async,
+    /* .get_tensor_async   = */ NULL, // ggml_backend_remoting_get_tensor_async,
+    /* .cpy_tensor_async   = */ NULL, // ggml_backend_remoting_cpy_tensor_async,
+    /* .synchronize        = */ NULL, // ggml_backend_remoting_synchronize,
+    /* .graph_plan_create  = */ NULL,
+    /* .graph_plan_free    = */ NULL,
+    /* .graph_plan_update  = */ NULL,
+    /* .graph_plan_compute = */ NULL,
+    /* .graph_compute      = */ ggml_backend_remoting_graph_compute,
+    /* .event_record       = */ NULL,
+    /* .event_wait         = */ NULL,
+};
+
+static ggml_guid_t ggml_backend_remoting_guid() {
+    static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x14, 0x03, 0x86, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b };
+
+    return &guid;
+}
+
+
+ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params) {
+    UNUSED(params);
+    IMPLEMENTED;
+
+    ggml_backend_remoting_device_context * ctx = (ggml_backend_remoting_device_context *) dev->context;
+
+    ggml_backend_t remoting_backend = new ggml_backend {
+        /* .guid    = */ ggml_backend_remoting_guid(),
+        /* .iface   = */ ggml_backend_remoting_interface,
+        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_remoting_frontend_reg(), ctx->device),
+        /* .context = */ ctx,
+    };
+
+    return remoting_backend;
+}
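Reviewer note: a minimal sketch of the path a caller takes to reach `ggml_backend_remoting_graph_compute()` above. The graph itself is built with the regular ggml API; only the backend comes from this patch, and the helper name below is hypothetical.

```cpp
// Sketch only: run one compute graph on a remoting device.
#include "ggml.h"
#include "ggml-backend.h"

static ggml_status run_on_remoting(ggml_backend_dev_t dev, struct ggml_cgraph * graph) {
    // Calls ggml_backend_remoting_device_init() via the device iface table.
    ggml_backend_t backend = ggml_backend_dev_init(dev, /*params=*/NULL);
    if (backend == NULL) {
        return GGML_STATUS_FAILED;
    }

    // Forwards to apir_backend_graph_compute(): one guest->host round-trip
    // per graph, bracketed by graph_compute_timer.
    ggml_status status = ggml_backend_graph_compute(backend, graph);

    ggml_backend_free(backend);
    return status;
}
```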
diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.cpp b/ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.cpp
new file mode 100644
index 0000000000000..87679fe59a8d3
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.cpp
@@ -0,0 +1,26 @@
+#include <mutex>
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "ggml-remoting-frontend.h"
+#include "remoting.h"
+
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+
+
+
+int ggml_backend_remoting_get_device_count();
+
+
+
+
+struct remoting_device_struct {
+    std::mutex mutex;
+};
diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h
new file mode 100644
index 0000000000000..cd58ed674475d
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h
@@ -0,0 +1,132 @@
+#pragma once
+
+#include <iostream>
+#include <memory>
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include "ggml-remoting-frontend.h"
+
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+#include "virtgpu.h"
+
+#define DEV_TO_GPU(name) \
+    ((struct ggml_backend_remoting_device_context *) (name)->context)->gpu
+
+#define BUFFER_TO_GGML_CONTEXT(name) \
+    ((struct ggml_backend_remoting_buffer_context *) (name)->context)
+
+#define BUFFER_TO_APIR_CONTEXT(name) \
+    &((struct ggml_backend_remoting_buffer_context *) (name)->context)->apir_context
+
+#define BUFFER_TO_HOST_HANDLE(name) \
+    ((struct ggml_backend_remoting_buffer_context *) (name)->context)->apir_context.host_handle
+
+#define GET_DEVICE_CONTEXT() \
+    (struct ggml_backend_remoting_device_context *) ggml_backend_remoting_get_device(0)->context
+
+static inline apir_buffer_type_host_handle_t
+ggml_buffer_type_to_apir_handle(ggml_backend_buffer_type_t buft) {
+    // in the backend, the buffer handle is the buffer pointer
+    return (apir_buffer_type_host_handle_t) buft->context;
+}
+
+#define NOT_IMPLEMENTED \
+    do { \
+        static bool first = true; \
+        if (first) { \
+            printf("\nWARN: ###\nWARN: ### reached unimplemented function %s\nWARN: ###\n\n", __func__); \
+            first = false; \
+        } \
+    } while(0)
+
+#define BEING_IMPLEMENTED \
+    do { \
+        printf("\nINFO: ###\nINFO: ### function being implemented: %s\nINFO: ###\n\n", __func__); \
+    } while(0)
+
+#define NEXT
+
+#define STOP_HERE \
+    thks_bye()
+
+#define BREAKPOINT \
+    breakpoint()
+
+#ifndef NDEBUG
+#define IMPLEMENTED \
+    printf("INFO: ### reached implemented function %s\n", __func__)
+#else
+#define IMPLEMENTED \
+    do {} while(0)
+#endif
+
+#ifndef NDEBUG
+#define IMPLEMENTED_ONCE \
+    do { \
+        static bool first = true; \
+        if (first) { \
+            printf("INFO: ### reached implemented function %s\n", __func__); \
+            first = false; \
+        } \
+    } while(0)
+#else
+#define IMPLEMENTED_ONCE \
+    do {} while(0)
+#endif
+
+#define RMT_LOG_DEBUG(msg) std::cerr << msg << std::endl
+
+struct ggml_backend_remoting_device_context {
+    size_t device;
+    std::string name;
+    std::string description;
+
+    std::vector<std::tuple<void *, size_t, struct vn_renderer_shmem *>> shared_memory;
+
+    struct virtgpu *gpu;
+};
+
+struct ggml_backend_remoting_buffer_context {
+    apir_buffer_context_t apir_context;
+
+    struct virtgpu *gpu;
+
+    void *base;
+
+    bool is_host_buffer;
+    bool is_from_ptr;
+};
+
+extern const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface;
+extern const struct ggml_backend_device_i ggml_backend_remoting_device_interface;
+extern const ggml_backend_buffer_type_i ggml_backend_remoting_host_buffer_type_interface;
+extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface;
+extern const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_from_ptr_type_interface;
+extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_from_ptr_interface;
+
+ggml_backend_dev_t ggml_backend_remoting_get_device(size_t device);
+ggml_backend_buffer_type_t ggml_backend_remoting_host_buffer_type();
+ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params);
+ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev);
+
+struct remoting_buffer_struct;
+typedef std::shared_ptr<remoting_buffer_struct> remoting_buffer;
+typedef std::weak_ptr<remoting_buffer_struct> remoting_buffer_ref;
+
+void ggml_remoting_destroy_buffer(remoting_buffer& buf);
+
+struct remoting_device_struct;
+typedef std::shared_ptr<remoting_device_struct> remoting_device;
+typedef std::weak_ptr<remoting_device_struct> remoting_device_ref;
+
+struct remoting_context_struct {
+    int i;
+};
+typedef std::shared_ptr<remoting_context_struct> remoting_context;
+typedef std::weak_ptr<remoting_context_struct> remoting_context_ref;
+
+static inline apir_buffer_host_handle_t ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) {
+    return BUFFER_TO_HOST_HANDLE(buffer);
+}
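Reviewer note: the `shared_memory` vector in `ggml_backend_remoting_device_context` is the piece of bookkeeping the host-buffer free path relies on. A self-contained sketch of that contract, assuming the tuple layout reconstructed above (base pointer, size, shmem handle); `vn_renderer_shmem` is treated as opaque here.

```cpp
// Sketch only: pointer-range lookup over mapped shmem regions, mirroring
// ggml_backend_remoting_host_buffer_free_buffer() in
// ggml-backend-host-buffer-type.cpp.
#include <cstddef>
#include <cstdint>
#include <tuple>
#include <vector>

struct vn_renderer_shmem;  // opaque in this sketch

using shmem_entry = std::tuple<void *, size_t, vn_renderer_shmem *>;

static vn_renderer_shmem *find_region(const std::vector<shmem_entry> &map, const void *ptr) {
    const uint8_t *p = (const uint8_t *) ptr;
    for (const auto & [base, size, shmem] : map) {
        const uint8_t *lo = (const uint8_t *) base;
        if (p >= lo && p < lo + size) {
            return shmem;  // ptr falls inside this mapped region
        }
    }
    return nullptr;  // not a pointer this device handed out
}
```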
diff --git a/ggml/src/ggml-remotingfrontend/include/drm-uapi/drm.h b/ggml/src/ggml-remotingfrontend/include/drm-uapi/drm.h
new file mode 100644
index 0000000000000..4e4f7c2c39e4f
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/include/drm-uapi/drm.h
@@ -0,0 +1,1408 @@
+/*
+ * Header for the Direct Rendering Manager
+ *
+ * Author: Rickard E. (Rik) Faith <faith@valinux.com>
+ *
+ * Acknowledgments:
+ * Dec 1999, Richard Henderson <rth@twiddle.net>, move to generic cmpxchg.
+ */
+
+/*
+ * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas.
+ * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California.
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */ + +#ifndef _DRM_H_ +#define _DRM_H_ + +#if defined(__linux__) + +#include +#include +typedef unsigned int drm_handle_t; + +#else /* One of the BSDs */ + +#include +#include +#include +typedef int8_t __s8; +typedef uint8_t __u8; +typedef int16_t __s16; +typedef uint16_t __u16; +typedef int32_t __s32; +typedef uint32_t __u32; +typedef int64_t __s64; +typedef uint64_t __u64; +typedef size_t __kernel_size_t; +typedef unsigned long drm_handle_t; + +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +#define DRM_NAME "drm" /**< Name in kernel, /dev, and /proc */ +#define DRM_MIN_ORDER 5 /**< At least 2^5 bytes = 32 bytes */ +#define DRM_MAX_ORDER 22 /**< Up to 2^22 bytes = 4MB */ +#define DRM_RAM_PERCENT 10 /**< How much system ram can we lock? */ + +#define _DRM_LOCK_HELD 0x80000000U /**< Hardware lock is held */ +#define _DRM_LOCK_CONT 0x40000000U /**< Hardware lock is contended */ +#define _DRM_LOCK_IS_HELD(lock) ((lock) & _DRM_LOCK_HELD) +#define _DRM_LOCK_IS_CONT(lock) ((lock) & _DRM_LOCK_CONT) +#define _DRM_LOCKING_CONTEXT(lock) ((lock) & ~(_DRM_LOCK_HELD|_DRM_LOCK_CONT)) + +typedef unsigned int drm_context_t; +typedef unsigned int drm_drawable_t; +typedef unsigned int drm_magic_t; + +/* + * Cliprect. + * + * \warning: If you change this structure, make sure you change + * XF86DRIClipRectRec in the server as well + * + * \note KW: Actually it's illegal to change either for + * backwards-compatibility reasons. + */ +struct drm_clip_rect { + unsigned short x1; + unsigned short y1; + unsigned short x2; + unsigned short y2; +}; + +/* + * Drawable information. + */ +struct drm_drawable_info { + unsigned int num_rects; + struct drm_clip_rect *rects; +}; + +/* + * Texture region, + */ +struct drm_tex_region { + unsigned char next; + unsigned char prev; + unsigned char in_use; + unsigned char padding; + unsigned int age; +}; + +/* + * Hardware lock. + * + * The lock structure is a simple cache-line aligned integer. To avoid + * processor bus contention on a multiprocessor system, there should not be any + * other data stored in the same cache line. + */ +struct drm_hw_lock { + __volatile__ unsigned int lock; /**< lock variable */ + char padding[60]; /**< Pad to cache line */ +}; + +/* + * DRM_IOCTL_VERSION ioctl argument type. + * + * \sa drmGetVersion(). + */ +struct drm_version { + int version_major; /**< Major version */ + int version_minor; /**< Minor version */ + int version_patchlevel; /**< Patch level */ + __kernel_size_t name_len; /**< Length of name buffer */ + char *name; /**< Name of driver */ + __kernel_size_t date_len; /**< Length of date buffer */ + char *date; /**< User-space buffer to hold date */ + __kernel_size_t desc_len; /**< Length of desc buffer */ + char *desc; /**< User-space buffer to hold desc */ +}; + +/* + * DRM_IOCTL_GET_UNIQUE ioctl argument type. + * + * \sa drmGetBusid() and drmSetBusId(). + */ +struct drm_unique { + __kernel_size_t unique_len; /**< Length of unique */ + char *unique; /**< Unique name for driver instantiation */ +}; + +struct drm_list { + int count; /**< Length of user-space structures */ + struct drm_version *version; +}; + +struct drm_block { + int unused; +}; + +/* + * DRM_IOCTL_CONTROL ioctl argument type. + * + * \sa drmCtlInstHandler() and drmCtlUninstHandler(). + */ +struct drm_control { + enum { + DRM_ADD_COMMAND, + DRM_RM_COMMAND, + DRM_INST_HANDLER, + DRM_UNINST_HANDLER + } func; + int irq; +}; + +/* + * Type of memory to map. 
+ */ +enum drm_map_type { + _DRM_FRAME_BUFFER = 0, /**< WC (no caching), no core dump */ + _DRM_REGISTERS = 1, /**< no caching, no core dump */ + _DRM_SHM = 2, /**< shared, cached */ + _DRM_AGP = 3, /**< AGP/GART */ + _DRM_SCATTER_GATHER = 4, /**< Scatter/gather memory for PCI DMA */ + _DRM_CONSISTENT = 5 /**< Consistent memory for PCI DMA */ +}; + +/* + * Memory mapping flags. + */ +enum drm_map_flags { + _DRM_RESTRICTED = 0x01, /**< Cannot be mapped to user-virtual */ + _DRM_READ_ONLY = 0x02, + _DRM_LOCKED = 0x04, /**< shared, cached, locked */ + _DRM_KERNEL = 0x08, /**< kernel requires access */ + _DRM_WRITE_COMBINING = 0x10, /**< use write-combining if available */ + _DRM_CONTAINS_LOCK = 0x20, /**< SHM page that contains lock */ + _DRM_REMOVABLE = 0x40, /**< Removable mapping */ + _DRM_DRIVER = 0x80 /**< Managed by driver */ +}; + +struct drm_ctx_priv_map { + unsigned int ctx_id; /**< Context requesting private mapping */ + void *handle; /**< Handle of map */ +}; + +/* + * DRM_IOCTL_GET_MAP, DRM_IOCTL_ADD_MAP and DRM_IOCTL_RM_MAP ioctls + * argument type. + * + * \sa drmAddMap(). + */ +struct drm_map { + unsigned long offset; /**< Requested physical address (0 for SAREA)*/ + unsigned long size; /**< Requested physical size (bytes) */ + enum drm_map_type type; /**< Type of memory to map */ + enum drm_map_flags flags; /**< Flags */ + void *handle; /**< User-space: "Handle" to pass to mmap() */ + /**< Kernel-space: kernel-virtual address */ + int mtrr; /**< MTRR slot used */ + /* Private data */ +}; + +/* + * DRM_IOCTL_GET_CLIENT ioctl argument type. + */ +struct drm_client { + int idx; /**< Which client desired? */ + int auth; /**< Is client authenticated? */ + unsigned long pid; /**< Process ID */ + unsigned long uid; /**< User ID */ + unsigned long magic; /**< Magic */ + unsigned long iocs; /**< Ioctl count */ +}; + +enum drm_stat_type { + _DRM_STAT_LOCK, + _DRM_STAT_OPENS, + _DRM_STAT_CLOSES, + _DRM_STAT_IOCTLS, + _DRM_STAT_LOCKS, + _DRM_STAT_UNLOCKS, + _DRM_STAT_VALUE, /**< Generic value */ + _DRM_STAT_BYTE, /**< Generic byte counter (1024bytes/K) */ + _DRM_STAT_COUNT, /**< Generic non-byte counter (1000/k) */ + + _DRM_STAT_IRQ, /**< IRQ */ + _DRM_STAT_PRIMARY, /**< Primary DMA bytes */ + _DRM_STAT_SECONDARY, /**< Secondary DMA bytes */ + _DRM_STAT_DMA, /**< DMA */ + _DRM_STAT_SPECIAL, /**< Special DMA (e.g., priority or polled) */ + _DRM_STAT_MISSED /**< Missed DMA opportunity */ + /* Add to the *END* of the list */ +}; + +/* + * DRM_IOCTL_GET_STATS ioctl argument type. + */ +struct drm_stats { + unsigned long count; + struct { + unsigned long value; + enum drm_stat_type type; + } data[15]; +}; + +/* + * Hardware locking flags. + */ +enum drm_lock_flags { + _DRM_LOCK_READY = 0x01, /**< Wait until hardware is ready for DMA */ + _DRM_LOCK_QUIESCENT = 0x02, /**< Wait until hardware quiescent */ + _DRM_LOCK_FLUSH = 0x04, /**< Flush this context's DMA queue first */ + _DRM_LOCK_FLUSH_ALL = 0x08, /**< Flush all DMA queues first */ + /* These *HALT* flags aren't supported yet + -- they will be used to support the + full-screen DGA-like mode. */ + _DRM_HALT_ALL_QUEUES = 0x10, /**< Halt all current and future queues */ + _DRM_HALT_CUR_QUEUES = 0x20 /**< Halt all current queues */ +}; + +/* + * DRM_IOCTL_LOCK, DRM_IOCTL_UNLOCK and DRM_IOCTL_FINISH ioctl argument type. + * + * \sa drmGetLock() and drmUnlock(). + */ +struct drm_lock { + int context; + enum drm_lock_flags flags; +}; + +/* + * DMA flags + * + * \warning + * These values \e must match xf86drm.h. + * + * \sa drm_dma. 
+ */ +enum drm_dma_flags { + /* Flags for DMA buffer dispatch */ + _DRM_DMA_BLOCK = 0x01, /**< + * Block until buffer dispatched. + * + * \note The buffer may not yet have + * been processed by the hardware -- + * getting a hardware lock with the + * hardware quiescent will ensure + * that the buffer has been + * processed. + */ + _DRM_DMA_WHILE_LOCKED = 0x02, /**< Dispatch while lock held */ + _DRM_DMA_PRIORITY = 0x04, /**< High priority dispatch */ + + /* Flags for DMA buffer request */ + _DRM_DMA_WAIT = 0x10, /**< Wait for free buffers */ + _DRM_DMA_SMALLER_OK = 0x20, /**< Smaller-than-requested buffers OK */ + _DRM_DMA_LARGER_OK = 0x40 /**< Larger-than-requested buffers OK */ +}; + +/* + * DRM_IOCTL_ADD_BUFS and DRM_IOCTL_MARK_BUFS ioctl argument type. + * + * \sa drmAddBufs(). + */ +struct drm_buf_desc { + int count; /**< Number of buffers of this size */ + int size; /**< Size in bytes */ + int low_mark; /**< Low water mark */ + int high_mark; /**< High water mark */ + enum { + _DRM_PAGE_ALIGN = 0x01, /**< Align on page boundaries for DMA */ + _DRM_AGP_BUFFER = 0x02, /**< Buffer is in AGP space */ + _DRM_SG_BUFFER = 0x04, /**< Scatter/gather memory buffer */ + _DRM_FB_BUFFER = 0x08, /**< Buffer is in frame buffer */ + _DRM_PCI_BUFFER_RO = 0x10 /**< Map PCI DMA buffer read-only */ + } flags; + unsigned long agp_start; /**< + * Start address of where the AGP buffers are + * in the AGP aperture + */ +}; + +/* + * DRM_IOCTL_INFO_BUFS ioctl argument type. + */ +struct drm_buf_info { + int count; /**< Entries in list */ + struct drm_buf_desc *list; +}; + +/* + * DRM_IOCTL_FREE_BUFS ioctl argument type. + */ +struct drm_buf_free { + int count; + int *list; +}; + +/* + * Buffer information + * + * \sa drm_buf_map. + */ +struct drm_buf_pub { + int idx; /**< Index into the master buffer list */ + int total; /**< Buffer size */ + int used; /**< Amount of buffer in use (for DMA) */ + void *address; /**< Address of buffer */ +}; + +/* + * DRM_IOCTL_MAP_BUFS ioctl argument type. + */ +struct drm_buf_map { + int count; /**< Length of the buffer list */ +#ifdef __cplusplus + void *virt; +#else + void *virtual; /**< Mmap'd area in user-virtual */ +#endif + struct drm_buf_pub *list; /**< Buffer information */ +}; + +/* + * DRM_IOCTL_DMA ioctl argument type. + * + * Indices here refer to the offset into the buffer list in drm_buf_get. + * + * \sa drmDMA(). + */ +struct drm_dma { + int context; /**< Context handle */ + int send_count; /**< Number of buffers to send */ + int *send_indices; /**< List of handles to buffers */ + int *send_sizes; /**< Lengths of data to send */ + enum drm_dma_flags flags; /**< Flags */ + int request_count; /**< Number of buffers requested */ + int request_size; /**< Desired size for buffers */ + int *request_indices; /**< Buffer information */ + int *request_sizes; + int granted_count; /**< Number of buffers granted */ +}; + +enum drm_ctx_flags { + _DRM_CONTEXT_PRESERVED = 0x01, + _DRM_CONTEXT_2DONLY = 0x02 +}; + +/* + * DRM_IOCTL_ADD_CTX ioctl argument type. + * + * \sa drmCreateContext() and drmDestroyContext(). + */ +struct drm_ctx { + drm_context_t handle; + enum drm_ctx_flags flags; +}; + +/* + * DRM_IOCTL_RES_CTX ioctl argument type. + */ +struct drm_ctx_res { + int count; + struct drm_ctx *contexts; +}; + +/* + * DRM_IOCTL_ADD_DRAW and DRM_IOCTL_RM_DRAW ioctl argument type. + */ +struct drm_draw { + drm_drawable_t handle; +}; + +/* + * DRM_IOCTL_UPDATE_DRAW ioctl argument type. 
+ */ +typedef enum { + DRM_DRAWABLE_CLIPRECTS +} drm_drawable_info_type_t; + +struct drm_update_draw { + drm_drawable_t handle; + unsigned int type; + unsigned int num; + unsigned long long data; +}; + +/* + * DRM_IOCTL_GET_MAGIC and DRM_IOCTL_AUTH_MAGIC ioctl argument type. + */ +struct drm_auth { + drm_magic_t magic; +}; + +/* + * DRM_IOCTL_IRQ_BUSID ioctl argument type. + * + * \sa drmGetInterruptFromBusID(). + */ +struct drm_irq_busid { + int irq; /**< IRQ number */ + int busnum; /**< bus number */ + int devnum; /**< device number */ + int funcnum; /**< function number */ +}; + +enum drm_vblank_seq_type { + _DRM_VBLANK_ABSOLUTE = 0x0, /**< Wait for specific vblank sequence number */ + _DRM_VBLANK_RELATIVE = 0x1, /**< Wait for given number of vblanks */ + /* bits 1-6 are reserved for high crtcs */ + _DRM_VBLANK_HIGH_CRTC_MASK = 0x0000003e, + _DRM_VBLANK_EVENT = 0x4000000, /**< Send event instead of blocking */ + _DRM_VBLANK_FLIP = 0x8000000, /**< Scheduled buffer swap should flip */ + _DRM_VBLANK_NEXTONMISS = 0x10000000, /**< If missed, wait for next vblank */ + _DRM_VBLANK_SECONDARY = 0x20000000, /**< Secondary display controller */ + _DRM_VBLANK_SIGNAL = 0x40000000 /**< Send signal instead of blocking, unsupported */ +}; +#define _DRM_VBLANK_HIGH_CRTC_SHIFT 1 + +#define _DRM_VBLANK_TYPES_MASK (_DRM_VBLANK_ABSOLUTE | _DRM_VBLANK_RELATIVE) +#define _DRM_VBLANK_FLAGS_MASK (_DRM_VBLANK_EVENT | _DRM_VBLANK_SIGNAL | \ + _DRM_VBLANK_SECONDARY | _DRM_VBLANK_NEXTONMISS) + +struct drm_wait_vblank_request { + enum drm_vblank_seq_type type; + unsigned int sequence; + unsigned long signal; +}; + +struct drm_wait_vblank_reply { + enum drm_vblank_seq_type type; + unsigned int sequence; + long tval_sec; + long tval_usec; +}; + +/* + * DRM_IOCTL_WAIT_VBLANK ioctl argument type. + * + * \sa drmWaitVBlank(). + */ +union drm_wait_vblank { + struct drm_wait_vblank_request request; + struct drm_wait_vblank_reply reply; +}; + +#define _DRM_PRE_MODESET 1 +#define _DRM_POST_MODESET 2 + +/* + * DRM_IOCTL_MODESET_CTL ioctl argument type + * + * \sa drmModesetCtl(). + */ +struct drm_modeset_ctl { + __u32 crtc; + __u32 cmd; +}; + +/* + * DRM_IOCTL_AGP_ENABLE ioctl argument type. + * + * \sa drmAgpEnable(). + */ +struct drm_agp_mode { + unsigned long mode; /**< AGP mode */ +}; + +/* + * DRM_IOCTL_AGP_ALLOC and DRM_IOCTL_AGP_FREE ioctls argument type. + * + * \sa drmAgpAlloc() and drmAgpFree(). + */ +struct drm_agp_buffer { + unsigned long size; /**< In bytes -- will round to page boundary */ + unsigned long handle; /**< Used for binding / unbinding */ + unsigned long type; /**< Type of memory to allocate */ + unsigned long physical; /**< Physical used by i810 */ +}; + +/* + * DRM_IOCTL_AGP_BIND and DRM_IOCTL_AGP_UNBIND ioctls argument type. + * + * \sa drmAgpBind() and drmAgpUnbind(). + */ +struct drm_agp_binding { + unsigned long handle; /**< From drm_agp_buffer */ + unsigned long offset; /**< In bytes -- will round to page boundary */ +}; + +/* + * DRM_IOCTL_AGP_INFO ioctl argument type. + * + * \sa drmAgpVersionMajor(), drmAgpVersionMinor(), drmAgpGetMode(), + * drmAgpBase(), drmAgpSize(), drmAgpMemoryUsed(), drmAgpMemoryAvail(), + * drmAgpVendorId() and drmAgpDeviceId(). 
+ */ +struct drm_agp_info { + int agp_version_major; + int agp_version_minor; + unsigned long mode; + unsigned long aperture_base; /* physical address */ + unsigned long aperture_size; /* bytes */ + unsigned long memory_allowed; /* bytes */ + unsigned long memory_used; + + /* PCI information */ + unsigned short id_vendor; + unsigned short id_device; +}; + +/* + * DRM_IOCTL_SG_ALLOC ioctl argument type. + */ +struct drm_scatter_gather { + unsigned long size; /**< In bytes -- will round to page boundary */ + unsigned long handle; /**< Used for mapping / unmapping */ +}; + +/* + * DRM_IOCTL_SET_VERSION ioctl argument type. + */ +struct drm_set_version { + int drm_di_major; + int drm_di_minor; + int drm_dd_major; + int drm_dd_minor; +}; + +/* DRM_IOCTL_GEM_CLOSE ioctl argument type */ +struct drm_gem_close { + /** Handle of the object to be closed. */ + __u32 handle; + __u32 pad; +}; + +/* DRM_IOCTL_GEM_FLINK ioctl argument type */ +struct drm_gem_flink { + /** Handle for the object being named */ + __u32 handle; + + /** Returned global name */ + __u32 name; +}; + +/* DRM_IOCTL_GEM_OPEN ioctl argument type */ +struct drm_gem_open { + /** Name of object being opened */ + __u32 name; + + /** Returned handle for the object */ + __u32 handle; + + /** Returned size of the object */ + __u64 size; +}; + +/** + * DRM_CAP_DUMB_BUFFER + * + * If set to 1, the driver supports creating dumb buffers via the + * &DRM_IOCTL_MODE_CREATE_DUMB ioctl. + */ +#define DRM_CAP_DUMB_BUFFER 0x1 +/** + * DRM_CAP_VBLANK_HIGH_CRTC + * + * If set to 1, the kernel supports specifying a :ref:`CRTC index` + * in the high bits of &drm_wait_vblank_request.type. + * + * Starting kernel version 2.6.39, this capability is always set to 1. + */ +#define DRM_CAP_VBLANK_HIGH_CRTC 0x2 +/** + * DRM_CAP_DUMB_PREFERRED_DEPTH + * + * The preferred bit depth for dumb buffers. + * + * The bit depth is the number of bits used to indicate the color of a single + * pixel excluding any padding. This is different from the number of bits per + * pixel. For instance, XRGB8888 has a bit depth of 24 but has 32 bits per + * pixel. + * + * Note that this preference only applies to dumb buffers, it's irrelevant for + * other types of buffers. + */ +#define DRM_CAP_DUMB_PREFERRED_DEPTH 0x3 +/** + * DRM_CAP_DUMB_PREFER_SHADOW + * + * If set to 1, the driver prefers userspace to render to a shadow buffer + * instead of directly rendering to a dumb buffer. For best speed, userspace + * should do streaming ordered memory copies into the dumb buffer and never + * read from it. + * + * Note that this preference only applies to dumb buffers, it's irrelevant for + * other types of buffers. + */ +#define DRM_CAP_DUMB_PREFER_SHADOW 0x4 +/** + * DRM_CAP_PRIME + * + * Bitfield of supported PRIME sharing capabilities. See &DRM_PRIME_CAP_IMPORT + * and &DRM_PRIME_CAP_EXPORT. + * + * Starting from kernel version 6.6, both &DRM_PRIME_CAP_IMPORT and + * &DRM_PRIME_CAP_EXPORT are always advertised. + * + * PRIME buffers are exposed as dma-buf file descriptors. + * See :ref:`prime_buffer_sharing`. + */ +#define DRM_CAP_PRIME 0x5 +/** + * DRM_PRIME_CAP_IMPORT + * + * If this bit is set in &DRM_CAP_PRIME, the driver supports importing PRIME + * buffers via the &DRM_IOCTL_PRIME_FD_TO_HANDLE ioctl. + * + * Starting from kernel version 6.6, this bit is always set in &DRM_CAP_PRIME. 
+ */ +#define DRM_PRIME_CAP_IMPORT 0x1 +/** + * DRM_PRIME_CAP_EXPORT + * + * If this bit is set in &DRM_CAP_PRIME, the driver supports exporting PRIME + * buffers via the &DRM_IOCTL_PRIME_HANDLE_TO_FD ioctl. + * + * Starting from kernel version 6.6, this bit is always set in &DRM_CAP_PRIME. + */ +#define DRM_PRIME_CAP_EXPORT 0x2 +/** + * DRM_CAP_TIMESTAMP_MONOTONIC + * + * If set to 0, the kernel will report timestamps with ``CLOCK_REALTIME`` in + * struct drm_event_vblank. If set to 1, the kernel will report timestamps with + * ``CLOCK_MONOTONIC``. See ``clock_gettime(2)`` for the definition of these + * clocks. + * + * Starting from kernel version 2.6.39, the default value for this capability + * is 1. Starting kernel version 4.15, this capability is always set to 1. + */ +#define DRM_CAP_TIMESTAMP_MONOTONIC 0x6 +/** + * DRM_CAP_ASYNC_PAGE_FLIP + * + * If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC for legacy + * page-flips. + */ +#define DRM_CAP_ASYNC_PAGE_FLIP 0x7 +/** + * DRM_CAP_CURSOR_WIDTH + * + * The ``CURSOR_WIDTH`` and ``CURSOR_HEIGHT`` capabilities return a valid + * width x height combination for the hardware cursor. The intention is that a + * hardware agnostic userspace can query a cursor plane size to use. + * + * Note that the cross-driver contract is to merely return a valid size; + * drivers are free to attach another meaning on top, eg. i915 returns the + * maximum plane size. + */ +#define DRM_CAP_CURSOR_WIDTH 0x8 +/** + * DRM_CAP_CURSOR_HEIGHT + * + * See &DRM_CAP_CURSOR_WIDTH. + */ +#define DRM_CAP_CURSOR_HEIGHT 0x9 +/** + * DRM_CAP_ADDFB2_MODIFIERS + * + * If set to 1, the driver supports supplying modifiers in the + * &DRM_IOCTL_MODE_ADDFB2 ioctl. + */ +#define DRM_CAP_ADDFB2_MODIFIERS 0x10 +/** + * DRM_CAP_PAGE_FLIP_TARGET + * + * If set to 1, the driver supports the &DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE and + * &DRM_MODE_PAGE_FLIP_TARGET_RELATIVE flags in + * &drm_mode_crtc_page_flip_target.flags for the &DRM_IOCTL_MODE_PAGE_FLIP + * ioctl. + */ +#define DRM_CAP_PAGE_FLIP_TARGET 0x11 +/** + * DRM_CAP_CRTC_IN_VBLANK_EVENT + * + * If set to 1, the kernel supports reporting the CRTC ID in + * &drm_event_vblank.crtc_id for the &DRM_EVENT_VBLANK and + * &DRM_EVENT_FLIP_COMPLETE events. + * + * Starting kernel version 4.12, this capability is always set to 1. + */ +#define DRM_CAP_CRTC_IN_VBLANK_EVENT 0x12 +/** + * DRM_CAP_SYNCOBJ + * + * If set to 1, the driver supports sync objects. See :ref:`drm_sync_objects`. + */ +#define DRM_CAP_SYNCOBJ 0x13 +/** + * DRM_CAP_SYNCOBJ_TIMELINE + * + * If set to 1, the driver supports timeline operations on sync objects. See + * :ref:`drm_sync_objects`. + */ +#define DRM_CAP_SYNCOBJ_TIMELINE 0x14 +/** + * DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP + * + * If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC for atomic + * commits. + */ +#define DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP 0x15 + +/* DRM_IOCTL_GET_CAP ioctl argument type */ +struct drm_get_cap { + __u64 capability; + __u64 value; +}; + +/** + * DRM_CLIENT_CAP_STEREO_3D + * + * If set to 1, the DRM core will expose the stereo 3D capabilities of the + * monitor by advertising the supported 3D layouts in the flags of struct + * drm_mode_modeinfo. See ``DRM_MODE_FLAG_3D_*``. + * + * This capability is always supported for all drivers starting from kernel + * version 3.13. + */ +#define DRM_CLIENT_CAP_STEREO_3D 1 + +/** + * DRM_CLIENT_CAP_UNIVERSAL_PLANES + * + * If set to 1, the DRM core will expose all planes (overlay, primary, and + * cursor) to userspace. 
+ * + * This capability has been introduced in kernel version 3.15. Starting from + * kernel version 3.17, this capability is always supported for all drivers. + */ +#define DRM_CLIENT_CAP_UNIVERSAL_PLANES 2 + +/** + * DRM_CLIENT_CAP_ATOMIC + * + * If set to 1, the DRM core will expose atomic properties to userspace. This + * implicitly enables &DRM_CLIENT_CAP_UNIVERSAL_PLANES and + * &DRM_CLIENT_CAP_ASPECT_RATIO. + * + * If the driver doesn't support atomic mode-setting, enabling this capability + * will fail with -EOPNOTSUPP. + * + * This capability has been introduced in kernel version 4.0. Starting from + * kernel version 4.2, this capability is always supported for atomic-capable + * drivers. + */ +#define DRM_CLIENT_CAP_ATOMIC 3 + +/** + * DRM_CLIENT_CAP_ASPECT_RATIO + * + * If set to 1, the DRM core will provide aspect ratio information in modes. + * See ``DRM_MODE_FLAG_PIC_AR_*``. + * + * This capability is always supported for all drivers starting from kernel + * version 4.18. + */ +#define DRM_CLIENT_CAP_ASPECT_RATIO 4 + +/** + * DRM_CLIENT_CAP_WRITEBACK_CONNECTORS + * + * If set to 1, the DRM core will expose special connectors to be used for + * writing back to memory the scene setup in the commit. The client must enable + * &DRM_CLIENT_CAP_ATOMIC first. + * + * This capability is always supported for atomic-capable drivers starting from + * kernel version 4.19. + */ +#define DRM_CLIENT_CAP_WRITEBACK_CONNECTORS 5 + +/** + * DRM_CLIENT_CAP_CURSOR_PLANE_HOTSPOT + * + * Drivers for para-virtualized hardware (e.g. vmwgfx, qxl, virtio and + * virtualbox) have additional restrictions for cursor planes (thus + * making cursor planes on those drivers not truly universal,) e.g. + * they need cursor planes to act like one would expect from a mouse + * cursor and have correctly set hotspot properties. + * If this client cap is not set the DRM core will hide cursor plane on + * those virtualized drivers because not setting it implies that the + * client is not capable of dealing with those extra restictions. + * Clients which do set cursor hotspot and treat the cursor plane + * like a mouse cursor should set this property. + * The client must enable &DRM_CLIENT_CAP_ATOMIC first. + * + * Setting this property on drivers which do not special case + * cursor planes (i.e. non-virtualized drivers) will return + * EOPNOTSUPP, which can be used by userspace to gauge + * requirements of the hardware/drivers they're running on. + * + * This capability is always supported for atomic-capable virtualized + * drivers starting from kernel version 6.6. + */ +#define DRM_CLIENT_CAP_CURSOR_PLANE_HOTSPOT 6 + +/* DRM_IOCTL_SET_CLIENT_CAP ioctl argument type */ +struct drm_set_client_cap { + __u64 capability; + __u64 value; +}; + +#define DRM_RDWR O_RDWR +#define DRM_CLOEXEC O_CLOEXEC +struct drm_prime_handle { + __u32 handle; + + /** Flags.. 
only applicable for handle->fd */ + __u32 flags; + + /** Returned dmabuf file descriptor */ + __s32 fd; +}; + +struct drm_syncobj_create { + __u32 handle; +#define DRM_SYNCOBJ_CREATE_SIGNALED (1 << 0) + __u32 flags; +}; + +struct drm_syncobj_destroy { + __u32 handle; + __u32 pad; +}; + +#define DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_IMPORT_SYNC_FILE (1 << 0) +#define DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE (1 << 0) +struct drm_syncobj_handle { + __u32 handle; + __u32 flags; + + __s32 fd; + __u32 pad; +}; + +struct drm_syncobj_transfer { + __u32 src_handle; + __u32 dst_handle; + __u64 src_point; + __u64 dst_point; + __u32 flags; + __u32 pad; +}; + +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL (1 << 0) +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT (1 << 1) +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE (1 << 2) /* wait for time point to become available */ +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE (1 << 3) /* set fence deadline to deadline_nsec */ +struct drm_syncobj_wait { + __u64 handles; + /* absolute timeout */ + __s64 timeout_nsec; + __u32 count_handles; + __u32 flags; + __u32 first_signaled; /* only valid when not waiting all */ + __u32 pad; + /** + * @deadline_nsec - fence deadline hint + * + * Deadline hint, in absolute CLOCK_MONOTONIC, to set on backing + * fence(s) if the DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE flag is + * set. + */ + __u64 deadline_nsec; +}; + +struct drm_syncobj_timeline_wait { + __u64 handles; + /* wait on specific timeline point for every handles*/ + __u64 points; + /* absolute timeout */ + __s64 timeout_nsec; + __u32 count_handles; + __u32 flags; + __u32 first_signaled; /* only valid when not waiting all */ + __u32 pad; + /** + * @deadline_nsec - fence deadline hint + * + * Deadline hint, in absolute CLOCK_MONOTONIC, to set on backing + * fence(s) if the DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE flag is + * set. + */ + __u64 deadline_nsec; +}; + +/** + * struct drm_syncobj_eventfd + * @handle: syncobj handle. + * @flags: Zero to wait for the point to be signalled, or + * &DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE to wait for a fence to be + * available for the point. + * @point: syncobj timeline point (set to zero for binary syncobjs). + * @fd: Existing eventfd to sent events to. + * @pad: Must be zero. + * + * Register an eventfd to be signalled by a syncobj. The eventfd counter will + * be incremented by one. + */ +struct drm_syncobj_eventfd { + __u32 handle; + __u32 flags; + __u64 point; + __s32 fd; + __u32 pad; +}; + + +struct drm_syncobj_array { + __u64 handles; + __u32 count_handles; + __u32 pad; +}; + +#define DRM_SYNCOBJ_QUERY_FLAGS_LAST_SUBMITTED (1 << 0) /* last available point on timeline syncobj */ +struct drm_syncobj_timeline_array { + __u64 handles; + __u64 points; + __u32 count_handles; + __u32 flags; +}; + + +/* Query current scanout sequence number */ +struct drm_crtc_get_sequence { + __u32 crtc_id; /* requested crtc_id */ + __u32 active; /* return: crtc output is active */ + __u64 sequence; /* return: most recent vblank sequence */ + __s64 sequence_ns; /* return: most recent time of first pixel out */ +}; + +/* Queue event to be delivered at specified sequence. 
Time stamp marks + * when the first pixel of the refresh cycle leaves the display engine + * for the display + */ +#define DRM_CRTC_SEQUENCE_RELATIVE 0x00000001 /* sequence is relative to current */ +#define DRM_CRTC_SEQUENCE_NEXT_ON_MISS 0x00000002 /* Use next sequence if we've missed */ + +struct drm_crtc_queue_sequence { + __u32 crtc_id; + __u32 flags; + __u64 sequence; /* on input, target sequence. on output, actual sequence */ + __u64 user_data; /* user data passed to event */ +}; + +#if defined(__cplusplus) +} +#endif + +#include "drm_mode.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +#define DRM_IOCTL_BASE 'd' +#define DRM_IO(nr) _IO(DRM_IOCTL_BASE,nr) +#define DRM_IOR(nr,type) _IOR(DRM_IOCTL_BASE,nr,type) +#define DRM_IOW(nr,type) _IOW(DRM_IOCTL_BASE,nr,type) +#define DRM_IOWR(nr,type) _IOWR(DRM_IOCTL_BASE,nr,type) + +#define DRM_IOCTL_VERSION DRM_IOWR(0x00, struct drm_version) +#define DRM_IOCTL_GET_UNIQUE DRM_IOWR(0x01, struct drm_unique) +#define DRM_IOCTL_GET_MAGIC DRM_IOR( 0x02, struct drm_auth) +#define DRM_IOCTL_IRQ_BUSID DRM_IOWR(0x03, struct drm_irq_busid) +#define DRM_IOCTL_GET_MAP DRM_IOWR(0x04, struct drm_map) +#define DRM_IOCTL_GET_CLIENT DRM_IOWR(0x05, struct drm_client) +#define DRM_IOCTL_GET_STATS DRM_IOR( 0x06, struct drm_stats) +#define DRM_IOCTL_SET_VERSION DRM_IOWR(0x07, struct drm_set_version) +#define DRM_IOCTL_MODESET_CTL DRM_IOW(0x08, struct drm_modeset_ctl) +/** + * DRM_IOCTL_GEM_CLOSE - Close a GEM handle. + * + * GEM handles are not reference-counted by the kernel. User-space is + * responsible for managing their lifetime. For example, if user-space imports + * the same memory object twice on the same DRM file description, the same GEM + * handle is returned by both imports, and user-space needs to ensure + * &DRM_IOCTL_GEM_CLOSE is performed once only. The same situation can happen + * when a memory object is allocated, then exported and imported again on the + * same DRM file description. The &DRM_IOCTL_MODE_GETFB2 IOCTL is an exception + * and always returns fresh new GEM handles even if an existing GEM handle + * already refers to the same memory object before the IOCTL is performed. 
+ */ +#define DRM_IOCTL_GEM_CLOSE DRM_IOW (0x09, struct drm_gem_close) +#define DRM_IOCTL_GEM_FLINK DRM_IOWR(0x0a, struct drm_gem_flink) +#define DRM_IOCTL_GEM_OPEN DRM_IOWR(0x0b, struct drm_gem_open) +#define DRM_IOCTL_GET_CAP DRM_IOWR(0x0c, struct drm_get_cap) +#define DRM_IOCTL_SET_CLIENT_CAP DRM_IOW( 0x0d, struct drm_set_client_cap) + +#define DRM_IOCTL_SET_UNIQUE DRM_IOW( 0x10, struct drm_unique) +#define DRM_IOCTL_AUTH_MAGIC DRM_IOW( 0x11, struct drm_auth) +#define DRM_IOCTL_BLOCK DRM_IOWR(0x12, struct drm_block) +#define DRM_IOCTL_UNBLOCK DRM_IOWR(0x13, struct drm_block) +#define DRM_IOCTL_CONTROL DRM_IOW( 0x14, struct drm_control) +#define DRM_IOCTL_ADD_MAP DRM_IOWR(0x15, struct drm_map) +#define DRM_IOCTL_ADD_BUFS DRM_IOWR(0x16, struct drm_buf_desc) +#define DRM_IOCTL_MARK_BUFS DRM_IOW( 0x17, struct drm_buf_desc) +#define DRM_IOCTL_INFO_BUFS DRM_IOWR(0x18, struct drm_buf_info) +#define DRM_IOCTL_MAP_BUFS DRM_IOWR(0x19, struct drm_buf_map) +#define DRM_IOCTL_FREE_BUFS DRM_IOW( 0x1a, struct drm_buf_free) + +#define DRM_IOCTL_RM_MAP DRM_IOW( 0x1b, struct drm_map) + +#define DRM_IOCTL_SET_SAREA_CTX DRM_IOW( 0x1c, struct drm_ctx_priv_map) +#define DRM_IOCTL_GET_SAREA_CTX DRM_IOWR(0x1d, struct drm_ctx_priv_map) + +#define DRM_IOCTL_SET_MASTER DRM_IO(0x1e) +#define DRM_IOCTL_DROP_MASTER DRM_IO(0x1f) + +#define DRM_IOCTL_ADD_CTX DRM_IOWR(0x20, struct drm_ctx) +#define DRM_IOCTL_RM_CTX DRM_IOWR(0x21, struct drm_ctx) +#define DRM_IOCTL_MOD_CTX DRM_IOW( 0x22, struct drm_ctx) +#define DRM_IOCTL_GET_CTX DRM_IOWR(0x23, struct drm_ctx) +#define DRM_IOCTL_SWITCH_CTX DRM_IOW( 0x24, struct drm_ctx) +#define DRM_IOCTL_NEW_CTX DRM_IOW( 0x25, struct drm_ctx) +#define DRM_IOCTL_RES_CTX DRM_IOWR(0x26, struct drm_ctx_res) +#define DRM_IOCTL_ADD_DRAW DRM_IOWR(0x27, struct drm_draw) +#define DRM_IOCTL_RM_DRAW DRM_IOWR(0x28, struct drm_draw) +#define DRM_IOCTL_DMA DRM_IOWR(0x29, struct drm_dma) +#define DRM_IOCTL_LOCK DRM_IOW( 0x2a, struct drm_lock) +#define DRM_IOCTL_UNLOCK DRM_IOW( 0x2b, struct drm_lock) +#define DRM_IOCTL_FINISH DRM_IOW( 0x2c, struct drm_lock) + +/** + * DRM_IOCTL_PRIME_HANDLE_TO_FD - Convert a GEM handle to a DMA-BUF FD. + * + * User-space sets &drm_prime_handle.handle with the GEM handle to export and + * &drm_prime_handle.flags, and gets back a DMA-BUF file descriptor in + * &drm_prime_handle.fd. + * + * The export can fail for any driver-specific reason, e.g. because export is + * not supported for this specific GEM handle (but might be for others). + * + * Support for exporting DMA-BUFs is advertised via &DRM_PRIME_CAP_EXPORT. + */ +#define DRM_IOCTL_PRIME_HANDLE_TO_FD DRM_IOWR(0x2d, struct drm_prime_handle) +/** + * DRM_IOCTL_PRIME_FD_TO_HANDLE - Convert a DMA-BUF FD to a GEM handle. + * + * User-space sets &drm_prime_handle.fd with a DMA-BUF file descriptor to + * import, and gets back a GEM handle in &drm_prime_handle.handle. + * &drm_prime_handle.flags is unused. + * + * If an existing GEM handle refers to the memory object backing the DMA-BUF, + * that GEM handle is returned. Therefore user-space which needs to handle + * arbitrary DMA-BUFs must have a user-space lookup data structure to manually + * reference-count duplicated GEM handles. For more information see + * &DRM_IOCTL_GEM_CLOSE. + * + * The import can fail for any driver-specific reason, e.g. because import is + * only supported for DMA-BUFs allocated on this DRM device. + * + * Support for importing DMA-BUFs is advertised via &DRM_PRIME_CAP_IMPORT. 
+ */ +#define DRM_IOCTL_PRIME_FD_TO_HANDLE DRM_IOWR(0x2e, struct drm_prime_handle) + +#define DRM_IOCTL_AGP_ACQUIRE DRM_IO( 0x30) +#define DRM_IOCTL_AGP_RELEASE DRM_IO( 0x31) +#define DRM_IOCTL_AGP_ENABLE DRM_IOW( 0x32, struct drm_agp_mode) +#define DRM_IOCTL_AGP_INFO DRM_IOR( 0x33, struct drm_agp_info) +#define DRM_IOCTL_AGP_ALLOC DRM_IOWR(0x34, struct drm_agp_buffer) +#define DRM_IOCTL_AGP_FREE DRM_IOW( 0x35, struct drm_agp_buffer) +#define DRM_IOCTL_AGP_BIND DRM_IOW( 0x36, struct drm_agp_binding) +#define DRM_IOCTL_AGP_UNBIND DRM_IOW( 0x37, struct drm_agp_binding) + +#define DRM_IOCTL_SG_ALLOC DRM_IOWR(0x38, struct drm_scatter_gather) +#define DRM_IOCTL_SG_FREE DRM_IOW( 0x39, struct drm_scatter_gather) + +#define DRM_IOCTL_WAIT_VBLANK DRM_IOWR(0x3a, union drm_wait_vblank) + +#define DRM_IOCTL_CRTC_GET_SEQUENCE DRM_IOWR(0x3b, struct drm_crtc_get_sequence) +#define DRM_IOCTL_CRTC_QUEUE_SEQUENCE DRM_IOWR(0x3c, struct drm_crtc_queue_sequence) + +#define DRM_IOCTL_UPDATE_DRAW DRM_IOW(0x3f, struct drm_update_draw) + +#define DRM_IOCTL_MODE_GETRESOURCES DRM_IOWR(0xA0, struct drm_mode_card_res) +#define DRM_IOCTL_MODE_GETCRTC DRM_IOWR(0xA1, struct drm_mode_crtc) +#define DRM_IOCTL_MODE_SETCRTC DRM_IOWR(0xA2, struct drm_mode_crtc) +#define DRM_IOCTL_MODE_CURSOR DRM_IOWR(0xA3, struct drm_mode_cursor) +#define DRM_IOCTL_MODE_GETGAMMA DRM_IOWR(0xA4, struct drm_mode_crtc_lut) +#define DRM_IOCTL_MODE_SETGAMMA DRM_IOWR(0xA5, struct drm_mode_crtc_lut) +#define DRM_IOCTL_MODE_GETENCODER DRM_IOWR(0xA6, struct drm_mode_get_encoder) +#define DRM_IOCTL_MODE_GETCONNECTOR DRM_IOWR(0xA7, struct drm_mode_get_connector) +#define DRM_IOCTL_MODE_ATTACHMODE DRM_IOWR(0xA8, struct drm_mode_mode_cmd) /* deprecated (never worked) */ +#define DRM_IOCTL_MODE_DETACHMODE DRM_IOWR(0xA9, struct drm_mode_mode_cmd) /* deprecated (never worked) */ + +#define DRM_IOCTL_MODE_GETPROPERTY DRM_IOWR(0xAA, struct drm_mode_get_property) +#define DRM_IOCTL_MODE_SETPROPERTY DRM_IOWR(0xAB, struct drm_mode_connector_set_property) +#define DRM_IOCTL_MODE_GETPROPBLOB DRM_IOWR(0xAC, struct drm_mode_get_blob) +#define DRM_IOCTL_MODE_GETFB DRM_IOWR(0xAD, struct drm_mode_fb_cmd) +#define DRM_IOCTL_MODE_ADDFB DRM_IOWR(0xAE, struct drm_mode_fb_cmd) +/** + * DRM_IOCTL_MODE_RMFB - Remove a framebuffer. + * + * This removes a framebuffer previously added via ADDFB/ADDFB2. The IOCTL + * argument is a framebuffer object ID. + * + * Warning: removing a framebuffer currently in-use on an enabled plane will + * disable that plane. The CRTC the plane is linked to may also be disabled + * (depending on driver capabilities). + */ +#define DRM_IOCTL_MODE_RMFB DRM_IOWR(0xAF, unsigned int) +#define DRM_IOCTL_MODE_PAGE_FLIP DRM_IOWR(0xB0, struct drm_mode_crtc_page_flip) +#define DRM_IOCTL_MODE_DIRTYFB DRM_IOWR(0xB1, struct drm_mode_fb_dirty_cmd) + +/** + * DRM_IOCTL_MODE_CREATE_DUMB - Create a new dumb buffer object. + * + * KMS dumb buffers provide a very primitive way to allocate a buffer object + * suitable for scanout and map it for software rendering. KMS dumb buffers are + * not suitable for hardware-accelerated rendering nor video decoding. KMS dumb + * buffers are not suitable to be displayed on any other device than the KMS + * device where they were allocated from. Also see + * :ref:`kms_dumb_buffer_objects`. + * + * The IOCTL argument is a struct drm_mode_create_dumb. + * + * User-space is expected to create a KMS dumb buffer via this IOCTL, then add + * it as a KMS framebuffer via &DRM_IOCTL_MODE_ADDFB and map it via + * &DRM_IOCTL_MODE_MAP_DUMB. 
+ * + * &DRM_CAP_DUMB_BUFFER indicates whether this IOCTL is supported. + * &DRM_CAP_DUMB_PREFERRED_DEPTH and &DRM_CAP_DUMB_PREFER_SHADOW indicate + * driver preferences for dumb buffers. + */ +#define DRM_IOCTL_MODE_CREATE_DUMB DRM_IOWR(0xB2, struct drm_mode_create_dumb) +#define DRM_IOCTL_MODE_MAP_DUMB DRM_IOWR(0xB3, struct drm_mode_map_dumb) +#define DRM_IOCTL_MODE_DESTROY_DUMB DRM_IOWR(0xB4, struct drm_mode_destroy_dumb) +#define DRM_IOCTL_MODE_GETPLANERESOURCES DRM_IOWR(0xB5, struct drm_mode_get_plane_res) +#define DRM_IOCTL_MODE_GETPLANE DRM_IOWR(0xB6, struct drm_mode_get_plane) +#define DRM_IOCTL_MODE_SETPLANE DRM_IOWR(0xB7, struct drm_mode_set_plane) +#define DRM_IOCTL_MODE_ADDFB2 DRM_IOWR(0xB8, struct drm_mode_fb_cmd2) +#define DRM_IOCTL_MODE_OBJ_GETPROPERTIES DRM_IOWR(0xB9, struct drm_mode_obj_get_properties) +#define DRM_IOCTL_MODE_OBJ_SETPROPERTY DRM_IOWR(0xBA, struct drm_mode_obj_set_property) +#define DRM_IOCTL_MODE_CURSOR2 DRM_IOWR(0xBB, struct drm_mode_cursor2) +#define DRM_IOCTL_MODE_ATOMIC DRM_IOWR(0xBC, struct drm_mode_atomic) +#define DRM_IOCTL_MODE_CREATEPROPBLOB DRM_IOWR(0xBD, struct drm_mode_create_blob) +#define DRM_IOCTL_MODE_DESTROYPROPBLOB DRM_IOWR(0xBE, struct drm_mode_destroy_blob) + +#define DRM_IOCTL_SYNCOBJ_CREATE DRM_IOWR(0xBF, struct drm_syncobj_create) +#define DRM_IOCTL_SYNCOBJ_DESTROY DRM_IOWR(0xC0, struct drm_syncobj_destroy) +#define DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD DRM_IOWR(0xC1, struct drm_syncobj_handle) +#define DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE DRM_IOWR(0xC2, struct drm_syncobj_handle) +#define DRM_IOCTL_SYNCOBJ_WAIT DRM_IOWR(0xC3, struct drm_syncobj_wait) +#define DRM_IOCTL_SYNCOBJ_RESET DRM_IOWR(0xC4, struct drm_syncobj_array) +#define DRM_IOCTL_SYNCOBJ_SIGNAL DRM_IOWR(0xC5, struct drm_syncobj_array) + +#define DRM_IOCTL_MODE_CREATE_LEASE DRM_IOWR(0xC6, struct drm_mode_create_lease) +#define DRM_IOCTL_MODE_LIST_LESSEES DRM_IOWR(0xC7, struct drm_mode_list_lessees) +#define DRM_IOCTL_MODE_GET_LEASE DRM_IOWR(0xC8, struct drm_mode_get_lease) +#define DRM_IOCTL_MODE_REVOKE_LEASE DRM_IOWR(0xC9, struct drm_mode_revoke_lease) + +#define DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT DRM_IOWR(0xCA, struct drm_syncobj_timeline_wait) +#define DRM_IOCTL_SYNCOBJ_QUERY DRM_IOWR(0xCB, struct drm_syncobj_timeline_array) +#define DRM_IOCTL_SYNCOBJ_TRANSFER DRM_IOWR(0xCC, struct drm_syncobj_transfer) +#define DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL DRM_IOWR(0xCD, struct drm_syncobj_timeline_array) + +/** + * DRM_IOCTL_MODE_GETFB2 - Get framebuffer metadata. + * + * This queries metadata about a framebuffer. User-space fills + * &drm_mode_fb_cmd2.fb_id as the input, and the kernels fills the rest of the + * struct as the output. + * + * If the client is DRM master or has &CAP_SYS_ADMIN, &drm_mode_fb_cmd2.handles + * will be filled with GEM buffer handles. Fresh new GEM handles are always + * returned, even if another GEM handle referring to the same memory object + * already exists on the DRM file description. The caller is responsible for + * removing the new handles, e.g. via the &DRM_IOCTL_GEM_CLOSE IOCTL. The same + * new handle will be returned for multiple planes in case they use the same + * memory object. Planes are valid until one has a zero handle -- this can be + * used to compute the number of planes. + * + * Otherwise, &drm_mode_fb_cmd2.handles will be zeroed and planes are valid + * until one has a zero &drm_mode_fb_cmd2.pitches. 
+ * + * If the framebuffer has a format modifier, &DRM_MODE_FB_MODIFIERS will be set + * in &drm_mode_fb_cmd2.flags and &drm_mode_fb_cmd2.modifier will contain the + * modifier. Otherwise, user-space must ignore &drm_mode_fb_cmd2.modifier. + * + * To obtain DMA-BUF FDs for each plane without leaking GEM handles, user-space + * can export each handle via &DRM_IOCTL_PRIME_HANDLE_TO_FD, then immediately + * close each unique handle via &DRM_IOCTL_GEM_CLOSE, making sure to not + * double-close handles which are specified multiple times in the array. + */ +#define DRM_IOCTL_MODE_GETFB2 DRM_IOWR(0xCE, struct drm_mode_fb_cmd2) + +#define DRM_IOCTL_SYNCOBJ_EVENTFD DRM_IOWR(0xCF, struct drm_syncobj_eventfd) + +/** + * DRM_IOCTL_MODE_CLOSEFB - Close a framebuffer. + * + * This closes a framebuffer previously added via ADDFB/ADDFB2. The IOCTL + * argument is a framebuffer object ID. + * + * This IOCTL is similar to &DRM_IOCTL_MODE_RMFB, except it doesn't disable + * planes and CRTCs. As long as the framebuffer is used by a plane, it's kept + * alive. When the plane no longer uses the framebuffer (because the + * framebuffer is replaced with another one, or the plane is disabled), the + * framebuffer is cleaned up. + * + * This is useful to implement flicker-free transitions between two processes. + * + * Depending on the threat model, user-space may want to ensure that the + * framebuffer doesn't expose any sensitive user information: closed + * framebuffers attached to a plane can be read back by the next DRM master. + */ +#define DRM_IOCTL_MODE_CLOSEFB DRM_IOWR(0xD0, struct drm_mode_closefb) + +/* + * Device specific ioctls should only be in their respective headers + * The device specific ioctl range is from 0x40 to 0x9f. + * Generic IOCTLS restart at 0xA0. + * + * \sa drmCommandNone(), drmCommandRead(), drmCommandWrite(), and + * drmCommandReadWrite(). + */ +#define DRM_COMMAND_BASE 0x40 +#define DRM_COMMAND_END 0xA0 + +/** + * struct drm_event - Header for DRM events + * @type: event type. + * @length: total number of payload bytes (including header). + * + * This struct is a header for events written back to user-space on the DRM FD. + * A read on the DRM FD will always only return complete events: e.g. if the + * read buffer is 100 bytes large and there are two 64 byte events pending, + * only one will be returned. + * + * Event types 0 - 0x7fffffff are generic DRM events, 0x80000000 and + * up are chipset specific. Generic DRM events include &DRM_EVENT_VBLANK, + * &DRM_EVENT_FLIP_COMPLETE and &DRM_EVENT_CRTC_SEQUENCE. + */ +struct drm_event { + __u32 type; + __u32 length; +}; + +/** + * DRM_EVENT_VBLANK - vertical blanking event + * + * This event is sent in response to &DRM_IOCTL_WAIT_VBLANK with the + * &_DRM_VBLANK_EVENT flag set. + * + * The event payload is a struct drm_event_vblank. + */ +#define DRM_EVENT_VBLANK 0x01 +/** + * DRM_EVENT_FLIP_COMPLETE - page-flip completion event + * + * This event is sent in response to an atomic commit or legacy page-flip with + * the &DRM_MODE_PAGE_FLIP_EVENT flag set. + * + * The event payload is a struct drm_event_vblank. + */ +#define DRM_EVENT_FLIP_COMPLETE 0x02 +/** + * DRM_EVENT_CRTC_SEQUENCE - CRTC sequence event + * + * This event is sent in response to &DRM_IOCTL_CRTC_QUEUE_SEQUENCE. + * + * The event payload is a struct drm_event_crtc_sequence. 
+ */ +#define DRM_EVENT_CRTC_SEQUENCE 0x03 + +struct drm_event_vblank { + struct drm_event base; + __u64 user_data; + __u32 tv_sec; + __u32 tv_usec; + __u32 sequence; + __u32 crtc_id; /* 0 on older kernels that do not support this */ +}; + +/* Event delivered at sequence. Time stamp marks when the first pixel + * of the refresh cycle leaves the display engine for the display + */ +struct drm_event_crtc_sequence { + struct drm_event base; + __u64 user_data; + __s64 time_ns; + __u64 sequence; +}; + +/* typedef area */ +typedef struct drm_clip_rect drm_clip_rect_t; +typedef struct drm_drawable_info drm_drawable_info_t; +typedef struct drm_tex_region drm_tex_region_t; +typedef struct drm_hw_lock drm_hw_lock_t; +typedef struct drm_version drm_version_t; +typedef struct drm_unique drm_unique_t; +typedef struct drm_list drm_list_t; +typedef struct drm_block drm_block_t; +typedef struct drm_control drm_control_t; +typedef enum drm_map_type drm_map_type_t; +typedef enum drm_map_flags drm_map_flags_t; +typedef struct drm_ctx_priv_map drm_ctx_priv_map_t; +typedef struct drm_map drm_map_t; +typedef struct drm_client drm_client_t; +typedef enum drm_stat_type drm_stat_type_t; +typedef struct drm_stats drm_stats_t; +typedef enum drm_lock_flags drm_lock_flags_t; +typedef struct drm_lock drm_lock_t; +typedef enum drm_dma_flags drm_dma_flags_t; +typedef struct drm_buf_desc drm_buf_desc_t; +typedef struct drm_buf_info drm_buf_info_t; +typedef struct drm_buf_free drm_buf_free_t; +typedef struct drm_buf_pub drm_buf_pub_t; +typedef struct drm_buf_map drm_buf_map_t; +typedef struct drm_dma drm_dma_t; +typedef union drm_wait_vblank drm_wait_vblank_t; +typedef struct drm_agp_mode drm_agp_mode_t; +typedef enum drm_ctx_flags drm_ctx_flags_t; +typedef struct drm_ctx drm_ctx_t; +typedef struct drm_ctx_res drm_ctx_res_t; +typedef struct drm_draw drm_draw_t; +typedef struct drm_update_draw drm_update_draw_t; +typedef struct drm_auth drm_auth_t; +typedef struct drm_irq_busid drm_irq_busid_t; +typedef enum drm_vblank_seq_type drm_vblank_seq_type_t; + +typedef struct drm_agp_buffer drm_agp_buffer_t; +typedef struct drm_agp_binding drm_agp_binding_t; +typedef struct drm_agp_info drm_agp_info_t; +typedef struct drm_scatter_gather drm_scatter_gather_t; +typedef struct drm_set_version drm_set_version_t; + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/ggml/src/ggml-remotingfrontend/include/drm-uapi/virtgpu_drm.h b/ggml/src/ggml-remotingfrontend/include/drm-uapi/virtgpu_drm.h new file mode 100644 index 0000000000000..9debb320c34be --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/include/drm-uapi/virtgpu_drm.h @@ -0,0 +1,276 @@ +/* + * Copyright 2013 Red Hat + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef VIRTGPU_DRM_H +#define VIRTGPU_DRM_H + +#include "drm.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +/* Please note that modifications to all structs defined here are + * subject to backwards-compatibility constraints. + * + * Do not use pointers, use __u64 instead for 32 bit / 64 bit user/kernel + * compatibility Keep fields aligned to their size + */ + +#define DRM_VIRTGPU_MAP 0x01 +#define DRM_VIRTGPU_EXECBUFFER 0x02 +#define DRM_VIRTGPU_GETPARAM 0x03 +#define DRM_VIRTGPU_RESOURCE_CREATE 0x04 +#define DRM_VIRTGPU_RESOURCE_INFO 0x05 +#define DRM_VIRTGPU_TRANSFER_FROM_HOST 0x06 +#define DRM_VIRTGPU_TRANSFER_TO_HOST 0x07 +#define DRM_VIRTGPU_WAIT 0x08 +#define DRM_VIRTGPU_GET_CAPS 0x09 +#define DRM_VIRTGPU_RESOURCE_CREATE_BLOB 0x0a +#define DRM_VIRTGPU_CONTEXT_INIT 0x0b + +#define VIRTGPU_EXECBUF_FENCE_FD_IN 0x01 +#define VIRTGPU_EXECBUF_FENCE_FD_OUT 0x02 +#define VIRTGPU_EXECBUF_RING_IDX 0x04 +#define VIRTGPU_EXECBUF_FLAGS (\ + VIRTGPU_EXECBUF_FENCE_FD_IN |\ + VIRTGPU_EXECBUF_FENCE_FD_OUT |\ + VIRTGPU_EXECBUF_RING_IDX |\ + 0) + +struct drm_virtgpu_map { + __u64 offset; /* use for mmap system call */ + __u32 handle; + __u32 pad; +}; + +#define VIRTGPU_EXECBUF_SYNCOBJ_RESET 0x01 +#define VIRTGPU_EXECBUF_SYNCOBJ_FLAGS ( \ + VIRTGPU_EXECBUF_SYNCOBJ_RESET | \ + 0) +struct drm_virtgpu_execbuffer_syncobj { + __u32 handle; + __u32 flags; + __u64 point; +}; + +/* fence_fd is modified on success if VIRTGPU_EXECBUF_FENCE_FD_OUT flag is set. */ +struct drm_virtgpu_execbuffer { + __u32 flags; + __u32 size; + __u64 command; /* void* */ + __u64 bo_handles; + __u32 num_bo_handles; + __s32 fence_fd; /* in/out fence fd (see VIRTGPU_EXECBUF_FENCE_FD_IN/OUT) */ + __u32 ring_idx; /* command ring index (see VIRTGPU_EXECBUF_RING_IDX) */ + __u32 syncobj_stride; /* size of @drm_virtgpu_execbuffer_syncobj */ + __u32 num_in_syncobjs; + __u32 num_out_syncobjs; + __u64 in_syncobjs; + __u64 out_syncobjs; +}; + +#define VIRTGPU_PARAM_3D_FEATURES 1 /* do we have 3D features in the hw */ +#define VIRTGPU_PARAM_CAPSET_QUERY_FIX 2 /* do we have the capset fix */ +#define VIRTGPU_PARAM_RESOURCE_BLOB 3 /* DRM_VIRTGPU_RESOURCE_CREATE_BLOB */ +#define VIRTGPU_PARAM_HOST_VISIBLE 4 /* Host blob resources are mappable */ +#define VIRTGPU_PARAM_CROSS_DEVICE 5 /* Cross virtio-device resource sharing */ +#define VIRTGPU_PARAM_CONTEXT_INIT 6 /* DRM_VIRTGPU_CONTEXT_INIT */ +#define VIRTGPU_PARAM_SUPPORTED_CAPSET_IDs 7 /* Bitmask of supported capability set ids */ +#define VIRTGPU_PARAM_EXPLICIT_DEBUG_NAME 8 /* Ability to set debug name from userspace */ + +struct drm_virtgpu_getparam { + __u64 param; + __u64 value; +}; + +/* NO_BO flags? NO resource flag? */ +/* resource flag for y_0_top */ +struct drm_virtgpu_resource_create { + __u32 target; + __u32 format; + __u32 bind; + __u32 width; + __u32 height; + __u32 depth; + __u32 array_size; + __u32 last_level; + __u32 nr_samples; + __u32 flags; + __u32 bo_handle; /* if this is set - recreate a new resource attached to this bo ? 
*/ + __u32 res_handle; /* returned by kernel */ + __u32 size; /* validate transfer in the host */ + __u32 stride; /* validate transfer in the host */ +}; + +struct drm_virtgpu_resource_info { + __u32 bo_handle; + __u32 res_handle; + __u32 size; + __u32 blob_mem; +}; + +struct drm_virtgpu_3d_box { + __u32 x; + __u32 y; + __u32 z; + __u32 w; + __u32 h; + __u32 d; +}; + +struct drm_virtgpu_3d_transfer_to_host { + __u32 bo_handle; + struct drm_virtgpu_3d_box box; + __u32 level; + __u32 offset; + __u32 stride; + __u32 layer_stride; +}; + +struct drm_virtgpu_3d_transfer_from_host { + __u32 bo_handle; + struct drm_virtgpu_3d_box box; + __u32 level; + __u32 offset; + __u32 stride; + __u32 layer_stride; +}; + +#define VIRTGPU_WAIT_NOWAIT 1 /* like it */ +struct drm_virtgpu_3d_wait { + __u32 handle; /* 0 is an invalid handle */ + __u32 flags; +}; + +#define VIRTGPU_DRM_CAPSET_VIRGL 1 +#define VIRTGPU_DRM_CAPSET_VIRGL2 2 +#define VIRTGPU_DRM_CAPSET_GFXSTREAM_VULKAN 3 +#define VIRTGPU_DRM_CAPSET_VENUS 4 +#define VIRTGPU_DRM_CAPSET_CROSS_DOMAIN 5 +#define VIRTGPU_DRM_CAPSET_DRM 6 +struct drm_virtgpu_get_caps { + __u32 cap_set_id; + __u32 cap_set_ver; + __u64 addr; + __u32 size; + __u32 pad; +}; + +struct drm_virtgpu_resource_create_blob { +#define VIRTGPU_BLOB_MEM_GUEST 0x0001 +#define VIRTGPU_BLOB_MEM_HOST3D 0x0002 +#define VIRTGPU_BLOB_MEM_HOST3D_GUEST 0x0003 + +#define VIRTGPU_BLOB_FLAG_USE_MAPPABLE 0x0001 +#define VIRTGPU_BLOB_FLAG_USE_SHAREABLE 0x0002 +#define VIRTGPU_BLOB_FLAG_USE_CROSS_DEVICE 0x0004 + /* zero is invalid blob_mem */ + __u32 blob_mem; + __u32 blob_flags; + __u32 bo_handle; + __u32 res_handle; + __u64 size; + + /* + * for 3D contexts with VIRTGPU_BLOB_MEM_HOST3D_GUEST and + * VIRTGPU_BLOB_MEM_HOST3D otherwise, must be zero. + */ + __u32 pad; + __u32 cmd_size; + __u64 cmd; + __u64 blob_id; +}; + +#define VIRTGPU_CONTEXT_PARAM_CAPSET_ID 0x0001 +#define VIRTGPU_CONTEXT_PARAM_NUM_RINGS 0x0002 +#define VIRTGPU_CONTEXT_PARAM_POLL_RINGS_MASK 0x0003 +#define VIRTGPU_CONTEXT_PARAM_DEBUG_NAME 0x0004 +struct drm_virtgpu_context_set_param { + __u64 param; + __u64 value; +}; + +struct drm_virtgpu_context_init { + __u32 num_params; + __u32 pad; + + /* pointer to drm_virtgpu_context_set_param array */ + __u64 ctx_set_params; +}; + +/* + * Event code that's given when VIRTGPU_CONTEXT_PARAM_POLL_RINGS_MASK is in + * effect. The event size is sizeof(drm_event), since there is no additional + * payload. 
+ */ +#define VIRTGPU_EVENT_FENCE_SIGNALED 0x90000000 + +#define DRM_IOCTL_VIRTGPU_MAP \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_MAP, struct drm_virtgpu_map) + +#define DRM_IOCTL_VIRTGPU_EXECBUFFER \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_EXECBUFFER,\ + struct drm_virtgpu_execbuffer) + +#define DRM_IOCTL_VIRTGPU_GETPARAM \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_GETPARAM,\ + struct drm_virtgpu_getparam) + +#define DRM_IOCTL_VIRTGPU_RESOURCE_CREATE \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_RESOURCE_CREATE, \ + struct drm_virtgpu_resource_create) + +#define DRM_IOCTL_VIRTGPU_RESOURCE_INFO \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_RESOURCE_INFO, \ + struct drm_virtgpu_resource_info) + +#define DRM_IOCTL_VIRTGPU_TRANSFER_FROM_HOST \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_TRANSFER_FROM_HOST, \ + struct drm_virtgpu_3d_transfer_from_host) + +#define DRM_IOCTL_VIRTGPU_TRANSFER_TO_HOST \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_TRANSFER_TO_HOST, \ + struct drm_virtgpu_3d_transfer_to_host) + +#define DRM_IOCTL_VIRTGPU_WAIT \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_WAIT, \ + struct drm_virtgpu_3d_wait) + +#define DRM_IOCTL_VIRTGPU_GET_CAPS \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_GET_CAPS, \ + struct drm_virtgpu_get_caps) + +#define DRM_IOCTL_VIRTGPU_RESOURCE_CREATE_BLOB \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_RESOURCE_CREATE_BLOB, \ + struct drm_virtgpu_resource_create_blob) + +#define DRM_IOCTL_VIRTGPU_CONTEXT_INIT \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_CONTEXT_INIT, \ + struct drm_virtgpu_context_init) + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/ggml/src/ggml-remotingfrontend/include/venus_hw.h b/ggml/src/ggml-remotingfrontend/include/venus_hw.h new file mode 100644 index 0000000000000..3ef774b8259d3 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/include/venus_hw.h @@ -0,0 +1,74 @@ +/* + * Copyright 2020 Chromium + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef VENUS_HW_H +#define VENUS_HW_H + +#include + +struct virgl_renderer_capset_venus { + uint32_t wire_format_version; + uint32_t vk_xml_version; + uint32_t vk_ext_command_serialization_spec_version; + uint32_t vk_mesa_venus_protocol_spec_version; + + /* This flag indicates render server config, and will be needed until drm + * virtio-gpu blob mem gets fixed to attach_resource before resource_map. 
+ */
+   uint32_t supports_blob_id_0;
+
+   /* Extension number N, where N is defined by the Vulkan spec, corresponds
+    * to bit [N / 32] & (1 << N % 32). The mask1 below covers the first 1023
+    * Vulkan extensions (numbered from 1 to 1023).
+    *
+    * Bit (mask1[0] & 0x1) is used for backward compatibility purposes. When
+    * that bit is set, the extension mask(s) are valid. Otherwise, all the
+    * extensions are assumed to be supported by the renderer side protocol.
+    */
+   uint32_t vk_extension_mask1[32];
+
+   /* The single-threaded renderer cannot afford potentially blocking calls.
+    * They can also lead to a lost GPU if the wait depends on a following
+    * command. This capset allows such blocking calls to pass through from
+    * the clients, and shifts the responsibility to the client drivers.
+    */
+   uint32_t allow_vk_wait_syncs;
+
+   /* This flag indicates that the renderer supports multiple fencing
+    * timelines. The client driver is expected to associate each VkQueue with
+    * one of these timelines at queue creation by binding it with an unused
+    * ring_idx. Queues created without a ring_idx binding are associated with
+    * a shared legacy timeline. The special ring_idx==0 is reserved for CPU
+    * fences that are signaled by the renderer immediately upon consumption of
+    * the associated renderer submission.
+    */
+   uint32_t supports_multiple_timelines;
+
+   /* This flag indicates to the guest that the hypervisor does not support
+    * memory page injection, so blob allocations must be done by the guest
+    * from the dedicated heap (host-visible memory).
+    */
+   uint32_t use_guest_vram;
+};
+
+#endif /* VENUS_HW_H */
diff --git a/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp b/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp
new file mode 100644
index 0000000000000..7ce0dbb7fbc67
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp
@@ -0,0 +1,87 @@
+#include <cstdio>
+#include <cstring>
+#include <unordered_set>
+#include <vector>
+
+#include "ggml-impl.h"
+#include "ggml-backend-impl.h"
+#include "../ggml-remotingbackend/shared/venus_cs_ggml-rpc.h"
+
+#include "ggml-remoting.h"
+
+rpc_tensor
+serialize_tensor(const ggml_tensor * tensor) {
+    rpc_tensor result;
+    result.id = reinterpret_cast<uint64_t>(tensor);
+    result.type = tensor->type;
+    if (tensor->buffer) {
+        ggml_backend_buffer_t buffer = tensor->buffer;
+
+        result.buffer = BUFFER_TO_HOST_HANDLE(buffer);
+    } else {
+        result.buffer = 0;
+    }
+    for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) {
+        result.ne[i] = tensor->ne[i];
+        result.nb[i] = tensor->nb[i];
+    }
+    result.op = tensor->op;
+    for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
+        result.op_params[i] = tensor->op_params[i];
+    }
+    result.flags = tensor->flags;
+    for (uint32_t i = 0; i < GGML_MAX_SRC; i++) {
+        result.src[i] = reinterpret_cast<uint64_t>(tensor->src[i]);
+    }
+    result.view_src = reinterpret_cast<uint64_t>(tensor->view_src);
+    result.view_offs = tensor->view_offs;
+    result.data = reinterpret_cast<uint64_t>(tensor->data);
+    if (tensor->data) {
+        if (!tensor->buffer) {
+            FATAL("tensor has data but no buffer :/");
+        }
+        // tensor->data is serialized as an offset from the buffer base address
+        result.data -= reinterpret_cast<uint64_t>(BUFFER_TO_GGML_CONTEXT(tensor->buffer)->base);
+    }
+    snprintf(result.name, GGML_MAX_NAME, "%s", tensor->name);
+    return result;
+}
+
+void
+add_tensor(ggml_tensor * tensor, std::vector<rpc_tensor> & tensors, std::unordered_set<ggml_tensor *> & visited) {
+    if (tensor == nullptr) {
+        return;
+    }
+    if (visited.find(tensor) != visited.end()) {
+        return;
+    }
+    visited.insert(tensor);
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
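+        // Visit the src dependencies (and view_src below) before pushing
+        // this tensor, so `tensors` ends up topologically ordered: every
+        // entry appears after the tensors it references.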
+        add_tensor(tensor->src[i], tensors, visited);
+    }
+    add_tensor(tensor->view_src, tensors, visited);
+    tensors.push_back(serialize_tensor(tensor));
+}
+
+void
+serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & output) {
+    uint32_t n_nodes = cgraph->n_nodes;
+    std::vector<rpc_tensor> tensors;
+    std::unordered_set<ggml_tensor *> visited;
+    for (uint32_t i = 0; i < n_nodes; i++) {
+        add_tensor(cgraph->nodes[i], tensors, visited);
+    }
+    // serialization format:
+    // | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t)) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) |
+    uint32_t n_tensors = (uint32_t) tensors.size();
+    size_t output_size = sizeof(uint32_t) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t) + n_tensors * sizeof(rpc_tensor);
+    output.resize(output_size, 0);
+    memcpy(output.data(), &n_nodes, sizeof(n_nodes));
+    for (uint32_t i = 0; i < n_nodes; i++) {
+        memcpy(output.data() + sizeof(n_nodes) + i * sizeof(uint64_t), &cgraph->nodes[i], sizeof(uint64_t));
+    }
+    uint32_t * out_ntensors = (uint32_t *)(output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t));
+    *out_ntensors = n_tensors;
+    rpc_tensor * out_tensors = (rpc_tensor *)(output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t));
+    memcpy(out_tensors, tensors.data(), n_tensors * sizeof(rpc_tensor));
+}
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp
new file mode 100644
index 0000000000000..82b51838997c6
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp
@@ -0,0 +1,53 @@
+#include "virtgpu-forward-impl.h"
+
+static long long current_time_ns() {
+    struct timespec ts;
+    clock_gettime(CLOCK_REALTIME, &ts); // use CLOCK_MONOTONIC instead for elapsed-time measurements
+    return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
+}
+
+ggml_status
+apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph) {
+    struct vn_cs_encoder *encoder;
+    struct vn_cs_decoder *decoder;
+
+    REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE);
+
+    std::vector<uint8_t> cgraph_data;
+    size_t cgraph_size = vn_serialize_ggml_cgraph(cgraph, cgraph_data);
+
+    struct vn_renderer_shmem *shmem;
+    if (cgraph_size > gpu->data_shmem->mmap_size) {
+        shmem = virtgpu_shmem_create(gpu, cgraph_size);
+        WARNING("%s: 0x%zx | %dkB | %dMB", __func__, cgraph_size, (int)(cgraph_size/1024), (int)(cgraph_size/1024/1024));
+        if (!shmem) {
+            FATAL("Couldn't allocate the guest-host shared buffer :/");
+        }
+    } else {
+        shmem = gpu->data_shmem;
+    }
+
+    //INFO("Send shmem ID %d", shmem->res_id);
+    vn_encode_virtgpu_shmem_res_id(encoder, shmem->res_id);
+    //INFO("Send shmem size %lu", cgraph_size);
+    vn_encode_size_t(encoder, &cgraph_size);
+
+    char *shmem_data = (char *) shmem->mmap_ptr;
+    struct vn_cs_encoder secondary_enc = vn_cs_new_encoder(shmem_data, cgraph_size);
+
+    vn_encode_cgraph_data(&secondary_enc, cgraph_data);
+
+    REMOTE_CALL(gpu, encoder, decoder);
+
+    ggml_status status = GGML_STATUS_ABORTED;
+    vn_decode_ggml_status(decoder, &status);
+    //INFO("Received status %u", status);
+
+    REMOTE_CALL_FINISH(gpu, encoder, decoder);
+
+    if (shmem != gpu->data_shmem) {
+        virtgpu_shmem_destroy(gpu, shmem->shmem);
+    }
+
+    return status;
+}
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp
new file mode 100644
index 0000000000000..e991c0bef324d
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp
@@ -0,0 +1,114 @@
+#include
"virtgpu-forward-impl.h" + +const char * +apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME); + + vn_encode_ggml_buffer_type(encoder, buft); + + REMOTE_CALL(gpu, encoder, decoder); + + const size_t string_size = vn_decode_array_size_unchecked(decoder); + char *string = (char *) vn_cs_decoder_alloc_array(decoder, sizeof(char), string_size); + if (!string) { + FATAL("%s: Could not allocate the device name buffer", __func__); + } + vn_decode_char_array(decoder, string, string_size); + + //INFO("%s: Forward BUFT NAME --> %s", __func__, string); + + /* *** */ + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return string; +} + +size_t +apir_buffer_type_get_alignment(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT); + + vn_encode_ggml_buffer_type(encoder, buft); + + REMOTE_CALL(gpu, encoder, decoder); + + size_t alignment; + vn_decode_size_t(decoder, &alignment); + + INFO("%s: Forward BUFT ALIGNMENT --> %zu ", __func__, alignment); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return alignment; +} + +size_t +apir_buffer_type_get_max_size(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE); + + vn_encode_ggml_buffer_type(encoder, buft); + + REMOTE_CALL(gpu, encoder, decoder); + + size_t max_size; + vn_decode_size_t(decoder, &max_size); + + INFO("%s: Forward BUFT MAX SIZE --> %zu ", __func__, max_size); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return max_size; +} + +bool +apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST); + + vn_encode_ggml_buffer_type(encoder, buft); + + REMOTE_CALL(gpu, encoder, decoder); + + bool is_host; + vn_decode_bool_t(decoder, &is_host); + + INFO("%s: buffer is host? 
%d", __func__, is_host); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return is_host; +} + +apir_buffer_context_t +apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_backend_buffer_type_t buft, size_t size) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + apir_buffer_context_t buffer_context; + INFO("%s: allocate device memory (%lu)", __func__, size); + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER); + + vn_encode_ggml_buffer_type(encoder, buft); + + vn_encode_size_t(encoder, &size); + + REMOTE_CALL(gpu, encoder, decoder); + + vn_decode_apir_buffer_host_handle_t(decoder, &buffer_context.host_handle); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return buffer_context; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp new file mode 100644 index 0000000000000..04041ab5feb37 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp @@ -0,0 +1,145 @@ +#include "virtgpu-forward-impl.h" + +void * +apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_context_t *buffer_context) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_GET_BASE); + + vn_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); + + REMOTE_CALL(gpu, encoder, decoder); + + uintptr_t base; + vn_decode_uintptr_t(decoder, &base); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + //INFO("%s: received base %p\n", __func__, (void *) base); + + return (void *) base; +} + +void +apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, + ggml_tensor *tensor, const void *data, size_t offset, size_t size) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + +#if 0 + INFO("Calling (%p)->set_tensor(tensor=%p, data=%p, offset=%lu, size=%lu", + buffer_context->host_handle, tensor, data, offset, size); +#endif + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_SET_TENSOR); + + vn_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); + vn_encode_ggml_tensor(encoder, tensor); + + struct vn_renderer_shmem *shmem; + if (size > gpu->data_shmem->mmap_size) { + shmem = virtgpu_shmem_create(gpu, size); + //WARNING("%s: 0x%lx | %dkB | %dMB", __func__, size, (int)size/1024, (int)size/1024/1024); + if (!shmem) { + FATAL("Couldn't allocate the guest-host shared buffer :/"); + } + } else { + shmem = gpu->data_shmem; + } + + memcpy(shmem->mmap_ptr, data, size); + vn_encode_virtgpu_shmem_res_id(encoder, shmem->res_id); + + vn_encode_size_t(encoder, &offset); + vn_encode_size_t(encoder, &size); + + REMOTE_CALL(gpu, encoder, decoder); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + if (shmem != gpu->data_shmem) { + virtgpu_shmem_destroy(gpu, shmem->shmem); + } + + return; +} + +#if false +void +apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, + const ggml_tensor *tensor, void *data, size_t offset, size_t size) { + UNUSED(gpu); + UNUSED(tensor); + char *buffer_base_addr = (char *) buffer_context->shmem->mmap_ptr; + + memcpy(data, buffer_base_addr+offset, size); +} +#else +void +apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, + const ggml_tensor *tensor, void *data, size_t offset, size_t size) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_GET_TENSOR); + + 
vn_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); + vn_encode_ggml_tensor(encoder, tensor); + + struct vn_renderer_shmem *shmem; + if (size > gpu->data_shmem->mmap_size) { + shmem = virtgpu_shmem_create(gpu, size); + WARNING("%s: 0x%lx | %dkB | %dMB", __func__, size, (int)size/1024, (int)size/1024/1024); + if (!shmem) { + FATAL("Couldn't allocate the guest-host shared buffer :/"); + } + } else { + shmem = gpu->data_shmem; + } + + vn_encode_virtgpu_shmem_res_id(encoder, shmem->res_id); + vn_encode_size_t(encoder, &offset); + vn_encode_size_t(encoder, &size); + + REMOTE_CALL(gpu, encoder, decoder); + + memcpy(data, shmem->mmap_ptr, size); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + if (shmem != gpu->data_shmem) { + virtgpu_shmem_destroy(gpu, shmem->shmem); + } +} +#endif + +void +apir_buffer_clear(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, + uint8_t value) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_CLEAR); + + vn_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); + vn_encode_uint8_t(encoder, &value); + + REMOTE_CALL(gpu, encoder, decoder); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); +} + + +void +apir_buffer_free_buffer(struct virtgpu *gpu, apir_buffer_context_t *buffer_context) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER); + + vn_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); + + REMOTE_CALL(gpu, encoder, decoder); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp new file mode 100644 index 0000000000000..ca036366a6752 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp @@ -0,0 +1,236 @@ +#include "virtgpu-forward-impl.h" + +int +apir_device_get_count(struct virtgpu *gpu) { + static int32_t dev_count = -1; + if (dev_count != -1) { + CACHED; + return dev_count; + } + + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_COUNT); + REMOTE_CALL(gpu, encoder, decoder); + + vn_decode_int32_t(decoder, &dev_count); + + INFO("%s: Forward DEV COUNT --> %d ", __func__, dev_count); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return dev_count; +} + +const char * +apir_device_get_name(struct virtgpu *gpu) { + static char *string = nullptr; + if (string) { + CACHED; + return string; + } + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_NAME); + REMOTE_CALL(gpu, encoder, decoder); + + const size_t string_size = vn_decode_array_size_unchecked(decoder); + string = (char *) vn_cs_decoder_alloc_array(decoder, sizeof(char), string_size); + if (!string) { + FATAL("%s: Could not allocate the device name buffer", __func__); + } + vn_decode_char_array(decoder, string, string_size); + + INFO("%s: Forward DEV NAME --> %s", __func__, string); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return string; +} + +const char * +apir_device_get_description(struct virtgpu *gpu) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION); + + REMOTE_CALL(gpu, encoder, decoder); + + const size_t string_size = 
vn_decode_array_size_unchecked(decoder); + char *string = (char *) vn_cs_decoder_alloc_array(decoder, sizeof(char), string_size); + if (!string) { + FATAL("%s: Could not allocate the device description buffer", __func__); + } + vn_decode_char_array(decoder, string, string_size); + + INFO("%s: Forward DEV DESCR --> %s", __func__, string); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return string; +} + +uint32_t +apir_device_get_type(struct virtgpu *gpu) { + static uint32_t dev_type = 255; + if (dev_type != 255) { + CACHED; + return dev_type; + } + + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_TYPE); + + REMOTE_CALL(gpu, encoder, decoder); + + vn_decode_uint32_t(decoder, &dev_type); + + INFO("%s: Forward DEV TYPE --> %d ", __func__, dev_type); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return dev_type; +} + +void +apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total) { + static size_t dev_free = 0; + static size_t dev_total = 0; + /* + if (dev_total != 0) { + WARNING("Not sure if llama.cpp expects fresh information for the free memory ..."); + *free = dev_free; + *total = dev_total; + + CACHED; + return; + } + */ + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_MEMORY); + + REMOTE_CALL(gpu, encoder, decoder); + + vn_decode_size_t(decoder, &dev_free); + vn_decode_size_t(decoder, &dev_total); + + *free = dev_free; + *total = dev_total; + + INFO("%s: Forward DEV FREE mem --> %zu MB", __func__, dev_free / 1024 / 1024); + INFO("%s: Forward DEV TOTAL mem --> %zu MB", __func__, dev_total / 1024 / 1024); + + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return; +} + +bool +apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op) { +#if APIR_DEVICE_SUPPORTS_OP_ALWAYS_TRUE + /* ggml-rpc cheats it like this */ + /* with the current implementation of serialize_tensor, the src/view aren't properly passed */ + UNUSED(gpu); + UNUSED(op); + + return true; +#else + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP); + + vn_encode_ggml_tensor_inline(encoder, op); + + REMOTE_CALL(gpu, encoder, decoder); + + bool supports_op; + vn_decode_bool_t(decoder, &supports_op); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return supports_op; +#endif +} + +apir_buffer_type_host_handle_t +apir_device_get_buffer_type(struct virtgpu *gpu) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE); + + REMOTE_CALL(gpu, encoder, decoder); + + apir_buffer_type_host_handle_t buft_handle; + vn_decode_apir_buffer_type_host_handle_t(decoder, &buft_handle); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return buft_handle; +} + +void +apir_device_get_props(struct virtgpu *gpu, + bool *async, + bool *host_buffer, + bool *buffer_from_host_ptr, + bool *events) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_PROPS); + + REMOTE_CALL(gpu, encoder, decoder); + + vn_decode_bool_t(decoder, async); + vn_decode_bool_t(decoder, host_buffer); + vn_decode_bool_t(decoder, buffer_from_host_ptr); + vn_decode_bool_t(decoder, events); + + /* *** */ + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return; +} + +apir_buffer_context_t +apir_device_buffer_from_ptr(struct 
virtgpu *gpu,
+                            size_t size,
+                            size_t max_tensor_size) {
+    struct vn_cs_encoder *encoder;
+    struct vn_cs_decoder *decoder;
+    apir_buffer_context_t buffer_context;
+
+    REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR);
+
+    /* *** */
+
+    buffer_context.shmem = virtgpu_shmem_create(gpu, size);
+    if (!buffer_context.shmem) {
+        FATAL("Couldn't allocate the guest-host shared buffer :/");
+    }
+
+    vn_encode_virtgpu_shmem_res_id(encoder, buffer_context.shmem->res_id);
+
+    vn_encode_size_t(encoder, &size);
+    vn_encode_size_t(encoder, &max_tensor_size);
+
+    REMOTE_CALL(gpu, encoder, decoder);
+
+    vn_decode_apir_buffer_host_handle_t(decoder, &buffer_context.host_handle);
+    buffer_context.buft_host_handle = vn_decode_apir_buffer_type_host_handle(decoder);
+
+    /* *** */
+
+    REMOTE_CALL_FINISH(gpu, encoder, decoder);
+
+    return buffer_context;
+}
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h
new file mode 100644
index 0000000000000..26510b20bc479
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h
@@ -0,0 +1,34 @@
+#include "ggml-backend-impl.h"
+#include "ggml-remoting.h"
+#include "virtgpu.h"
+#include "../ggml-remotingbackend/shared/apir_backend.h"
+#include "../ggml-remotingbackend/shared/venus_cs_ggml.h"
+
+#define CACHED
+// printf("INFO: ### found response in the cache %s\n", __func__)
+
+
+#define REMOTE_CALL_PREPARE(gpu_dev_name, encoder_name, apir_command_type__) \
+  do { \
+    int32_t forward_flag = (int32_t) apir_command_type__; \
+    encoder_name = remote_call_prepare(gpu_dev_name, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); \
+    if (!encoder_name) { \
+      FATAL("%s: failed to prepare the remote call encoder :/", __func__); \
+    } \
+  } while(0)
+
+#define REMOTE_CALL(gpu_dev_name, encoder_name, decoder_name) \
+  do { \
+    decoder_name = remote_call(gpu_dev_name, encoder_name); \
+    if (!decoder_name) { \
+      FATAL("%s: failed to kick the remote call :/", __func__); \
+    } \
+  } while(0)
+
+#define REMOTE_CALL_FINISH(gpu_dev_name, encoder_name, decoder_name) \
+  do { \
+    int32_t ret = remote_call_finish(encoder_name, decoder_name); \
+    if (ret != 0) { \
+      FATAL("%s: failed to forward the API call (code=%d) :/", __func__, ret); \
+    } \
+  } while(0)
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h
new file mode 100644
index 0000000000000..cc159e071e218
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h
@@ -0,0 +1,46 @@
+#include "ggml.h"
+#include "ggml-impl.h"
+#include "ggml-alloc.h"
+
+#include "virtgpu-utils.h"
+
+#include "../ggml-remotingbackend/shared/apir_backend.h"
+
+/* device */
+int apir_device_get_count(struct virtgpu *gpu);
+const char *apir_device_get_name(struct virtgpu *gpu);
+const char *apir_device_get_description(struct virtgpu *gpu);
+uint32_t apir_device_get_type(struct virtgpu *gpu);
+void apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total);
+bool apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op);
+apir_buffer_type_host_handle_t apir_device_get_buffer_type(struct virtgpu *gpu);
+void apir_device_get_props(struct virtgpu *gpu,
+                           bool *async,
+                           bool *host_buffer,
+                           bool *buffer_from_host_ptr,
+                           bool *events);
+apir_buffer_context_t apir_device_buffer_from_ptr(struct virtgpu *gpu,
+                                                  size_t size,
+                                                  size_t max_tensor_size);
+/* buffer-type */
+const char *apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_type_t buft);
+size_t
apir_buffer_type_get_alignment(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); +size_t apir_buffer_type_get_max_size(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); +bool apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); +apir_buffer_context_t apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_backend_buffer_type_t buffer_buft, size_t size); + +/* buffer */ + +void *apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_context_t *buffer_context); +enum ggml_status apir_buffer_init_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, ggml_tensor *tensor); +void apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, + ggml_tensor *tensor, const void *data, size_t offset, size_t size); +void apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, + const ggml_tensor *tensor, void *data, size_t offset, size_t size); +void apir_buffer_clear(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, + uint8_t value); +void apir_buffer_free_buffer(struct virtgpu *gpu, apir_buffer_context_t *buffer_context); + +/* backend */ + +ggml_status apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph); diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp new file mode 100644 index 0000000000000..a09fd22371a8c --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp @@ -0,0 +1,111 @@ +#include + +#include "virtgpu-shm.h" + +static uint32_t +virtgpu_ioctl_resource_create_blob(struct virtgpu *gpu, + uint32_t blob_mem, + uint32_t blob_flags, + size_t blob_size, + uint64_t blob_id, + uint32_t *res_id) +{ +#ifdef SIMULATE_BO_SIZE_FIX + blob_size = align64(blob_size, 4096); +#endif + + struct drm_virtgpu_resource_create_blob args = { + .blob_mem = blob_mem, + .blob_flags = blob_flags, + .bo_handle = 0, + .res_handle = 0, + .size = blob_size, + .pad = 0, + .cmd_size = 0, + .cmd = 0, + .blob_id = blob_id, + }; + + if (virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_RESOURCE_CREATE_BLOB, &args)) + return 0; + + *res_id = args.res_handle; + return args.bo_handle; +} + +static void +virtgpu_ioctl_gem_close(struct virtgpu *gpu, uint32_t gem_handle) +{ + struct drm_gem_close args = { + .handle = gem_handle, + .pad = 0, + }; + + const int ret = virtgpu_ioctl(gpu, DRM_IOCTL_GEM_CLOSE, &args); + assert(!ret); +#ifdef NDEBUG + UNUSED(ret); +#endif +} + +static void * +virtgpu_ioctl_map(struct virtgpu *gpu, uint32_t gem_handle, size_t size) +{ + struct drm_virtgpu_map args = { + .offset = 0, + .handle = gem_handle, + .pad = 0, + }; + + if (virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_MAP, &args)) + return NULL; + + void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, gpu->fd, + args.offset); + if (ptr == MAP_FAILED) + return NULL; + + return ptr; +} + +void +virtgpu_shmem_destroy(struct virtgpu *gpu, + struct virtgpu_shmem *shmem) +{ + munmap(shmem->base.mmap_ptr, shmem->base.mmap_size); + virtgpu_ioctl_gem_close(gpu, shmem->gem_handle); +} + +struct vn_renderer_shmem * +virtgpu_shmem_create(struct virtgpu *gpu, size_t size) +{ + size = align64(size, 16384); + + uint32_t res_id; + uint32_t gem_handle = virtgpu_ioctl_resource_create_blob( + gpu, gpu->shmem_blob_mem, VIRTGPU_BLOB_FLAG_USE_MAPPABLE, size, 0, + &res_id); + if (!gem_handle) + return NULL; + + void *ptr = virtgpu_ioctl_map(gpu, gem_handle, size); + if (!ptr) { + virtgpu_ioctl_gem_close(gpu, gem_handle); + return NULL; + } + if (gpu->shmem_array.elem_size == 0) { + 
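+    // Defensive check: the sparse array is initialized by create_virtgpu();
+    // util_sparse_array_get() below relies on elem_size to size its nodes.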
INFO("gpu->shmem_array.elem_size == 0 | Not working :/\n"); + assert(false); + } + struct virtgpu_shmem *shmem = (struct virtgpu_shmem *) util_sparse_array_get(&gpu->shmem_array, gem_handle); + + shmem->gem_handle = gem_handle; + shmem->base.res_id = res_id; + shmem->base.mmap_size = size; + shmem->base.mmap_ptr = ptr; + shmem->base.refcount.count = 1; + shmem->base.gem_handle = gem_handle; + shmem->base.shmem = shmem; + + return &shmem->base; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-shm.h b/ggml/src/ggml-remotingfrontend/virtgpu-shm.h new file mode 100644 index 0000000000000..52217f5b7e857 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-shm.h @@ -0,0 +1,39 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "virtgpu.h" +#include "virtgpu-utils.h" + +struct vn_refcount { + int count; //atomic_int +}; + + +struct vn_renderer_shmem { + struct vn_refcount refcount; + + uint32_t res_id; + size_t mmap_size; /* for internal use only (i.e., munmap) */ + void *mmap_ptr; + + struct list_head cache_head; + int64_t cache_timestamp; + + uint32_t gem_handle; + + struct virtgpu_shmem *shmem; +}; + +struct vn_renderer_shmem *virtgpu_shmem_create(struct virtgpu *gpu, size_t size); +void virtgpu_shmem_destroy(struct virtgpu *gpu, struct virtgpu_shmem *shmem); + + +struct virtgpu_shmem { + struct vn_renderer_shmem base; + uint32_t gem_handle; +}; diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp new file mode 100644 index 0000000000000..833f0e4680103 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp @@ -0,0 +1,200 @@ +#include "virtgpu-utils.h" +#include +#include +#include + +#define NODE_ALLOC_ALIGN 64 +#define NODE_PTR_MASK (~((uintptr_t)NODE_ALLOC_ALIGN - 1)) +#define NODE_LEVEL_MASK ((uintptr_t)NODE_ALLOC_ALIGN - 1) +#define NULL_NODE 0 + +#define os_malloc_aligned(_size, _align) _aligned_malloc(_size, _align) +#define os_free_aligned(_ptr) free(_ptr) +#define p_atomic_cmpxchg(v, old, _new) \ + __sync_val_compare_and_swap((v), (old), (_new)) + +static inline uint64_t +util_logbase2_64(uint64_t n) +{ +#if defined(HAVE___BUILTIN_CLZLL) + return ((sizeof(uint64_t) * 8 - 1) - __builtin_clzll(n | 1)); +#else + uint64_t pos = 0ull; + if (n >= 1ull<<32) { n >>= 32; pos += 32; } + if (n >= 1ull<<16) { n >>= 16; pos += 16; } + if (n >= 1ull<< 8) { n >>= 8; pos += 8; } + if (n >= 1ull<< 4) { n >>= 4; pos += 4; } + if (n >= 1ull<< 2) { n >>= 2; pos += 2; } + if (n >= 1ull<< 1) { pos += 1; } + return pos; +#endif +} + +void +util_sparse_array_init(struct util_sparse_array *arr, + size_t elem_size, size_t node_size) +{ + memset(arr, 0, sizeof(*arr)); + arr->elem_size = elem_size; + arr->node_size_log2 = util_logbase2_64(node_size); + assert(node_size >= 2 && node_size == (1ull << arr->node_size_log2)); +} + +static inline void * +os_malloc_aligned(size_t size, size_t alignment) +{ + void *ptr; + alignment = (alignment + sizeof(void*) - 1) & ~(sizeof(void*) - 1); + if(posix_memalign(&ptr, alignment, size) != 0) + return NULL; + return ptr; +} + +static inline void * +_util_sparse_array_node_data(uintptr_t handle) +{ + return (void *)(handle & NODE_PTR_MASK); +} + +static inline unsigned +_util_sparse_array_node_level(uintptr_t handle) +{ + return handle & NODE_LEVEL_MASK; +} + +static inline void +_util_sparse_array_node_finish(struct util_sparse_array *arr, + uintptr_t node) +{ + if (_util_sparse_array_node_level(node) > 0) { + uintptr_t *children = (uintptr_t *) 
_util_sparse_array_node_data(node); + size_t node_size = 1ull << arr->node_size_log2; + for (size_t i = 0; i < node_size; i++) { + if (children[i]) + _util_sparse_array_node_finish(arr, children[i]); + } + } + + os_free_aligned(_util_sparse_array_node_data(node)); +} + +static inline uintptr_t +_util_sparse_array_node(void *data, unsigned level) +{ + assert(data != NULL); + assert(((uintptr_t)data & NODE_LEVEL_MASK) == 0); + assert((level & NODE_PTR_MASK) == 0); + return (uintptr_t)data | level; +} + +inline uintptr_t +_util_sparse_array_node_alloc(struct util_sparse_array *arr, + unsigned level) +{ + size_t size; + if (level == 0) { + size = arr->elem_size << arr->node_size_log2; + } else { + size = sizeof(uintptr_t) << arr->node_size_log2; + } + + void *data = os_malloc_aligned(size, NODE_ALLOC_ALIGN); + memset(data, 0, size); + + return _util_sparse_array_node(data, level); +} + +static inline uintptr_t +_util_sparse_array_set_or_free_node(uintptr_t *node_ptr, + uintptr_t cmp_node, + uintptr_t node) +{ + uintptr_t prev_node = p_atomic_cmpxchg(node_ptr, cmp_node, node); + + if (prev_node != cmp_node) { + /* We lost the race. Free this one and return the one that was already + * allocated. + */ + os_free_aligned(_util_sparse_array_node_data(node)); + return prev_node; + } else { + return node; + } +} + +void * +util_sparse_array_get(struct util_sparse_array *arr, uint64_t idx) +{ + const unsigned node_size_log2 = arr->node_size_log2; + uintptr_t root = p_atomic_read(&arr->root); + if (unlikely(!root)) { + unsigned root_level = 0; + uint64_t idx_iter = idx >> node_size_log2; + while (idx_iter) { + idx_iter >>= node_size_log2; + root_level++; + } + uintptr_t new_root = _util_sparse_array_node_alloc(arr, root_level); + root = _util_sparse_array_set_or_free_node(&arr->root, + NULL_NODE, new_root); + } + + while (1) { + unsigned root_level = _util_sparse_array_node_level(root); + uint64_t root_idx = idx >> (root_level * node_size_log2); + if (likely(root_idx < (1ull << node_size_log2))) + break; + + /* In this case, we have a root but its level is low enough that the + * requested index is out-of-bounds. + */ + uintptr_t new_root = _util_sparse_array_node_alloc(arr, root_level + 1); + + uintptr_t *new_root_children = (uintptr_t *) _util_sparse_array_node_data(new_root); + new_root_children[0] = root; + + /* We only add one at a time instead of the whole tree because it's + * easier to ensure correctness of both the tree building and the + * clean-up path. Because we're only adding one node we never have to + * worry about trying to free multiple things without freeing the old + * things. 
+ */ + root = _util_sparse_array_set_or_free_node(&arr->root, root, new_root); + } + + void *node_data = _util_sparse_array_node_data(root); + unsigned node_level = _util_sparse_array_node_level(root); + while (node_level > 0) { + uint64_t child_idx = (idx >> (node_level * node_size_log2)) & + ((1ull << node_size_log2) - 1); + + uintptr_t *children = (uintptr_t *) node_data; + uintptr_t child = p_atomic_read(&children[child_idx]); + + if (unlikely(!child)) { + child = _util_sparse_array_node_alloc(arr, node_level - 1); + child = _util_sparse_array_set_or_free_node(&children[child_idx], + NULL_NODE, child); + } + + node_data = _util_sparse_array_node_data(child); + node_level = _util_sparse_array_node_level(child); + } + + uint64_t elem_idx = idx & ((1ull << node_size_log2) - 1); + return (void *)((char *)node_data + (elem_idx * arr->elem_size)); +} + +void *something = NULL; +void thks_bye () { + // break here + INFO("thks bye, stopping early and happilly :)"); + if (!something) { // avoid the [[noreturn]] detection mechanism + exit(0); + } +} + +void breakpoint() { + // break here + INFO("breakpoint here :)"); +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h new file mode 100644 index 0000000000000..6b69ebc6329ca --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h @@ -0,0 +1,108 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define unlikely(x) __builtin_expect(!!(x), 0) +#define likely(x) __builtin_expect(!!(x), 1) + +#ifndef UNUSED +#define UNUSED(x) (void)(x) +#endif + +/** Checks is a value is a power of two. Does not handle zero. */ +#define IS_POT(v) (((v) & ((v) - 1)) == 0) + +/** Checks is a value is a power of two. Zero handled. */ +#define IS_POT_NONZERO(v) ((v) != 0 && IS_POT(v)) + +/** Align a value to a power of two */ +#define ALIGN_POT(x, pot_align) (((x) + (pot_align) - 1) & ~((pot_align) - 1)) + +#define p_atomic_read(_v) __atomic_load_n((_v), __ATOMIC_ACQUIRE) + +void thks_bye(); +void breakpoint(); + +#ifndef NDEBUG +inline void +INFO(const char *format, ...) { + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); +} +#else +inline void +INFO(...) {} +#endif + +inline void +WARNING(const char *format, ...) { + fprintf(stderr, "WARNING: "); + + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); +} + +inline void +FATAL(const char *format, ...) 
{ + fprintf(stderr, "FATAL: "); + + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); + assert(false); +} + +static inline bool +util_is_power_of_two_nonzero64(uint64_t v) +{ + return IS_POT_NONZERO(v); +} + +static inline uint64_t +align64(uint64_t value, uint64_t alignment) +{ + assert(util_is_power_of_two_nonzero64(alignment)); + return ALIGN_POT(value, alignment); +} + +struct list_head +{ + struct list_head *prev; + struct list_head *next; +}; + +struct util_sparse_array { + size_t elem_size; + unsigned node_size_log2; + + uintptr_t root; +}; + +void *util_sparse_array_get(struct util_sparse_array *arr, uint64_t idx); +void util_sparse_array_init(struct util_sparse_array *arr, + size_t elem_size, size_t node_size); + +inline void +os_time_sleep(int64_t usecs) +{ + struct timespec time; + time.tv_sec = usecs / 1000000; + time.tv_nsec = (usecs % 1000000) * 1000; + while (clock_nanosleep(CLOCK_MONOTONIC, 0, &time, &time) == EINTR); +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp new file mode 100644 index 0000000000000..66bbf17ac6d63 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -0,0 +1,497 @@ +#include +#include +#include +#include + +#include + +#include "virtgpu.h" + +static virt_gpu_result_t virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev); +static virt_gpu_result_t virtgpu_open(struct virtgpu *gpu); + + +static virt_gpu_result_t virtgpu_init_params(struct virtgpu *gpu); +static virt_gpu_result_t virtgpu_init_capset(struct virtgpu *gpu); +static virt_gpu_result_t virtgpu_init_context(struct virtgpu *gpu); + +static int virtgpu_ioctl_context_init(struct virtgpu *gpu, + enum virgl_renderer_capset capset_id); +static int +virtgpu_ioctl_get_caps(struct virtgpu *gpu, + enum virgl_renderer_capset id, + uint32_t version, + void *capset, + size_t capset_size); +static uint64_t virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param); +static void virtgpu_init_renderer_info(struct virtgpu *gpu); + +static inline void +virtgpu_init_shmem_blob_mem(struct virtgpu *gpu) +{ + /* VIRTGPU_BLOB_MEM_GUEST allocates from the guest system memory. They are + * logically contiguous in the guest but are sglists (iovecs) in the host. + * That makes them slower to process in the host. With host process + * isolation, it also becomes impossible for the host to access sglists + * directly. + * + * While there are ideas (and shipped code in some cases) such as creating + * udmabufs from sglists, or having a dedicated guest heap, it seems the + * easiest way is to reuse VIRTGPU_BLOB_MEM_HOST3D. That is, when the + * renderer sees a request to export a blob where + * + * - blob_mem is VIRTGPU_BLOB_MEM_HOST3D + * - blob_flags is VIRTGPU_BLOB_FLAG_USE_MAPPABLE + * - blob_id is 0 + * + * it allocates a host shmem. + * + * supports_blob_id_0 has been enforced by mandated render server config. 
+ */ + assert(gpu->capset.data.supports_blob_id_0); + gpu->shmem_blob_mem = VIRTGPU_BLOB_MEM_HOST3D; +} + +struct virtgpu * +create_virtgpu() { + struct virtgpu *gpu = new struct virtgpu(); + + util_sparse_array_init(&gpu->shmem_array, sizeof(struct virtgpu_shmem), + 1024); + + virt_gpu_result_t result = virtgpu_open(gpu); + assert(result == APIR_SUCCESS); + + result = virtgpu_init_params(gpu); + assert(result == APIR_SUCCESS); + + result = virtgpu_init_capset(gpu); + assert(result == APIR_SUCCESS); + + result = virtgpu_init_context(gpu); + assert(result == APIR_SUCCESS); + +#ifdef NDEBUG + UNUSED(result); +#endif + + virtgpu_init_shmem_blob_mem(gpu); + + gpu->reply_shmem = virtgpu_shmem_create(gpu, 0x4000); + gpu->data_shmem = virtgpu_shmem_create(gpu, 0x1830000); // 24MiB + + if (!gpu->reply_shmem) { + FATAL("%s: failed to create the shared reply memory pages :/", __func__); + } + + if (!gpu->data_shmem) { + FATAL("%s: failed to create the shared data memory pages :/", __func__); + } + + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + int32_t ret; + + encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_LoadLibrary, 0); + if (!encoder) { + FATAL("%s: failed to prepare the remote call encoder :/", __func__); + } + decoder = remote_call(gpu, encoder); + if (!decoder) { + FATAL("%s: failed to kick the remote call :/", __func__); + } + + ret = remote_call_finish(encoder, decoder); + if (ret != 0) { + FATAL("%s: failed to load the APIR backend libraries (code=%d):/", __func__, ret); + } + + return gpu; +} + + +static virt_gpu_result_t +virtgpu_open(struct virtgpu *gpu) +{ + drmDevicePtr devs[8]; + int count = drmGetDevices2(0, devs, ARRAY_SIZE(devs)); + if (count < 0) { + INFO("failed to enumerate DRM devices"); + return APIR_ERROR_INITIALIZATION_FAILED; + } + + virt_gpu_result_t result = APIR_ERROR_INITIALIZATION_FAILED; + for (int i = 0; i < count; i++) { + result = virtgpu_open_device(gpu, devs[i]); + if (result == APIR_SUCCESS) + break; + } + + drmFreeDevices(devs, count); + + return result; +} + +static virt_gpu_result_t +virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev) +{ + bool supported_bus = false; + + switch (dev->bustype) { + case DRM_BUS_PCI: + if (dev->deviceinfo.pci->vendor_id == VIRTGPU_PCI_VENDOR_ID && + dev->deviceinfo.pci->device_id == VIRTGPU_PCI_DEVICE_ID) + supported_bus = true; + break; + case DRM_BUS_PLATFORM: + supported_bus = true; + break; + default: + break; + } + + if (!supported_bus || !(dev->available_nodes & (1 << DRM_NODE_RENDER))) { + if (VN_DEBUG(INIT)) { + const char *name = "unknown"; + for (uint32_t i = 0; i < DRM_NODE_MAX; i++) { + if (dev->available_nodes & (1 << i)) { + name = dev->nodes[i]; + break; + } + } + vn_log(gpu->instance, "skipping DRM device %s", name); + } + return APIR_ERROR_INITIALIZATION_FAILED; + } + + const char *primary_path = dev->nodes[DRM_NODE_PRIMARY]; + const char *node_path = dev->nodes[DRM_NODE_RENDER]; + + int fd = open(node_path, O_RDWR | O_CLOEXEC); + if (fd < 0) { + if (VN_DEBUG(INIT)) + vn_log(gpu->instance, "failed to open %s", node_path); + return APIR_ERROR_INITIALIZATION_FAILED; + } + + drmVersionPtr version = drmGetVersion(fd); + if (!version || strcmp(version->name, "virtio_gpu") || + version->version_major != 0) { + if (VN_DEBUG(INIT)) { + if (version) { + vn_log(gpu->instance, "unknown DRM driver %s version %d", + version->name, version->version_major); + } else { + vn_log(gpu->instance, "failed to get DRM driver version"); + } + } + if (version) + drmFreeVersion(version); + 
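+  // Not a virtio_gpu node with the expected major version: release the
+  // fd and let virtgpu_open() move on to the next enumerated DRM device.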
close(fd); + return APIR_ERROR_INITIALIZATION_FAILED; + } + + gpu->fd = fd; + + struct stat st; + if (stat(primary_path, &st) == 0) { + gpu->has_primary = true; + gpu->primary_major = major(st.st_rdev); + gpu->primary_minor = minor(st.st_rdev); + } else { + gpu->has_primary = false; + gpu->primary_major = 0; + gpu->primary_minor = 0; + } + stat(node_path, &st); + gpu->render_major = major(st.st_rdev); + gpu->render_minor = minor(st.st_rdev); + + gpu->bustype = dev->bustype; + if (dev->bustype == DRM_BUS_PCI) + gpu->pci_bus_info = *dev->businfo.pci; + + drmFreeVersion(version); + + INFO("using DRM device %s", node_path); + + return APIR_SUCCESS; +} + +void +vn_log(struct remoting_dev_instance *instance, const char *format, ...) +{ + if (instance) { + printf(""); + } + + va_list ap; + + va_start(ap, format); + vprintf(format, ap); + va_end(ap); + + /* instance may be NULL or partially initialized */ +} + +static virt_gpu_result_t +virtgpu_init_context(struct virtgpu *gpu) +{ + assert(!gpu->capset.version); + const int ret = virtgpu_ioctl_context_init(gpu, gpu->capset.id); + if (ret) { + if (VN_DEBUG(INIT)) { + vn_log(gpu->instance, "failed to initialize context: %s", + strerror(errno)); + } + return APIR_ERROR_INITIALIZATION_FAILED; + } + + return APIR_SUCCESS; +} + +static virt_gpu_result_t +virtgpu_init_capset(struct virtgpu *gpu) +{ + gpu->capset.id = VIRGL_RENDERER_CAPSET_VENUS; + gpu->capset.version = 0; + + const int ret = + virtgpu_ioctl_get_caps(gpu, gpu->capset.id, gpu->capset.version, + &gpu->capset.data, sizeof(gpu->capset.data)); + if (ret) { + if (VN_DEBUG(INIT)) { + vn_log(gpu->instance, "failed to get venus v%d capset: %s", + gpu->capset.version, strerror(errno)); + } + return APIR_ERROR_INITIALIZATION_FAILED; + } + + return APIR_SUCCESS; +} + +static virt_gpu_result_t +virtgpu_init_params(struct virtgpu *gpu) +{ + const uint64_t required_params[] = { + VIRTGPU_PARAM_3D_FEATURES, VIRTGPU_PARAM_CAPSET_QUERY_FIX, + VIRTGPU_PARAM_RESOURCE_BLOB, VIRTGPU_PARAM_CONTEXT_INIT, + }; + uint64_t val; + for (uint32_t i = 0; i < ARRAY_SIZE(required_params); i++) { + val = virtgpu_ioctl_getparam(gpu, required_params[i]); + if (!val) { + if (VN_DEBUG(INIT)) { + vn_log(gpu->instance, "required kernel param %d is missing", + (int)required_params[i]); + } + return APIR_ERROR_INITIALIZATION_FAILED; + } + } + + val = virtgpu_ioctl_getparam(gpu, VIRTGPU_PARAM_HOST_VISIBLE); + if (val) { + gpu->bo_blob_mem = VIRTGPU_BLOB_MEM_HOST3D; + } else { + val = virtgpu_ioctl_getparam(gpu, VIRTGPU_PARAM_GUEST_VRAM); + if (val) { + gpu->bo_blob_mem = VIRTGPU_BLOB_MEM_GUEST_VRAM; + } + } + + if (!val) { + vn_log(gpu->instance, + "one of required kernel params (%d or %d) is missing", + (int)VIRTGPU_PARAM_HOST_VISIBLE, (int)VIRTGPU_PARAM_GUEST_VRAM); + return APIR_ERROR_INITIALIZATION_FAILED; + } + + /* Cross-device feature is optional. It enables sharing dma-bufs + * with other virtio devices, like virtio-wl or virtio-video used + * by ChromeOS VMs. Qemu doesn't support cross-device sharing. 
+ */ + val = virtgpu_ioctl_getparam(gpu, VIRTGPU_PARAM_CROSS_DEVICE); + if (val) + gpu->supports_cross_device = true; + + /* implied by CONTEXT_INIT uapi */ + gpu->max_timeline_count = 64; + + return APIR_SUCCESS; +} + +static int +virtgpu_ioctl_context_init(struct virtgpu *gpu, + enum virgl_renderer_capset capset_id) +{ + struct drm_virtgpu_context_set_param ctx_set_params[3] = { + { + .param = VIRTGPU_CONTEXT_PARAM_CAPSET_ID, + .value = capset_id, + }, + { + .param = VIRTGPU_CONTEXT_PARAM_NUM_RINGS, + .value = 64, + }, + { + .param = VIRTGPU_CONTEXT_PARAM_POLL_RINGS_MASK, + .value = 0, /* don't generate drm_events on fence signaling */ + }, + }; + + struct drm_virtgpu_context_init args = { + .num_params = ARRAY_SIZE(ctx_set_params), + .pad = 0, + .ctx_set_params = (uintptr_t)&ctx_set_params, + }; + + return virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_CONTEXT_INIT, &args); +} + +static int +virtgpu_ioctl_get_caps(struct virtgpu *gpu, + enum virgl_renderer_capset id, + uint32_t version, + void *capset, + size_t capset_size) +{ + struct drm_virtgpu_get_caps args = { + .cap_set_id = id, + .cap_set_ver = version, + .addr = (uintptr_t)capset, + .size = (__u32) capset_size, + .pad = 0, + }; + + return virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_GET_CAPS, &args); +} + +static uint64_t +virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param) +{ + /* val must be zeroed because kernel only writes the lower 32 bits */ + uint64_t val = 0; + struct drm_virtgpu_getparam args = { + .param = param, + .value = (uintptr_t)&val, + }; + + const int ret = virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_GETPARAM, &args); + return ret ? 0 : val; +} + + +struct vn_cs_encoder * +remote_call_prepare( + struct virtgpu *gpu, + int32_t cmd_type, + int32_t cmd_flags) +{ + + if (!gpu->reply_shmem) { + FATAL("%s: the reply shmem page can't be null", __func__); + } + + /* + * Prepare the command encoder and its buffer + */ + + static char encoder_buffer[4096]; + + static struct vn_cs_encoder enc; + enc = { + encoder_buffer, + encoder_buffer, + encoder_buffer + sizeof(encoder_buffer), + }; + + /* + * Fill the command encoder with the common args: + * - cmd_type (int32_t) + * - cmd_flags (int32_t) + * - reply res id (uint32_t) + */ + + vn_encode_int32_t(&enc, &cmd_type); + vn_encode_int32_t(&enc, &cmd_flags); + + uint32_t reply_res_id = gpu->reply_shmem->res_id; + vn_encode_uint32_t(&enc, &reply_res_id); + + return &enc; +} + +int32_t +remote_call_finish(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + if (!enc) { + WARNING("Invalid (null) encoder :/"); + } + if (!dec) { + FATAL("Invalid (null) decoder :/"); + } + int32_t remote_call_ret; + vn_decode_int32_t(dec, &remote_call_ret); + + // encoder and decoder are statically allocated, nothing to do to release them + + return remote_call_ret; +} + +struct vn_cs_decoder * +remote_call( + struct virtgpu *gpu, + struct vn_cs_encoder *encoder + ) +{ + /* + * Prepare the reply notification pointer + */ + + volatile std::atomic_uint *atomic_reply_notif = (volatile std::atomic_uint *) gpu->reply_shmem->mmap_ptr; + *atomic_reply_notif = 0; + + /* + * Trigger the execbuf ioctl + */ + + struct drm_virtgpu_execbuffer args = { + .flags = VIRTGPU_EXECBUF_RING_IDX, + .size = (uint32_t) (encoder->cur - encoder->start), + .command = (uintptr_t) encoder->start, + + .bo_handles = 0, + .num_bo_handles = 0, + + .fence_fd = 0, + .ring_idx = 0, + .syncobj_stride = 0, + .num_in_syncobjs = 0, + .num_out_syncobjs = 0, + .in_syncobjs = 0, + .out_syncobjs = 0, + }; + + int ret = drmIoctl(gpu->fd, 
DRM_IOCTL_VIRTGPU_EXECBUFFER, &args);
+
+  if (ret != 0) {
+    FATAL("%s: the virtgpu EXECBUFFER ioctl failed (%d) :/", __func__, ret);
+  }
+  /*
+   * Wait for the response notification
+   */
+
+  while (std::atomic_load_explicit(atomic_reply_notif, std::memory_order_acquire) == 0) {
+    int64_t base_sleep_us = 15;
+
+    os_time_sleep(base_sleep_us);
+  }
+
+  /*
+   * Prepare the decoder
+   */
+  static struct vn_cs_decoder dec;
+  dec.cur = (char *) gpu->reply_shmem->mmap_ptr + sizeof(*atomic_reply_notif);
+  dec.end = (char *) gpu->reply_shmem->mmap_ptr + gpu->reply_shmem->mmap_size;
+
+  return &dec;
+}
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.h b/ggml/src/ggml-remotingfrontend/virtgpu.h
new file mode 100644
index 0000000000000..9d8668c3d070e
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/virtgpu.h
@@ -0,0 +1,117 @@
+#pragma once
+
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <threads.h>
+#include <xf86drm.h>
+
+#include "virtgpu-forward.h"
+#include "virtgpu-utils.h"
+#include "../ggml-remotingbackend/shared/api_remoting.h"
+#include "../ggml-remotingbackend/shared/venus_cs.h"
+
+#include "virtgpu-shm.h"
+
+#define VIRGL_RENDERER_UNSTABLE_APIS 1
+#include "drm-uapi/virtgpu_drm.h"
+#include "venus_hw.h"
+
+// must match https://gitlab.freedesktop.org/kpouget/virglrenderer/-/blob/main/src/virglrenderer_hw.h?ref_type=heads
+enum virgl_renderer_capset {
+  VIRGL_RENDERER_CAPSET_VIRGL = 1,
+  VIRGL_RENDERER_CAPSET_VIRGL2 = 2,
+  /* 3 is reserved for gfxstream */
+  VIRGL_RENDERER_CAPSET_VENUS = 4,
+  /* 5 is reserved for cross-domain */
+  VIRGL_RENDERER_CAPSET_DRM = 6,
+};
+
+/* from src/virtio/vulkan/vn_renderer_virtgpu.c */
+#define VIRTGPU_PCI_VENDOR_ID 0x1af4
+#define VIRTGPU_PCI_DEVICE_ID 0x1050
+#define VIRTGPU_BLOB_MEM_GUEST_VRAM 0x0004
+#define VIRTGPU_PARAM_GUEST_VRAM 9
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+#define VN_DEBUG(what) true
+
+typedef enum virt_gpu_result_t {
+  APIR_SUCCESS = 0,
+  APIR_ERROR_INITIALIZATION_FAILED = -1,
+} virt_gpu_result_t;
+
+
+struct remoting_dev_instance {
+  int yes;
+};
+
+#define PRINTFLIKE(f, a) __attribute__ ((format(__printf__, f, a)))
+
+inline void
+vn_log(struct remoting_dev_instance *instance, const char *format, ...)
+ PRINTFLIKE(2, 3); + + +struct virtgpu { + struct remoting_dev_instance *instance; + + int fd; + + bool has_primary; + int primary_major; + int primary_minor; + int render_major; + int render_minor; + + int bustype; + drmPciBusInfo pci_bus_info; + + uint32_t max_timeline_count; + + struct { + enum virgl_renderer_capset id; + uint32_t version; + struct virgl_renderer_capset_venus data; + } capset; + + uint32_t shmem_blob_mem; + uint32_t bo_blob_mem; + + /* note that we use gem_handle instead of res_id to index because + * res_id is monotonically increasing by default (see + * virtio_gpu_resource_id_get) + */ + struct util_sparse_array shmem_array; + // struct util_sparse_array bo_array; + + mtx_t dma_buf_import_mutex; + + // struct virtgpu_shmem_cache shmem_cache; + + bool supports_cross_device; + + /* KP */ + struct vn_renderer_shmem *reply_shmem; + struct vn_renderer_shmem *data_shmem; +}; + + +static inline int +virtgpu_ioctl(struct virtgpu *gpu, unsigned long request, void *args) +{ + return drmIoctl(gpu->fd, request, args); +} + +struct virtgpu *create_virtgpu(); + +struct vn_cs_encoder *remote_call_prepare( + struct virtgpu *gpu, + int32_t cmd_type, + int32_t cmd_flags); +struct vn_cs_decoder *remote_call(struct virtgpu *gpu, struct vn_cs_encoder *enc); +int32_t remote_call_finish(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); diff --git a/podman_compile.sh b/podman_compile.sh new file mode 100755 index 0000000000000..de9e5c88d57a7 --- /dev/null +++ b/podman_compile.sh @@ -0,0 +1,39 @@ +#! /bin/bash + + +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace + +opts="" +opts="$opts --device /dev/dri " +echo "Running with the GPU passthrough" + +IMAGE=quay.io/ramalama/remoting:latest + +what=${1:-} +if [[ -z "$what" ]]; then + what=remoting +fi + +cmd="bash ./build.$what.sh" + +POD_NAME=mac_ai_compiling +podman machine ssh podman rm $POD_NAME --force + +set -x +podman run \ +--name $POD_NAME \ +--user root:root \ +--cgroupns host \ +--security-opt label=disable \ +--env HOME="$HOME" \ +--env PERF_MODE="${PERF_MODE:-}" \ +--env BENCH_MODE="${BENCH_MODE:-}" \ +-v "$HOME":"$HOME":Z \ +-w "$PWD" \ +-it --rm \ +$opts \ +$IMAGE \ +$cmd diff --git a/prepare.backend.sh b/prepare.backend.sh new file mode 100755 index 0000000000000..93373ec5b7a5c --- /dev/null +++ b/prepare.backend.sh @@ -0,0 +1,11 @@ +cmake -S . -B ../build.remoting-backend \ + -DGGML_REMOTINGBACKEND=ON \ + -DGGML_NATIVE=OFF \ + -DGGML_METAL=ON \ + -DGGML_BACKEND_DL=OFF \ + -DLLAMA_CURL=OFF \ + -DGGML_VULKAN=OFF -DVulkan_INCLUDE_DIR=/opt/homebrew/include/ -DVulkan_LIBRARY=/opt/homebrew/lib/libMoltenVK.dylib \ + "$@" + +# -DCMAKE_BUILD_TYPE=Debug \ +# diff --git a/prepare.remoting.sh b/prepare.remoting.sh new file mode 100755 index 0000000000000..5ab73470477b1 --- /dev/null +++ b/prepare.remoting.sh @@ -0,0 +1,8 @@ +cmake -S . -B ../build.remoting-frontend \ + -DGGML_REMOTINGFRONTEND=ON \ + -DGGML_CPU_ARM_ARCH=native \ + -DGGML_NATIVE=OFF \ + -DGGML_OPENMP=OFF \ + -DLLAMA_CURL=OFF \ + -DCMAKE_BUILD_TYPE=Debug \ + "$@" diff --git a/prepare.sh b/prepare.sh new file mode 100644 index 0000000000000..2fb46cefd426c --- /dev/null +++ b/prepare.sh @@ -0,0 +1 @@ +cmake -S . -B ./build -DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DGGML_METAL=OFF #-DCMAKE_BUILD_TYPE=Debug #-DGGML_VULKAN_DEBUG=1 diff --git a/prepare.vulkan.sh b/prepare.vulkan.sh new file mode 100644 index 0000000000000..7bacf9b21a9ca --- /dev/null +++ b/prepare.vulkan.sh @@ -0,0 +1,6 @@ +cmake -S . 
\
+    -B ../build.vulkan \
+    -DGGML_VULKAN=ON \
+    -DGGML_NATIVE=OFF \
+    -DGGML_METAL=OFF \
+    -DCMAKE_BUILD_TYPE=Debug
diff --git a/run.remoting.sh b/run.remoting.sh
new file mode 100755
index 0000000000000..11ac21f4b1b14
--- /dev/null
+++ b/run.remoting.sh
@@ -0,0 +1,50 @@
+#! /bin/bash
+#clear
+if [[ ${1:-} == "strace" ]]; then
+    prefix="strace"
+elif [[ ${1:-} == "gdb" ]]; then
+    prefix="gdb --args"
+else
+    prefix=""
+fi
+
+MODEL=${MODEL:-llama3.2}
+
+LLAMA_BUILD_DIR=../build.remoting-frontend$FLAVOR
+
+MODEL_HOME="$HOME/models"
+
+set -x
+if [[ "${BENCH_MODE:-}" == "bench" ]]; then
+    cat <<EOF
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
+#include "llama-kv-cache.h"
+
+#include "llama-impl.h"
+#include "llama-batch.h"
+#include "llama-cparams.h"
+#include "llama-model.h"
+
+#include <algorithm>
+#include <cassert>
+#include <limits>
+#include <map>
+#include <stdexcept>
+
+llama_kv_cache_unified::llama_kv_cache_unified(const llama_hparams & hparams, callbacks cbs) : hparams(hparams), cbs(std::move(cbs)) {
+}
+
+bool llama_kv_cache_unified::init(
+        const llama_model & model,
+        const llama_cparams & cparams,
+        ggml_type type_k,
+        ggml_type type_v,
+        uint32_t kv_size,
+        bool offload) {
+    const int32_t n_layer = hparams.n_layer;
+
+    has_shift = false;
+
+    recurrent = llama_model_is_recurrent(&model);
+    v_trans   = !recurrent && !cparams.flash_attn;
+    can_shift = !recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA
+
+    LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d\n",
+            __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, can_shift);
+
+    head = 0;
+    size = kv_size;
+    used = 0;
+
+    this->type_k = type_k;
+    this->type_v = type_v;
+
+    cells.clear();
+    cells.resize(kv_size);
+
+    // create a context for each buffer type
+    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
+        auto it = ctx_map.find(buft);
+        if (it == ctx_map.end()) {
+            ggml_init_params params = {
+                /*.mem_size   =*/ size_t(2u*n_layer*ggml_tensor_overhead()),
+                /*.mem_buffer =*/ NULL,
+                /*.no_alloc   =*/ true,
+            };
+
+            ggml_context * ctx = ggml_init(params);
+            if (!ctx) {
+                return nullptr;
+            }
+
+            ctx_map[buft] = ctx;
+            ctxs.emplace_back(ctx);
+
+            return ctx;
+        }
+
+        return it->second;
+    };
+
+    k_l.reserve(n_layer);
+    v_l.reserve(n_layer);
+
+    for (int i = 0; i < n_layer; i++) {
+        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
+        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
+
+        const char * dev_name = "CPU";
+
+        ggml_backend_buffer_type_t buft;
+        if (offload) {
+            auto * dev = model.dev_layer(i);
+            buft = ggml_backend_dev_buffer_type(dev);
+
+            dev_name = ggml_backend_dev_name(dev);
+        } else {
+            buft = ggml_backend_cpu_buffer_type();
+        }
+
+        /*
+        LLAMA_LOG_DEBUG("%s: layer %3d: n_embd_k_gqa = %d, n_embd_v_gqa = %d, dev = %s\n", __func__,
+                i, n_embd_k_gqa, n_embd_v_gqa, dev_name);
+        */
+
+        ggml_context * ctx = ctx_for_buft(buft);
+        if (!ctx) {
+            LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__);
+            return false;
+        }
+
+        ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
+        ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
+        ggml_format_name(k, "cache_k_l%d", i);
+        ggml_format_name(v, "cache_v_l%d", i);
+        k_l.push_back(k);
+        v_l.push_back(v);
+    }
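+
+    /*
+     * Note: each buffer type gets a single ggml context, sized for exactly
+     * two tensors (K and V) per layer and created with no_alloc = true:
+     * the contexts only hold tensor metadata, the backing memory is
+     * allocated per buffer type in the loop below.
+     */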
+    // allocate tensors and initialize the buffers to avoid NaNs in the padding
+    for (auto it : ctx_map) {
+        auto * buft = it.first;
+        auto * ctx  = it.second;
+
+        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+        if (!buf) {
+            LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
+            return false;
+        }
+        ggml_backend_buffer_clear(buf, 0);
+        LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
+        bufs.emplace_back(buf);
+    }
+
+    return true;
+}
+
+int32_t llama_kv_cache_unified::get_n_tokens() const {
+    int32_t result = 0;
+
+    for (uint32_t i = 0; i < size; i++) {
+        result += cells[i].seq_id.size();
+    }
+
+    return result;
+}
+
+int32_t llama_kv_cache_unified::get_used_cells() const {
+    return used;
+}
+
+size_t llama_kv_cache_unified::total_size() const {
+    size_t size = 0;
+    for (const auto & buf : bufs) {
+        size += ggml_backend_buffer_get_size(buf.get());
+    }
+
+    return size;
+}
+
+llama_pos llama_kv_cache_unified::pos_max() const {
+    llama_pos pos_max = -1;
+    for (const auto & cell : cells) {
+        pos_max = std::max(pos_max, cell.pos);
+    }
+
+    return pos_max;
+}
+
+void llama_kv_cache_unified::clear() {
+    for (int32_t i = 0; i < (int32_t) size; ++i) {
+        cells[i].pos = -1;
+        cells[i].seq_id.clear();
+        cells[i].src = -1;
+        cells[i].tail = -1;
+    }
+    head = 0;
+    used = 0;
+
+    for (auto & buf : bufs) {
+        ggml_backend_buffer_clear(buf.get(), 0);
+    }
+}
+
+bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+    uint32_t new_head = size;
+
+    if (p0 < 0) {
+        p0 = 0;
+    }
+
+    if (p1 < 0) {
+        p1 = std::numeric_limits<llama_pos>::max();
+    }
+
+    // models like Mamba or RWKV can't have a state partially erased
+    if (recurrent) {
+        if (seq_id >= (int64_t) size) {
+            // could be fatal
+            return false;
+        }
+        if (0 <= seq_id) {
+            int32_t & tail_id = cells[seq_id].tail;
+            if (tail_id >= 0) {
+                const llama_kv_cell & cell = cells[tail_id];
+                // partial intersection is invalid
+                if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) {
+                    return false;
+                }
+                // invalidate tails which will be cleared
+                if (p0 <= cell.pos && cell.pos < p1) {
+                    tail_id = -1;
+                }
+            }
+        } else {
+            // seq_id is negative, then the range should include everything or nothing
+            if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    for (uint32_t i = 0; i < size; ++i) {
+        if (cells[i].pos >= p0 && cells[i].pos < p1) {
+            if (seq_id < 0) {
+                cells[i].seq_id.clear();
+            } else if (cells[i].has_seq_id(seq_id)) {
+                cells[i].seq_id.erase(seq_id);
+            } else {
+                continue;
+            }
+            if (cells[i].is_empty()) {
+                // keep count of the number of used cells
+                if (cells[i].pos >= 0) {
+                    used--;
+                }
+
+                cells[i].pos = -1;
+                cells[i].src = -1;
+
+                if (new_head == size) {
+                    new_head = i;
+                }
+            }
+        }
+    }
+
+    // If we freed up a slot, set head to it so searching can start there.
+    if (new_head != size && new_head < head) {
+        head = new_head;
+    }
+
+    return true;
+}
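+
+/*
+ * seq_cp: for recurrent models the copy is O(1): the destination sequence
+ * just aliases the tail cell of the source sequence. On the unified
+ * (Transformer) path, every cell of seq_id_src in [p0, p1) is additionally
+ * tagged with seq_id_dst, so the K/V data is shared rather than duplicated.
+ */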
+void llama_kv_cache_unified::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+    if (seq_id_src == seq_id_dst) {
+        return;
+    }
+
+    if (p0 < 0) {
+        p0 = 0;
+    }
+
+    if (p1 < 0) {
+        p1 = std::numeric_limits<llama_pos>::max();
+    }
+
+    if (recurrent) {
+        if ((uint32_t) seq_id_dst < size && (uint32_t) seq_id_src < size) {
+            llama_kv_cell & tail_src = cells[seq_id_src];
+            llama_kv_cell & tail_dst = cells[seq_id_dst];
+            if (tail_dst.tail >= 0) {
+                // clear destination seq_id if it wasn't empty
+                llama_kv_cell & cell_dst = cells[tail_dst.tail];
+
+                cell_dst.seq_id.erase(seq_id_dst);
+                tail_dst.tail = -1;
+                if (cell_dst.seq_id.empty()) {
+                    cell_dst.pos = -1;
+                    cell_dst.delta = -1;
+                    cell_dst.src = -1;
+                    used -= 1;
+                }
+            }
+            if (tail_src.tail >= 0) {
+                llama_kv_cell & cell_src = cells[tail_src.tail];
+
+                cell_src.seq_id.insert(seq_id_dst);
+                tail_dst.tail = tail_src.tail;
+            }
+        }
+
+        return;
+    }
+
+    // otherwise, this is the KV of a Transformer-like model
+    head = 0;
+
+    for (uint32_t i = 0; i < size; ++i) {
+        if (cells[i].has_seq_id(seq_id_src) && cells[i].pos >= p0 && cells[i].pos < p1) {
+            cells[i].seq_id.insert(seq_id_dst);
+        }
+    }
+}
+
+void llama_kv_cache_unified::seq_keep(llama_seq_id seq_id) {
+    uint32_t new_head = size;
+
+    for (uint32_t i = 0; i < size; ++i) {
+        if (recurrent && (llama_seq_id) i != seq_id) {
+            cells[i].tail = -1;
+        }
+
+        if (!cells[i].has_seq_id(seq_id)) {
+            if (cells[i].pos >= 0) {
+                used--;
+            }
+
+            cells[i].pos = -1;
+            cells[i].src = -1;
+            cells[i].seq_id.clear();
+
+            if (new_head == size) {
+                new_head = i;
+            }
+        } else {
+            cells[i].seq_id.clear();
+            cells[i].seq_id.insert(seq_id);
+        }
+    }
+
+    // If we freed up a slot, set head to it so searching can start there.
+    if (new_head != size && new_head < head) {
+        head = new_head;
+    }
+}
+
+void llama_kv_cache_unified::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
+    if (delta == 0) {
+        return;
+    }
+
+    uint32_t new_head = size;
+
+    if (p0 < 0) {
+        p0 = 0;
+    }
+
+    if (p1 < 0) {
+        p1 = std::numeric_limits<llama_pos>::max();
+    }
+
+    // If there is no range then return early to avoid looping over the cache.
+    if (p0 == p1) {
+        return;
+    }
+
+    if (recurrent) {
+        // for Mamba-like or RWKV models, only the pos needs to be shifted
+        if (0 <= seq_id && seq_id < (int64_t) size) {
+            const int32_t tail_id = cells[seq_id].tail;
+            if (tail_id >= 0) {
+                llama_kv_cell & cell = cells[tail_id];
+                if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
+                    cell.pos += delta;
+                }
+            }
+        }
+        return;
+    }
+
+    for (uint32_t i = 0; i < size; ++i) {
+        if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) {
+            has_shift = true;
+            cells[i].pos   += delta;
+            cells[i].delta += delta;
+
+            if (cells[i].pos < 0) {
+                if (!cells[i].is_empty()) {
+                    used--;
+                }
+                cells[i].pos = -1;
+                cells[i].seq_id.clear();
+                if (new_head == size) {
+                    new_head = i;
+                }
+            }
+        }
+    }
+
+    // If we freed up a slot, set head to it so searching can start there.
+    // Otherwise we just start the next search from the beginning.
+    head = new_head != size ? new_head : 0;
+}
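+
+/*
+ * seq_add/seq_div only edit cell metadata: the position is updated in place
+ * and the difference is accumulated in `delta`. Setting `has_shift` makes
+ * the next KV-cache update pass apply the corresponding K-shift (RoPE
+ * rotation) to the cached K tensors.
+ */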
+void llama_kv_cache_unified::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+    if (d == 1) {
+        return;
+    }
+
+    if (p0 < 0) {
+        p0 = 0;
+    }
+
+    if (p1 < 0) {
+        p1 = std::numeric_limits<llama_pos>::max();
+    }
+
+    // If there is no range then return early to avoid looping over the cache.
+    if (p0 == p1) {
+        return;
+    }
+
+    if (recurrent) {
+        // for Mamba-like or RWKV models, only the pos needs to be changed
+        if (0 <= seq_id && seq_id < (int64_t) size) {
+            const int32_t tail_id = cells[seq_id].tail;
+            if (tail_id >= 0) {
+                llama_kv_cell & cell = cells[tail_id];
+                if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
+                    cell.pos /= d;
+                }
+            }
+        }
+
+        return;
+    }
+
+    for (uint32_t i = 0; i < size; ++i) {
+        if (cells[i].has_seq_id(seq_id) && cells[i].pos >= p0 && cells[i].pos < p1) {
+            has_shift = true;
+
+            {
+                llama_pos p_old = cells[i].pos;
+                cells[i].pos   /= d;
+                cells[i].delta += cells[i].pos - p_old;
+            }
+        }
+    }
+}
+
+llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
+    llama_pos result = 0;
+
+    for (uint32_t i = 0; i < size; ++i) {
+        if (cells[i].has_seq_id(seq_id)) {
+            result = std::max(result, cells[i].pos);
+        }
+    }
+
+    return result;
+}
+
+void llama_kv_cache_unified::defrag() {
+    if (!recurrent) {
+        do_defrag = true;
+    }
+}
+
+void llama_kv_cache_unified::restore() {
+    if (pending.ranges.empty()) {
+        return;
+    }
+
+    // TODO: tmp - move to llama_kv_cache_recurrent
+    if (recurrent) {
+        seq_rm(-1, -1, -1);
+        return;
+    }
+
+    uint32_t new_head = size;
+
+    for (auto & range : pending.ranges) {
+        for (uint32_t i = range.c0; i < range.c1; ++i) {
+            cells[i].seq_id.clear();
+
+            // keep count of the number of used cells
+            if (cells[i].pos >= 0) {
+                used--;
+            }
+
+            cells[i].pos = -1;
+            cells[i].src = -1;
+        }
+
+        new_head = std::min(new_head, range.c0);
+    }
+
+    if (new_head != size && new_head < head) {
+        head = new_head;
+    }
+}
+
+void llama_kv_cache_unified::commit() {
+    // TODO: tmp - move to llama_kv_cache_recurrent
+    if (recurrent) {
+        return;
+    }
+
+    if (pending.ranges.empty()) {
+        LLAMA_LOG_WARN("%s: no pending KV cache updates to commit - might indicate a bug (ref: %s)\n",
+                __func__, "https://github.com/ggml-org/llama.cpp/pull/12695");
+        return;
+    }
+
+    pending.ranges.clear();
+}
+
+bool llama_kv_cache_unified::get_can_shift() const {
+    return can_shift;
+}
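+
+/*
+ * find_slot: reserve cells for the incoming ubatch.
+ *  - recurrent path: one cell per sequence, gathered into the contiguous
+ *    range [min, max] so the batch can be addressed as head..head+n
+ *  - unified (Transformer) path: linear scan for n_tokens contiguous free
+ *    cells starting at `head`; the reserved range is recorded in `pending`
+ *    so that restore() can roll it back if the batch is aborted
+ */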
+ LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, size); + return false; + } + if (j > 0) { + llama_kv_cell & seq = cells[seq_id]; + if (seq.tail >= 0) { + llama_kv_cell & cell = cells[seq.tail]; + // clear cells from seq_ids that become shared + // (should not normally happen, but let's handle it anyway) + cell.seq_id.erase(seq_id); + seq.tail = -1; + if (cell.seq_id.empty()) { + cell.pos = -1; + cell.src = -1; + used -= 1; + } + } + } + } + } + +#ifndef NDEBUG + { + std::vector tails_verif; + tails_verif.assign(size, -1); + for (uint32_t i = 0; i < size; ++i) { + llama_kv_cell & cell = cells[i]; + for (llama_seq_id seq_id : cell.seq_id) { + if (tails_verif[seq_id] != -1) { + LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tails_verif[seq_id]); + } + tails_verif[seq_id] = i; + } + } + for (uint32_t i = 0; i < size; ++i) { + if (tails_verif[i] != cells[i].tail) { + LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d, (%d instead of %d)\n", __func__, i, cells[i].tail, tails_verif[i]); + } + } + } +#endif + + // find next empty cell + uint32_t next_empty_cell = head; + + for (uint32_t i = 0; i < size; ++i) { + if (next_empty_cell >= size) { next_empty_cell -= size; } + llama_kv_cell & cell = cells[next_empty_cell]; + if (cell.is_empty()) { break; } + next_empty_cell += 1; + } + + // find usable cell range + for (uint32_t s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch.seq_id[s][0]; + llama_kv_cell & seq_meta = cells[seq_id]; + bool has_cell = false; + if (seq_meta.tail >= 0) { + llama_kv_cell & cell = cells[seq_meta.tail]; + GGML_ASSERT(cell.has_seq_id(seq_id)); + // does this seq_id "own" the cell? + if (cell.seq_id.size() == 1) { has_cell = true; } + } + if (!has_cell) { + llama_kv_cell & empty_cell = cells[next_empty_cell]; + GGML_ASSERT(empty_cell.is_empty()); + // copy old tail into the empty cell + if (seq_meta.tail >= 0) { + llama_kv_cell & orig_cell = cells[seq_meta.tail]; + empty_cell.pos = orig_cell.pos; + empty_cell.src = orig_cell.src; + orig_cell.seq_id.erase(seq_id); + empty_cell.seq_id.insert(seq_id); // will be overwritten + } + seq_meta.tail = next_empty_cell; + // find next empty cell + if (s + 1 < n_seqs) { + next_empty_cell += 1; + for (uint32_t i = 0; i < size; ++i) { + if (next_empty_cell >= size) { next_empty_cell -= size; } + llama_kv_cell & cell = cells[next_empty_cell]; + if (cell.is_empty()) { break; } + next_empty_cell += 1; + } + } + } + if (min > seq_meta.tail) { min = seq_meta.tail; } + if (max < seq_meta.tail) { max = seq_meta.tail; } + } + + // gather and re-order + for (uint32_t s = 0; s < n_seqs; ++s) { + int32_t dst_id = s + min; + int32_t src_id = cells[ubatch.seq_id[s][0]].tail; + if (dst_id != src_id) { + llama_kv_cell & dst_cell = cells[dst_id]; + llama_kv_cell & src_cell = cells[src_id]; + + std::swap(dst_cell.pos, src_cell.pos); + std::swap(dst_cell.src, src_cell.src); + std::swap(dst_cell.seq_id, src_cell.seq_id); + + // swap tails (assuming they NEVER overlap) + for (const llama_seq_id seq_id : src_cell.seq_id) { + cells[seq_id].tail = src_id; + } + for (const llama_seq_id seq_id : dst_cell.seq_id) { + cells[seq_id].tail = dst_id; + } + } + } + + // update the pos of the used seqs + for (uint32_t s = 0; s < n_seqs; ++s) { + const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1]; + int32_t cell_id = s + min; + llama_kv_cell & cell = cells[cell_id]; + + if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) 
{ + // What should happen when the pos backtracks or skips a value? + // Clearing the state mid-batch would require special-casing which isn't done. + LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n", + __func__, last_pos, cell.pos, ubatch.seq_id[s][0], n_seq_tokens); + } + cell.pos = last_pos; + cell.seq_id.clear(); + for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) { + const llama_seq_id seq_id = ubatch.seq_id[s][j]; + cell.seq_id.insert(seq_id); + cells[seq_id].tail = cell_id; + } + } + + // allow getting the range of used cells, from head to head + n + head = min; + n = max - min + 1; + used = std::count_if(cells.begin(), cells.end(), + [](const llama_kv_cell& cell){ return !cell.is_empty(); }); + + // sanity check + return n >= n_seqs; + } + + // otherwise, one cell per token. + + if (n_tokens > size) { + LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %d\n", __func__, n_tokens, size); + return false; + } + + uint32_t n_tested = 0; + + while (true) { + if (head + n_tokens > size) { + n_tested += size - head; + head = 0; + continue; + } + + bool found = true; + for (uint32_t i = 0; i < n_tokens; i++) { + if (cells[head + i].pos >= 0) { + found = false; + head += i + 1; + n_tested += i + 1; + break; + } + } + + if (found) { + break; + } + + if (n_tested >= size) { + //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens); + return false; + } + } + + for (uint32_t s = 0; s < n_seqs; s++) { + for (uint32_t i = 0; i < n_seq_tokens; ++i) { + uint32_t k = s*n_seq_tokens + i; + cells[head + k].pos = ubatch.pos[k]; + + for (int32_t j = 0; j < ubatch.n_seq_id[s]; j++) { + cells[head + k].seq_id.insert(ubatch.seq_id[s][j]); + } + } + } + + used += n_tokens; + + pending.ranges.push_back({head, head + n_tokens}); + + return true; +} + +uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) const { + // the FA kernels require padding to avoid extra runtime boundary checks + return cparams.flash_attn ? 
256u : 32u; +} + +uint32_t llama_kv_cache_unified::cell_max() const { + for (uint32_t i = size; i > 0; --i) { + const llama_kv_cell & cell = cells[i - 1]; + + if (cell.pos >= 0 && !cell.is_empty()) { + return i; + } + } + + return 0; +} + +size_t llama_kv_cache_unified::size_k_bytes() const { + size_t size_k_bytes = 0; + + for (const auto & k : k_l) { + size_k_bytes += ggml_nbytes(k); + } + + return size_k_bytes; +} + +size_t llama_kv_cache_unified::size_v_bytes() const { + size_t size_v_bytes = 0; + + for (const auto & v : v_l) { + size_v_bytes += ggml_nbytes(v); + } + + return size_v_bytes; +} + +bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) { + const uint32_t n_layer = hparams.n_layer; + + const uint32_t n_kv = cell_max(); + const uint32_t n_used = used; + + assert(n_used <= n_kv); + + //const int64_t t_start = ggml_time_us(); + + // number of cells moved + uint32_t n_moves = 0; + + // each move requires 6*n_layer tensors (see graph_build_kv_self_defrag) + // - source view, destination view, copy operation + // - x2 for keys and values + //const uint32_t max_moves = max_nodes()/(6*n_layer); + // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516 + const uint32_t max_moves = (n_max_nodes - 2*n_layer)/(6*n_layer); + + // determine which KV cells to move where + // + // cell i moves to ids[i] + // + // if ids[i] == i || ids[i] == n_kv, then cell i is not moved + // + auto & ids = defrag_info.ids; + + ids.clear(); + ids.resize(n_kv, n_kv); + + for (uint32_t i0 = 0; i0 < n_used; ++i0) { + const auto & cell0 = cells[i0]; + + if (!cell0.is_empty()) { + ids[i0] = i0; + + continue; + } + + // found a hole - fill it with data from the end of the cache + + uint32_t nh = 1; + + // determine the size of the hole + while (i0 + nh < n_used && cells[i0 + nh].is_empty()) { + nh++; + } + + uint32_t nf = 0; + uint32_t is = n_kv - 1; + + // starting from the end, find nh non-empty cells + for (; is > i0; --is) { + const auto & cell1 = cells[is]; + + if (cell1.is_empty() || ids[is] != n_kv) { + continue; + } + + // non-empty cell which is not yet moved + nf++; + + if (nf == nh) { + break; + } + } + + // this can only happen if `n_used` is not accurate, which would be a bug + GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh"); + + nf = 0; + + uint32_t i1 = is; + + // are we moving a continuous block of memory? + bool cont = false; + + // should we stop searching for the next move? 
+        bool stop = false;
+
+        // go back and move the nf cells to the hole
+        for (; i1 < n_kv; ++i1) {
+            auto & cell1 = cells[i1];
+
+            if (cell1.is_empty() || ids[i1] != n_kv) {
+                if (n_moves == max_moves) {
+                    stop = true;
+                    break;
+                }
+
+                cont = false;
+                continue;
+            }
+
+            // this cell goes to (i0 + nf)
+            ids[i1] = i0 + nf;
+
+            // move the cell meta data
+            cells[i0 + nf] = cell1;
+
+            // clear the old cell and move the head there
+            cell1 = llama_kv_cell();
+            head = n_used;
+
+            if (!cont) {
+                n_moves++;
+                cont = true;
+            }
+
+            nf++;
+
+            if (nf == nh) {
+                break;
+            }
+        }
+
+        if (stop || n_moves == max_moves) {
+            break;
+        }
+
+        //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
+
+        i0 += nh - 1;
+    }
+
+    if (n_moves == 0) {
+        return false;
+    }
+
+    LLAMA_LOG_DEBUG("(tmp log) KV defrag cell moves: %u\n", n_moves);
+
+    LLAMA_LOG_DEBUG("expected gf nodes: %u\n", 6*n_moves*n_layer);
+
+    return true;
+}
+
+void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
+    std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
+    uint32_t cell_count = 0;
+
+    // Count the number of cells with the specified seq_id
+    // Find all the ranges of cells with this seq id (or all, when -1)
+    uint32_t cell_range_begin = size;
+    for (uint32_t i = 0; i < size; ++i) {
+        const auto & cell = cells[i];
+        if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) {
+            ++cell_count;
+            if (cell_range_begin == size) {
+                cell_range_begin = i;
+            }
+        } else {
+            if (cell_range_begin != size) {
+                cell_ranges.emplace_back(cell_range_begin, i);
+                cell_range_begin = size;
+            }
+        }
+    }
+    if (cell_range_begin != size) {
+        cell_ranges.emplace_back(cell_range_begin, size);
+    }
+
+    // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
+    uint32_t cell_count_check = 0;
+    for (const auto & range : cell_ranges) {
+        cell_count_check += range.second - range.first;
+    }
+    GGML_ASSERT(cell_count == cell_count_check);
+
+    io.write(&cell_count, sizeof(cell_count));
+
+    state_write_meta(io, cell_ranges, seq_id);
+    state_write_data(io, cell_ranges);
+}
+
+void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
+    uint32_t cell_count;
+    io.read_to(&cell_count, sizeof(cell_count));
+
+    bool res = true;
+    res = res && state_read_meta(io, cell_count, seq_id);
+    res = res && state_read_data(io, cell_count);
+
+    if (!res) {
+        if (seq_id == -1) {
+            clear();
+        } else {
+            seq_rm(seq_id, -1, -1);
+        }
+        throw std::runtime_error("failed to restore kv cache");
+    }
+}
+
+void llama_kv_cache_unified::state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id) const {
+    for (const auto & range : cell_ranges) {
+        for (uint32_t i = range.first; i < range.second; ++i) {
+            const auto & cell = cells[i];
+            const llama_pos pos      = cell.pos;
+            const uint32_t  n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0;
+
+            io.write(&pos,      sizeof(pos));
+            io.write(&n_seq_id, sizeof(n_seq_id));
+
+            if (n_seq_id) {
+                for (auto seq_id : cell.seq_id) {
+                    io.write(&seq_id, sizeof(seq_id));
+                }
+            }
+        }
+    }
+}
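+
+/*
+ * Serialized layout (state_write_data and state_read_data must stay in sync):
+ * v_trans flag, n_layer, then per layer: K type + row size + one K row per
+ * cell in each range; V follows the same row layout when not transposed,
+ * otherwise it is stored column-wise (type, element size, n_embd_v_gqa,
+ * then the per-range elements of each row).
+ */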
+void llama_kv_cache_unified::state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const {
+    const uint32_t v_trans = this->v_trans ? 1 : 0;
+    const uint32_t n_layer = hparams.n_layer;
+
+    io.write(&v_trans, sizeof(v_trans));
+    io.write(&n_layer, sizeof(n_layer));
+
+    std::vector<uint8_t> tmp_buf;
+
+    // Iterate and write all the keys first, each row is a cell
+    // Get whole range at a time
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
+
+        // Write key type
+        const int32_t k_type_i = (int32_t)k_l[il]->type;
+        io.write(&k_type_i, sizeof(k_type_i));
+
+        // Write row size of key
+        const uint64_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
+        io.write(&k_size_row, sizeof(k_size_row));
+
+        // Read each range of cells of k_size length each into tmp_buf and write out
+        for (const auto & range : cell_ranges) {
+            const size_t range_size = range.second - range.first;
+            const size_t buf_size = range_size * k_size_row;
+            io.write_tensor(k_l[il], range.first * k_size_row, buf_size);
+        }
+    }
+
+    if (!v_trans) {
+        for (uint32_t il = 0; il < n_layer; ++il) {
+            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+
+            // Write value type
+            const int32_t v_type_i = (int32_t)v_l[il]->type;
+            io.write(&v_type_i, sizeof(v_type_i));
+
+            // Write row size of value
+            const uint64_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa);
+            io.write(&v_size_row, sizeof(v_size_row));
+
+            // Read each range of cells of v_size length each into tmp_buf and write out
+            for (const auto & range : cell_ranges) {
+                const size_t range_size = range.second - range.first;
+                const size_t buf_size = range_size * v_size_row;
+                io.write_tensor(v_l[il], range.first * v_size_row, buf_size);
+            }
+        }
+    } else {
+        // When v is transposed, we also need the element size and get the element ranges from each row
+        const uint32_t kv_size = size;
+        for (uint32_t il = 0; il < n_layer; ++il) {
+            const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
+
+            // Write value type
+            const int32_t v_type_i = (int32_t)v_l[il]->type;
+            io.write(&v_type_i, sizeof(v_type_i));
+
+            // Write element size
+            const uint32_t v_size_el = ggml_type_size(v_l[il]->type);
+            io.write(&v_size_el, sizeof(v_size_el));
+
+            // Write GQA embedding size
+            io.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));
+
+            // For each row, we get the element values of each cell
+            for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+                // Read each range of cells of v_size_el length each into tmp_buf and write out
+                for (const auto & range : cell_ranges) {
+                    const size_t range_size = range.second - range.first;
+                    const size_t src_offset = (range.first + j * kv_size) * v_size_el;
+                    const size_t buf_size = range_size * v_size_el;
+                    io.write_tensor(v_l[il], src_offset, buf_size);
+                }
+            }
+        }
+    }
+}
+
+bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) {
+    if (dest_seq_id != -1) {
+        // single sequence
+
+        seq_rm(dest_seq_id, -1, -1);
+
+        llama_sbatch sbatch;
+        llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false);
+
+        batch.n_tokens = cell_count;
+        batch.n_seq_tokens = cell_count;
+        batch.n_seqs = 1;
+
+        for (uint32_t i = 0; i < cell_count; ++i) {
+            llama_pos pos;
+            uint32_t n_seq_id;
+
+            io.read_to(&pos,      sizeof(pos));
+            io.read_to(&n_seq_id, sizeof(n_seq_id));
+
+            if (n_seq_id != 0) {
+                LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__);
+                return false;
+            }
+
+            batch.pos[i] = pos;
+        }
+        batch.n_seq_id[0] = 1;
+        batch.seq_id[0] = &dest_seq_id;
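+        /*
+         * Restore into one contiguous slot: find_slot() places the whole
+         * cell range at `head`, and commit() accepts the placement by
+         * clearing the pending rollback ranges.
+         */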
+        if (!find_slot(batch)) {
+            LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
+            return false;
+        }
+        commit();
+
+        // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
+        // Assume that this is one contiguous block of cells
+        GGML_ASSERT(head + cell_count <= size);
+        GGML_ASSERT(cells[head].pos == batch.pos[0]);
+        GGML_ASSERT(cells[head + cell_count - 1].pos == batch.pos[cell_count - 1]);
+        GGML_ASSERT(cells[head].has_seq_id(dest_seq_id));
+        GGML_ASSERT(cells[head + cell_count - 1].has_seq_id(dest_seq_id));
+    } else {
+        // whole KV cache restore
+
+        if (cell_count > size) {
+            LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__);
+            return false;
+        }
+
+        clear();
+
+        for (uint32_t i = 0; i < cell_count; ++i) {
+            llama_kv_cell & cell = cells[i];
+
+            llama_pos pos;
+            uint32_t  n_seq_id;
+
+            io.read_to(&pos,      sizeof(pos));
+            io.read_to(&n_seq_id, sizeof(n_seq_id));
+
+            cell.pos = pos;
+
+            for (uint32_t j = 0; j < n_seq_id; ++j) {
+                llama_seq_id seq_id;
+                io.read_to(&seq_id, sizeof(seq_id));
+
+                // TODO: llama_kv_cache_unified should have a notion of max sequences
+                //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) {
+                if (seq_id < 0) {
+                    //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx));
+                    LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id);
+                    return false;
+                }
+
+                cell.seq_id.insert(seq_id);
+
+                if (recurrent) {
+                    int32_t & tail = cells[seq_id].tail;
+                    if (tail != -1) {
+                        LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail);
+                        return false;
+                    }
+                    tail = i;
+                }
+            }
+        }
+
+        head = 0;
+        used = cell_count;
+    }
+
+    if (recurrent) {
+        for (uint32_t i = 0; i < cell_count; ++i) {
+            uint32_t cell_id = head + i;
+            // make sure the recurrent states will keep their restored state
+            cells[cell_id].src = cell_id;
+        }
+    }
+
+    return true;
+}
+
+bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell_count) {
+    uint32_t v_trans;
+    uint32_t n_layer;
+    io.read_to(&v_trans, sizeof(v_trans));
+    io.read_to(&n_layer, sizeof(n_layer));
+
+    if (n_layer != hparams.n_layer) {
+        LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer);
+        return false;
+    }
+    if (cell_count > size) {
+        LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, size);
+        return false;
+    }
+    if (this->v_trans != (bool) v_trans) {
+        LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__);
+        return false;
+    }
+
+    // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
+
+        // Read type of key
+        int32_t k_type_i_ref;
+        io.read_to(&k_type_i_ref, sizeof(k_type_i_ref));
+        const int32_t k_type_i = (int32_t) k_l[il]->type;
+        if (k_type_i != k_type_i_ref) {
+            LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
+            return false;
+        }
+
+        // Read row size of key
+        uint64_t k_size_row_ref;
+        io.read_to(&k_size_row_ref, sizeof(k_size_row_ref));
+        const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
+        if (k_size_row != k_size_row_ref) {
+            LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il);
+            return false;
+        }
+
+        if (cell_count) {
+            // Read
and set the keys for the whole cell range + ggml_backend_tensor_set(k_l[il], io.read(cell_count * k_size_row), head * k_size_row, cell_count * k_size_row); + } + } + + if (!v_trans) { + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Read type of value + int32_t v_type_i_ref; + io.read_to(&v_type_i_ref, sizeof(v_type_i_ref)); + const int32_t v_type_i = (int32_t)v_l[il]->type; + if (v_type_i != v_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); + return false; + } + + // Read row size of value + uint64_t v_size_row_ref; + io.read_to(&v_size_row_ref, sizeof(v_size_row_ref)); + const size_t v_size_row = ggml_row_size(v_l[il]->type, n_embd_v_gqa); + if (v_size_row != v_size_row_ref) { + LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il); + return false; + } + + if (cell_count) { + // Read and set the values for the whole cell range + ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_row), head * v_size_row, cell_count * v_size_row); + } + } + } else { + // For each layer, read the values for each cell (transposed) + for (uint32_t il = 0; il < n_layer; ++il) { + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s(); + + // Read type of value + int32_t v_type_i_ref; + io.read_to(&v_type_i_ref, sizeof(v_type_i_ref)); + const int32_t v_type_i = (int32_t)v_l[il]->type; + if (v_type_i != v_type_i_ref) { + LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); + return false; + } + + // Read element size of value + uint32_t v_size_el_ref; + io.read_to(&v_size_el_ref, sizeof(v_size_el_ref)); + const size_t v_size_el = ggml_type_size(v_l[il]->type); + if (v_size_el != v_size_el_ref) { + LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il); + return false; + } + + // Read GQA embedding size + uint32_t n_embd_v_gqa_ref; + io.read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref)); + if (n_embd_v_gqa != n_embd_v_gqa_ref) { + LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il); + return false; + } + + if (cell_count) { + // For each row in the transposed matrix, read the values for the whole cell range + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + const size_t dst_offset = (head + j * size) * v_size_el; + ggml_backend_tensor_set(v_l[il], io.read(cell_count * v_size_el), dst_offset, cell_count * v_size_el); + } + } + } + } + + return true; +} + +// +// kv cache view +// + +llama_kv_cache_view llama_kv_cache_view_init(const llama_kv_cache & kv, int32_t n_seq_max) { + llama_kv_cache_view result = { + /*.n_cells = */ 0, + /*.n_seq_max = */ n_seq_max, + /*.token_count = */ 0, + /*.used_cells = */ kv.get_used_cells(), + /*.max_contiguous = */ 0, + /*.max_contiguous_idx = */ -1, + /*.cells = */ nullptr, + /*.cells_sequences = */ nullptr, + }; + + return result; +} + +void llama_kv_cache_view_free(llama_kv_cache_view * view) { + if (view->cells != nullptr) { + free(view->cells); + view->cells = nullptr; + } + if (view->cells_sequences != nullptr) { + free(view->cells_sequences); + view->cells_sequences = nullptr; + } +} + +void llama_kv_cache_view_update(llama_kv_cache_view * view, const llama_kv_cache * kv) { + // TODO: rework this in the future, for now 
quick hack
+    const llama_kv_cache_unified * kvu = dynamic_cast<const llama_kv_cache_unified *>(kv);
+    if (kvu == nullptr) {
+        LLAMA_LOG_ERROR("%s: the kv_cache_view currently works only with llama_kv_cache_unified\n", __func__);
+        return;
+    }
+
+    if (uint32_t(view->n_cells) < kvu->size || view->cells == nullptr) {
+        view->n_cells = int32_t(kvu->size);
+        void * p = realloc(view->cells, sizeof(llama_kv_cache_view_cell) * view->n_cells);
+        GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
+        view->cells = (llama_kv_cache_view_cell *)p;
+        p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_seq_max * view->n_cells);
+        GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
+        view->cells_sequences = (llama_seq_id *)p;
+    }
+
+    const std::vector<llama_kv_cell> & kv_cells = kvu->cells;
+    llama_kv_cache_view_cell * c_curr = view->cells;
+    llama_seq_id * cs_curr = view->cells_sequences;
+    int32_t used_cells = 0;
+    int32_t token_count = 0;
+    int32_t curr_contig_idx = -1;
+    uint32_t max_contig = 0;
+    int32_t max_contig_idx = -1;
+
+    for (int32_t i = 0; i < int32_t(kvu->size); i++, c_curr++, cs_curr += view->n_seq_max) {
+        const size_t curr_size = kv_cells[i].seq_id.size();
+        token_count += curr_size;
+        c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
+
+        if (curr_size > 0) {
+            if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) {
+                max_contig = i - curr_contig_idx;
+                max_contig_idx = curr_contig_idx;
+            }
+            curr_contig_idx = -1;
+        } else if (curr_contig_idx < 0) {
+            curr_contig_idx = i;
+        }
+
+        int seq_idx = 0;
+        for (const llama_seq_id it : kv_cells[i].seq_id) {
+            if (seq_idx >= view->n_seq_max) {
+                break;
+            }
+            cs_curr[seq_idx] = it;
+            seq_idx++;
+        }
+        if (seq_idx != 0) {
+            used_cells++;
+        }
+        for (; seq_idx < view->n_seq_max; seq_idx++) {
+            cs_curr[seq_idx] = -1;
+        }
+    }
+    if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
+        max_contig_idx = curr_contig_idx;
+        max_contig = kv_cells.size() - curr_contig_idx;
+    }
+    view->max_contiguous = max_contig;
+    view->max_contiguous_idx = max_contig_idx;
+    view->token_count = token_count;
+    view->used_cells = used_cells;
+    if (uint32_t(used_cells) != kvu->used) {
+        LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
+            __func__, kvu->used, used_cells);
+    }
+}
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index bd9e6da8832b7..2dabef1c9b0c3 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -676,7 +676,7 @@ llama_model_loader::llama_model_loader(
         }
     }
 
-    LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+    //LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
 
     for (int i = 0; i < n_kv; i++) {
         const char * name           = gguf_get_key(meta.get(), i);
@@ -693,7 +693,7 @@ llama_model_loader::llama_model_loader(
         }
         replace_all(value, "\n", "\\n");
 
-        LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
+        //LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
     }
 
     // print type counts
@@ -702,7 +702,7 @@ llama_model_loader::llama_model_loader(
             continue;
         }
 
-        LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
+        //LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
     }
 }
@@ -1153,6 +1153,7 @@ std::string llama_model_loader::ftype_name() const {
 }
 
 void llama_model_loader::print_info() const {
+    return;
     LLAMA_LOG_INFO("%s: file format = %s\n", __func__, llama_file_version_name(fver));
     LLAMA_LOG_INFO("%s: file type   = %s\n", __func__, llama_model_ftype_name(ftype).c_str());
     if (n_bytes < GiB) {
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index a5853f8b12dc0..14f408e7ee8f2 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1560,12 +1560,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
         const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il);
         if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
-            LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
+            //LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
             return {cpu_dev, &pimpl->cpu_buft_list};
         }
         const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
         auto * dev = devices.at(layer_gpu);
-        LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa);
+        //LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa);
         return {dev, &pimpl->gpu_buft_list.at(dev)};
     };
 
@@ -4445,6 +4445,7 @@ uint64_t llama_model::n_elements() const {
 }
 
 void llama_model::print_info() const {
+    return;
     const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);
 
     auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index d90f1d6b1ea63..dc53b918e12c4 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1998,8 +1998,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             } else {
                 // token is control, but not marked as EOG -> print a debug log
                 if (id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && special_eog_ids.count(t.second) == 0) {
-                    LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
-                            __func__, t.second, t.first.c_str());
+                    //LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
+                    //        __func__, t.second, t.first.c_str());
                 }
             }
         }
@@ -2768,6 +2768,7 @@ int32_t llama_vocab::impl::detokenize(
 }
 
 void llama_vocab::impl::print_info() const {
+    return;
     LLAMA_LOG_INFO("%s: vocab type       = %s\n", __func__, type_name().c_str());
     LLAMA_LOG_INFO("%s: n_vocab          = %u\n", __func__, vocab.n_tokens());
     LLAMA_LOG_INFO("%s: n_merges         = %u\n", __func__, (uint32_t) bpe_ranks.size());
@@ -3114,6 +3115,7 @@ std::string llama_vocab::detokenize(const std::vector<llama_token> & tokens, boo
 }
 
 void llama_vocab::print_info() const {
+    return;
     pimpl->print_info();
 }
 
diff --git a/tools/run/run.cpp b/tools/run/run.cpp
index c65afd61e023c..e901eea61ba60 100644
--- a/tools/run/run.cpp
+++ b/tools/run/run.cpp
@@ -984,6 +984,34 @@ static void print_word_and_concatenate_to_response(const std::string & piece, st
     response += piece;
 }
 
+static long long timer_start = 0;
+static long long timer_total = 0;
+static long long timer_count = 0;
+
+static inline void start_timer(void) {
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts); // CLOCK_MONOTONIC: elapsed time, unaffected by wall-clock jumps
+    timer_start = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
+}
+
+static inline void stop_timer(void) {
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts); // CLOCK_MONOTONIC: elapsed time, unaffected by wall-clock jumps
+    long long timer_end = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
+
+    timer_total += (timer_end - timer_start);
+    timer_count += 1;
+}
+
+static void show_timer(void) {
+    double ms = timer_total/1000000.0; // ns -> ms, floating-point division to keep sub-ms precision
+    double itl = ms/timer_count;
+    double speed = 1/itl * 1000;
+
+    printe("LLAMA generate [%9.0f] ms for %4lld invocations | ITL %2.2f ms | throughput = %4.2f t/s\n", ms, timer_count, itl, speed);
+}
+
+
 // helper function to evaluate a prompt and generate a response
 static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) {
     const llama_vocab * vocab = llama_model_get_vocab(llama_data.model.get());
@@ -993,10 +1021,15 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
         return 1;
     }
 
+    int cr = atexit(show_timer);
+    assert(cr == 0);
+
     // prepare a batch for the prompt
     llama_batch batch = llama_batch_get_one(tokens.data(), tokens.size());
     llama_token new_token_id;
+
     while (true) {
+        start_timer();
        check_context_size(llama_data.context, batch);
        if (llama_decode(llama_data.context.get(), batch)) {
            printe("failed to decode\n");
@@ -1018,6 +1051,7 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
 
         // prepare the next batch with the sampled token
         batch = llama_batch_get_one(&new_token_id, 1);
+        stop_timer();
     }
 
     printf(LOG_COL_DEFAULT);