From 74eaeb89e515df5919392373273be1e24db90abb Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 9 Apr 2025 13:25:54 +0200 Subject: [PATCH 001/117] src: reduce the logging --- src/llama-kv-cache.cpp | 2 ++ src/llama-model-loader.cpp | 2 +- src/llama-vocab.cpp | 4 ++-- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index dbf5f1187d9e5..04d593ce21477 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -86,8 +86,10 @@ bool llama_kv_cache_unified::init( buft = ggml_backend_cpu_buffer_type(); } + /* LLAMA_LOG_DEBUG("%s: layer %3d: n_embd_k_gqa = %d, n_embd_v_gqa = %d, dev = %s\n", __func__, i, n_embd_k_gqa, n_embd_v_gqa, dev_name); + */ ggml_context * ctx = ctx_for_buft(buft); if (!ctx) { diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index ea73a8a7ba944..36f8d1cbf0323 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -668,7 +668,7 @@ llama_model_loader::llama_model_loader( } replace_all(value, "\n", "\\n"); - LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); + //LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); } // print type counts diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 0feabd95aaf2b..a9c24e78812ac 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1974,8 +1974,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { } else { // token is control, but not marked as EOG -> print a debug log if (id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && special_eog_ids.count(t.second) == 0) { - LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n", - __func__, t.second, t.first.c_str()); + //LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n", + // __func__, t.second, t.first.c_str()); } } } From 41846f348b920d825a93a8e00316165b7c07d685 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 9 Apr 2025 13:26:56 +0200 Subject: [PATCH 002/117] Add helper scripts --- prepare.remoting.sh | 6 ++++++ prepare.sh | 1 + prepare.vulkan.sh | 1 + run.remoting.sh | 16 ++++++++++++++++ run.sh | 1 + run.vulkan.sh | 1 + 6 files changed, 26 insertions(+) create mode 100755 prepare.remoting.sh create mode 100644 prepare.sh create mode 100644 prepare.vulkan.sh create mode 100755 run.remoting.sh create mode 100755 run.sh create mode 100755 run.vulkan.sh diff --git a/prepare.remoting.sh b/prepare.remoting.sh new file mode 100755 index 0000000000000..aebb75c031422 --- /dev/null +++ b/prepare.remoting.sh @@ -0,0 +1,6 @@ +cmake -S . -B ../build.remoting-frontend \ + -DGGML_REMOTINGFRONTEND=ON \ + -DGGML_CPU_ARM_ARCH=native \ + -DGGML_NATIVE=OFF \ + -DCMAKE_BUILD_TYPE=Debug \ + "$@" diff --git a/prepare.sh b/prepare.sh new file mode 100644 index 0000000000000..2fb46cefd426c --- /dev/null +++ b/prepare.sh @@ -0,0 +1 @@ +cmake -S . -B ./build -DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DGGML_METAL=OFF #-DCMAKE_BUILD_TYPE=Debug #-DGGML_VULKAN_DEBUG=1 diff --git a/prepare.vulkan.sh b/prepare.vulkan.sh new file mode 100644 index 0000000000000..29d0794ebe4e3 --- /dev/null +++ b/prepare.vulkan.sh @@ -0,0 +1 @@ +cmake -S . -B ../build.vulkan -DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DGGML_METAL=OFF diff --git a/run.remoting.sh b/run.remoting.sh new file mode 100755 index 0000000000000..c6fbdaac435a5 --- /dev/null +++ b/run.remoting.sh @@ -0,0 +1,16 @@ +#! 
/bin/bash + +if [[ ${1:-} == "gdb" ]]; then + prefix="gdb --args" +else + prefix="" +fi + +MODEL="$HOME/models/llama3.2" +PROMPT="say nothing" +$prefix \ + ../build.remoting-frontend/bin/llama-run \ + --ngl 99 \ + --verbose \ + "$MODEL" \ + "$PROMPT" diff --git a/run.sh b/run.sh new file mode 100755 index 0000000000000..13d8c042515f0 --- /dev/null +++ b/run.sh @@ -0,0 +1 @@ +./build/bin/llama-run --ngl 999 --verbose ~/models/llama3.2 "say nothing" diff --git a/run.vulkan.sh b/run.vulkan.sh new file mode 100755 index 0000000000000..7f44334290bbf --- /dev/null +++ b/run.vulkan.sh @@ -0,0 +1 @@ +../build.vulkan/bin/llama-run --ngl 99 --verbose ~/models/llama3.2 "say nothing" From ee79e12d0a40ef5f5a7e714248746f4c42ea475b Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 9 Apr 2025 13:26:46 +0200 Subject: [PATCH 003/117] build-system: integrate the Remoting Frontend backend build --- CMakePresets.json | 1 + Makefile | 8 ++++++++ ggml/CMakeLists.txt | 2 ++ ggml/src/CMakeLists.txt | 1 + 4 files changed, 12 insertions(+) diff --git a/CMakePresets.json b/CMakePresets.json index 13bdd7907ab40..c5369a47f6bf9 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -30,6 +30,7 @@ { "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } }, { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } }, { "name": "vulkan", "hidden": true, "cacheVariables": { "GGML_VULKAN": "ON" } }, + { "name": "remoting_frontend", "hidden": true, "cacheVariables": { "GGML_REMOTING_FRONTEND": "ON" } }, { "name": "x64-windows-llvm", "hidden": true, diff --git a/Makefile b/Makefile index 1f9455eff0aec..ebf9f79ed5598 100644 --- a/Makefile +++ b/Makefile @@ -716,6 +716,11 @@ ggml/src/ggml-cuda/ggml-cuda.o: \ $(NVCC_COMPILE) endif # GGML_CUDA +ifdef GGML_REMOTING_FRONTEND + MK_CPPFLAGS += -DGGML_USE_REMOTINGFRONTEND + OBJ_GGML_EXT += ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.o +endif + ifdef GGML_VULKAN MK_CPPFLAGS += -DGGML_USE_VULKAN MK_LDFLAGS += $(shell pkg-config --libs vulkan) @@ -755,6 +760,9 @@ _ggml_vk_shader_deps = $(echo $(_ggml_vk_input_dir)/*.comp) ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source) $(CXX) $(CXXFLAGS) $(shell pkg-config --cflags vulkan) -c $< -o $@ +ggml/src/ggml-remotingfrontend/frontend.o: ggml/src/ggml-remotingfrontend/frontend.cpp + $(CXX) $(CXXFLAGS) -c $< -o $@ + $(_ggml_vk_header): $(_ggml_vk_source) $(_ggml_vk_source): $(_ggml_vk_shader_deps) vulkan-shaders-gen diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index d33f843b417cf..24c47aea122a2 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -179,6 +179,7 @@ option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug in option(GGML_VULKAN_PERF "ggml: enable Vulkan perf output" OFF) option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF) option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF) +option(GGML_REMOTING_FRONTEND "ggml: use the API Remoting frontend" OFF) option(GGML_KOMPUTE "ggml: use Kompute" OFF) option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT}) option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF) @@ -269,6 +270,7 @@ set(GGML_PUBLIC_HEADERS include/ggml-rpc.h include/ggml-sycl.h include/ggml-vulkan.h + include/ggml-remoting-frontend.h include/gguf.h) set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index f00700da71fcd..76c3f3d27fc16 100644 
--- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -309,6 +309,7 @@ ggml_add_backend(MUSA) ggml_add_backend(RPC) ggml_add_backend(SYCL) ggml_add_backend(Vulkan) +ggml_add_backend(RemotingFrontend) ggml_add_backend(OpenCL) foreach (target ggml-base ggml) From 1bedad3cc38c5481b20a280f14c56565cdef677d Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 9 Apr 2025 13:27:51 +0200 Subject: [PATCH 004/117] ggml: ggml-remotingfrontend: stubs of a new backend --- ggml/include/ggml-remoting-frontend.h | 16 + ggml/src/ggml-backend-reg.cpp | 8 + ggml/src/ggml-remotingfrontend/CMakeLists.txt | 20 + .../ggml-remoting-frontend.cpp | 499 ++++++++++++++++++ 4 files changed, 543 insertions(+) create mode 100644 ggml/include/ggml-remoting-frontend.h create mode 100644 ggml/src/ggml-remotingfrontend/CMakeLists.txt create mode 100644 ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.cpp diff --git a/ggml/include/ggml-remoting-frontend.h b/ggml/include/ggml-remoting-frontend.h new file mode 100644 index 0000000000000..c32c283820dea --- /dev/null +++ b/ggml/include/ggml-remoting-frontend.h @@ -0,0 +1,16 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define GGML_REMOTING_NAME "RemotingFrontend" + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_remoting_reg(); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 405d8e31514b5..8ed3c36362bcd 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -45,6 +45,10 @@ #include "ggml-vulkan.h" #endif +#ifdef GGML_USE_REMOTINGFRONTEND +#include "ggml-remoting-frontend.h" +#endif + #ifdef GGML_USE_OPENCL #include "ggml-opencl.h" #endif @@ -172,6 +176,9 @@ struct ggml_backend_registry { #ifdef GGML_USE_VULKAN register_backend(ggml_backend_vk_reg()); #endif +#ifdef GGML_USE_REMOTINGFRONTEND + register_backend(ggml_backend_remoting_reg()); +#endif #ifdef GGML_USE_OPENCL register_backend(ggml_backend_opencl_reg()); #endif @@ -575,6 +582,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) { ggml_backend_load_best("rpc", silent, dir_path); ggml_backend_load_best("sycl", silent, dir_path); ggml_backend_load_best("vulkan", silent, dir_path); + ggml_backend_load_best("remoting_frontend", silent, dir_path); ggml_backend_load_best("opencl", silent, dir_path); ggml_backend_load_best("musa", silent, dir_path); ggml_backend_load_best("cpu", silent, dir_path); diff --git a/ggml/src/ggml-remotingfrontend/CMakeLists.txt b/ggml/src/ggml-remotingfrontend/CMakeLists.txt new file mode 100644 index 0000000000000..4ab2aaa0ac340 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/CMakeLists.txt @@ -0,0 +1,20 @@ +cmake_minimum_required(VERSION 3.19) +cmake_policy(SET CMP0114 NEW) + +# function(detect_host_compiler) +# find_program(HOST_C_COMPILER NAMES gcc clang NO_CMAKE_FIND_ROOT_PATH) +# find_program(HOST_CXX_COMPILER NAMES g++ clang++ NO_CMAKE_FIND_ROOT_PATH) + +# set(HOST_C_COMPILER "${HOST_C_COMPILER}" PARENT_SCOPE) +# set(HOST_CXX_COMPILER "${HOST_CXX_COMPILER}" PARENT_SCOPE) +# endfunction() + +message(STATUS "Enable API Remoting frontend found") + +ggml_add_backend_library(ggml-remotingfrontend + ggml-remoting-frontend.cpp + ../../include/ggml-remoting-frontend.h + ) + +#target_link_libraries(ggml-remotingfrontend PRIVATE remotingfrontend) +target_include_directories(ggml-remotingfrontend PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.cpp 
b/ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.cpp new file mode 100644 index 0000000000000..4c7c1f1dc8f95 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.cpp @@ -0,0 +1,499 @@ +#include "ggml-remoting-frontend.h" + +#include +#include +#include +#include +#include +#include + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" + +#define RMT_LOG_DEBUG(msg) std::cerr << msg << std::endl + +#define UNUSED GGML_UNUSED + +int ggml_backend_remoting_get_device_count(); +ggml_backend_buffer_type_t ggml_backend_remoting_host_buffer_type(); + +static void * const remoting_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT + + +struct ggml_backend_remoting_buffer_type_context { + std::string name; +}; + +struct remoting_context_struct { + int i; +}; +typedef std::shared_ptr remoting_context; +typedef std::weak_ptr remoting_context_ref; + +static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) { + UNUSED(reg); + return ggml_backend_remoting_get_device_count(); +} + +static const char * ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) { + UNUSED(reg); + return GGML_REMOTING_NAME; +} + +struct ggml_backend_remoting_device_context { + size_t device; + std::string name; + std::string description; +}; + +static const char * ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) { + UNUSED(dev); + return "API Remoting"; +} + +static const char * ggml_backend_remoting_device_get_description(ggml_backend_dev_t dev) { + UNUSED(dev); + return "API Remoting device"; +} + +static enum ggml_backend_dev_type ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) { + UNUSED(dev); + return GGML_BACKEND_DEVICE_TYPE_GPU; +} + +static void ggml_backend_remoting_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) { + UNUSED(device); + *total = 1024*1024*1024; + *free = *total; +} + +struct remoting_device_struct { + std::mutex mutex; +}; + +struct remoting_device_struct; +typedef std::shared_ptr remoting_device; +typedef std::weak_ptr remoting_device_ref; + +struct remoting_buffer_struct; +typedef std::shared_ptr remoting_buffer; +typedef std::weak_ptr remoting_buffer_ref; + +// vk buffer type +static const char * ggml_backend_remoting_buffer_type_name(ggml_backend_buffer_type_t buft) { + UNUSED(buft); + + return "Remoting buffer"; +} + +static void ggml_remoting_destroy_buffer(remoting_buffer& buf) { + UNUSED(buf); +} + + +static void ggml_remoting_buffer_memset(remoting_buffer& dst, size_t offset, uint32_t c, size_t size) { + UNUSED(dst); + UNUSED(c); + UNUSED(size); + UNUSED(offset); +} + +static void ggml_remoting_buffer_memset_async(remoting_context& ctx, remoting_buffer& dst, size_t offset, uint32_t c, size_t size) { + UNUSED(ctx); + UNUSED(dst); + UNUSED(c); + UNUSED(size); + UNUSED(offset); +} + + +static uint64_t remoting_tensor_offset(const ggml_tensor * tensor) { + if (tensor->view_src) { + return (uint8_t *) tensor->view_src->data - (uint8_t *) remoting_ptr_base; + } + return (uint8_t *) tensor->data - (uint8_t *) remoting_ptr_base; +} + +struct ggml_backend_remoting_buffer_context { + remoting_device_ref device; + remoting_buffer dev_buffer; + std::string name; + + ggml_backend_remoting_buffer_context(remoting_device_ref device, remoting_buffer&& dev_buffer, std::string& name) : + name(name) { + UNUSED(device); + UNUSED(dev_buffer); + } + + ~ggml_backend_remoting_buffer_context() { + ggml_remoting_destroy_buffer(dev_buffer); + } +}; + +static void 
ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_remoting_buffer_context * ctx = (ggml_backend_remoting_buffer_context *)buffer->context; + ggml_remoting_destroy_buffer(ctx->dev_buffer); + delete ctx; +} + +static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) { + return (void *) 4096; + + UNUSED(buffer); +} + +static enum ggml_status ggml_backend_remoting_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + if (tensor->view_src != nullptr) { + GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); + } + return GGML_STATUS_SUCCESS; +} + +static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { + UNUSED(buffer); + UNUSED(tensor); + UNUSED(value); + UNUSED(offset); + UNUSED(size); +} + +static void ggml_remoting_buffer_write(remoting_buffer& dst, size_t offset, const void * src, size_t size) { + UNUSED(dst); + UNUSED(offset); + UNUSED(src); + UNUSED(size); +} + +static void ggml_remoting_buffer_read(remoting_buffer& src, size_t offset, void * dst, size_t size) { + UNUSED(src); + UNUSED(offset); + UNUSED(dst); + UNUSED(size); +} + +static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +#if 0 + ggml_backend_remoting_buffer_context * buf_ctx = (ggml_backend_remoting_buffer_context *)buffer->context; + remoting_buffer buf = buf_ctx->dev_buffer; + + ggml_remoting_buffer_write(buf, remoting_tensor_offset(tensor) + tensor->view_offs + offset, data, size); +#else + UNUSED(buffer); + UNUSED(tensor); + UNUSED(data); + UNUSED(offset); + UNUSED(size); +#endif +} + +static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { +#if 0 + ggml_backend_remoting_buffer_context * buf_ctx = (ggml_backend_remoting_buffer_context *)buffer->context; + + remoting_buffer buf = buf_ctx->dev_buffer; + + ggml_remoting_buffer_read(buf, remoting_tensor_offset(tensor) + tensor->view_offs + offset, data, size); +#else + UNUSED(buffer); + UNUSED(tensor); + UNUSED(data); + UNUSED(offset); + UNUSED(size); +#endif +} + +static void ggml_remoting_buffer_copy_async(remoting_context& ctx, remoting_buffer& dst, size_t dst_offset, remoting_buffer& src, size_t src_offset, size_t size) { + UNUSED(ctx); + UNUSED(dst); + UNUSED(dst_offset); + UNUSED(src); + UNUSED(src_offset); + UNUSED(size); +} + +static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { + return true; + + UNUSED(buffer); + UNUSED(src); + UNUSED(dst); +} + +static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_remoting_buffer_context * ctx = (ggml_backend_remoting_buffer_context *)buffer->context; + + ggml_remoting_buffer_memset(ctx->dev_buffer, 0, value, buffer->size); +} + +static ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { + /* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer, + /* .get_base = */ ggml_backend_remoting_buffer_get_base, + /* .init_tensor = */ ggml_backend_remoting_buffer_init_tensor, + /* .memset_tensor = */ ggml_backend_remoting_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor, + /* .cpy_tensor = */ 
ggml_backend_remoting_buffer_cpy_tensor, + /* .clear = */ ggml_backend_remoting_buffer_clear, + /* .reset = */ NULL, +}; + +static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + ggml_backend_remoting_buffer_type_context * ctx = (ggml_backend_remoting_buffer_type_context *) buft->context; + + + return ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, ctx, size); +} + +static size_t ggml_backend_remoting_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + UNUSED(buft); + return 4096; +} + +static size_t ggml_backend_remoting_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + UNUSED(buft); + return 40960; +} + +static size_t ggml_backend_remoting_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { + UNUSED(buft); + UNUSED(tensor); + return ggml_nbytes(tensor); +} + +static ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = { + /* .get_name = */ ggml_backend_remoting_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_remoting_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_remoting_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_remoting_buffer_type_get_max_size, + /* .get_alloc_size = */ ggml_backend_remoting_buffer_type_get_alloc_size, + /* .is_host = */ NULL, +}; + +static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { + + static struct ggml_backend_buffer_type buft { + /* .iface = */ ggml_backend_remoting_buffer_type_interface, + /* .device = */ dev, + /* .context = */ new ggml_backend_remoting_buffer_type_context{ "device_name"}, + }; + + return & buft; +} + +static bool ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { + UNUSED(dev); + UNUSED(op); + + return true; +} + +static bool ggml_backend_remoting_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + UNUSED(dev); + UNUSED(buft); + return true; +} + + +static bool ggml_backend_remoting_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { + const int min_batch_size = 32; + + return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) || + (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID); + + UNUSED(dev); +} + +static const char * ggml_backend_remoting_name(ggml_backend_t backend) { + UNUSED(backend); + + return "API Remoting backend"; +} + +static void ggml_backend_remoting_free(ggml_backend_t backend) { + UNUSED(backend); +} + +static ggml_status ggml_backend_remoting_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + UNUSED(backend); + UNUSED(cgraph); + + return GGML_STATUS_SUCCESS; +} + +static ggml_backend_i ggml_backend_remoting_interface = { + /* .get_name = */ ggml_backend_remoting_name, + /* .free = */ ggml_backend_remoting_free, + /* .set_tensor_async = */ NULL, // ggml_backend_remoting_set_tensor_async, + /* .get_tensor_async = */ NULL, // ggml_backend_remoting_get_tensor_async, + /* .cpy_tensor_async = */ NULL, // ggml_backend_remoting_cpy_tensor_async, + /* .synchronize = */ NULL, // ggml_backend_remoting_synchronize, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_remoting_graph_compute, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, +}; + +static void ggml_backend_remoting_device_get_props(ggml_backend_dev_t 
dev, struct ggml_backend_dev_props * props) { + props->name = ggml_backend_remoting_device_get_name(dev); + props->description = ggml_backend_remoting_device_get_description(dev); + props->type = ggml_backend_remoting_device_get_type(dev); + ggml_backend_remoting_device_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* .async = */ false, + /* .host_buffer = */ true, + /* .buffer_from_host_ptr = */ false, + /* .events = */ false, + }; +} + +static ggml_guid_t ggml_backend_remoting_guid() { + static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b }; + return &guid; +} + + +static ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params) { + UNUSED(params); + ggml_backend_remoting_device_context * ctx = (ggml_backend_remoting_device_context *)dev->context; + + ggml_backend_t remoting_backend = new ggml_backend { + /* .guid = */ ggml_backend_remoting_guid(), + /* .interface = */ ggml_backend_remoting_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_remoting_reg(), ctx->device), + /* .context = */ ctx, + }; + + return remoting_backend; +} + +// host buffer type + +static const char * ggml_backend_remoting_host_buffer_type_name(ggml_backend_buffer_type_t buft) { + return GGML_REMOTING_NAME "_Host"; + + UNUSED(buft); +} + +static void ggml_backend_remoting_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { +# if 0 + ggml_remoting_host_free(remoting_instance.devices[0], buffer->context); +#endif + UNUSED(buffer); +} + +static ggml_backend_buffer_t ggml_backend_remoting_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + + void *ptr = nullptr; + ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); + buffer->buft = buft; + buffer->iface.free_buffer = ggml_backend_remoting_host_buffer_free_buffer; + + return buffer; + UNUSED(buft); +} + +static size_t ggml_backend_remoting_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + UNUSED(buft); + return 4096; +} + +// Should be changed to return device-specific host buffer type +// but that probably requires changes in llama.cpp +ggml_backend_buffer_type_t ggml_backend_remoting_host_buffer_type() { + static struct ggml_backend_buffer_type ggml_backend_remoting_buffer_type_host = { + /* .iface = */ { + /* .get_name = */ ggml_backend_remoting_host_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_remoting_host_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_remoting_host_buffer_type_get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, + /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_remoting_reg(), 0), + /* .context = */ nullptr, + }; + + // Make sure device 0 is initialized + //ggml_remoting_instance_init(); + //ggml_remoting_get_device(0); + + return &ggml_backend_remoting_buffer_type_host; +} + +static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_host_buffer_type(ggml_backend_dev_t dev) { + UNUSED(dev); + return ggml_backend_remoting_host_buffer_type(); +} + +static const struct ggml_backend_device_i ggml_backend_remoting_device_i = { + /* .get_name = */ ggml_backend_remoting_device_get_name, + /* .get_description = */ ggml_backend_remoting_device_get_description, + /* .get_memory = */ ggml_backend_remoting_device_get_memory, + 
+    /* .get_type = */ ggml_backend_remoting_device_get_type,
+    /* .get_props = */ ggml_backend_remoting_device_get_props,
+    /* .init_backend = */ ggml_backend_remoting_device_init,
+    /* .get_buffer_type = */ ggml_backend_remoting_device_get_buffer_type,
+    /* .get_host_buffer_type = */ ggml_backend_remoting_device_get_host_buffer_type,
+    /* .buffer_from_host_ptr = */ NULL,
+    /* .supports_op = */ ggml_backend_remoting_device_supports_op,
+    /* .supports_buft = */ ggml_backend_remoting_device_supports_buft,
+    /* .offload_op = */ ggml_backend_remoting_device_offload_op,
+    /* .event_new = */ NULL,
+    /* .event_free = */ NULL,
+    /* .event_synchronize = */ NULL,
+};
+
+static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) {
+    static std::vector<ggml_backend_dev_t> devices;
+
+    static bool initialized = false;
+
+    {
+        static std::mutex mutex;
+        std::lock_guard<std::mutex> lock(mutex);
+        if (!initialized) {
+            for (size_t i = 0; i < ggml_backend_remoting_reg_get_device_count(reg); i++) {
+                ggml_backend_remoting_device_context * ctx = new ggml_backend_remoting_device_context;
+                char desc[256] = "API Remoting device";
+
+                ctx->device = i;
+                ctx->name = GGML_REMOTING_NAME + std::to_string(i);
+                ctx->description = desc;
+                devices.push_back(new ggml_backend_device {
+                    /* .iface = */ ggml_backend_remoting_device_i,
+                    /* .reg = */ reg,
+                    /* .context = */ ctx,
+                });
+            }
+            initialized = true;
+        }
+    }
+
+    GGML_ASSERT(device < devices.size());
+    return devices[device];
+}
+
+int ggml_backend_remoting_get_device_count() {
+    return 1;
+}
+
+static const struct ggml_backend_reg_i ggml_backend_remoting_reg_i = {
+    /* .get_name = */ ggml_backend_remoting_reg_get_name,
+    /* .get_device_count = */ ggml_backend_remoting_reg_get_device_count,
+    /* .get_device = */ ggml_backend_remoting_reg_get_device,
+    /* .get_proc_address = */ NULL,
+};
+
+ggml_backend_reg_t ggml_backend_remoting_reg() {
+    static ggml_backend_reg reg = {
+        /* .api_version = */ GGML_BACKEND_API_VERSION,
+        /* .iface = */ ggml_backend_remoting_reg_i,
+        /* .context = */ nullptr,
+    };
+
+    RMT_LOG_DEBUG("ggml_backend_remoting_frontend_reg() hello :wave:");
+    return &reg;
+}

From cd5410fe75c3186885fea7d7464c40153f062eb5 Mon Sep 17 00:00:00 2001
From: Kevin Pouget
Date: Wed, 9 Apr 2025 13:47:19 +0200
Subject: [PATCH 005/117] .github: remove

---
 .../ISSUE_TEMPLATE/010-bug-compilation.yml | 87 -
 .github/ISSUE_TEMPLATE/011-bug-results.yml | 101 -
 .github/ISSUE_TEMPLATE/019-bug-misc.yml | 91 -
 .github/ISSUE_TEMPLATE/020-enhancement.yml | 51 -
 .github/ISSUE_TEMPLATE/030-research.yml | 52 -
 .github/ISSUE_TEMPLATE/040-refactor.yml | 28 -
 .github/ISSUE_TEMPLATE/config.yml | 11 -
 .github/actions/windows-setup-curl/action.yml | 25 -
 .github/labeler.yml | 86 -
 .github/pull_request_template.md | 1 -
 .github/workflows/bench.yml.disabled | 304 ---
 .github/workflows/build-linux-cross.yml | 124 --
 .github/workflows/build.yml | 1797 -----------------
 .github/workflows/close-issue.yml | 28 -
 .github/workflows/docker.yml | 175 --
 .github/workflows/editorconfig.yml | 29 -
 .github/workflows/gguf-publish.yml | 44 -
 .github/workflows/labeler.yml | 17 -
 .../workflows/python-check-requirements.yml | 33 -
 .github/workflows/python-lint.yml | 30 -
 .github/workflows/python-type-check.yml | 40 -
 .github/workflows/server.yml | 237 ---
 22 files changed, 3391 deletions(-)
 delete mode 100644 .github/ISSUE_TEMPLATE/010-bug-compilation.yml
 delete mode 100644 .github/ISSUE_TEMPLATE/011-bug-results.yml
 delete mode 100644 .github/ISSUE_TEMPLATE/019-bug-misc.yml
 delete mode 100644
.github/ISSUE_TEMPLATE/020-enhancement.yml delete mode 100644 .github/ISSUE_TEMPLATE/030-research.yml delete mode 100644 .github/ISSUE_TEMPLATE/040-refactor.yml delete mode 100644 .github/ISSUE_TEMPLATE/config.yml delete mode 100644 .github/actions/windows-setup-curl/action.yml delete mode 100644 .github/labeler.yml delete mode 100644 .github/pull_request_template.md delete mode 100644 .github/workflows/bench.yml.disabled delete mode 100644 .github/workflows/build-linux-cross.yml delete mode 100644 .github/workflows/build.yml delete mode 100644 .github/workflows/close-issue.yml delete mode 100644 .github/workflows/docker.yml delete mode 100644 .github/workflows/editorconfig.yml delete mode 100644 .github/workflows/gguf-publish.yml delete mode 100644 .github/workflows/labeler.yml delete mode 100644 .github/workflows/python-check-requirements.yml delete mode 100644 .github/workflows/python-lint.yml delete mode 100644 .github/workflows/python-type-check.yml delete mode 100644 .github/workflows/server.yml diff --git a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml deleted file mode 100644 index b85bf5741e5a3..0000000000000 --- a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml +++ /dev/null @@ -1,87 +0,0 @@ -name: Bug (compilation) -description: Something goes wrong when trying to compile llama.cpp. -title: "Compile bug: " -labels: ["bug-unconfirmed", "compilation"] -body: - - type: markdown - attributes: - value: > - Thanks for taking the time to fill out this bug report! - This issue template is intended for bug reports where the compilation of llama.cpp fails. - Before opening an issue, please confirm that the compilation still fails with `-DGGML_CCACHE=OFF`. - If the compilation succeeds with ccache disabled you should be able to permanently fix the issue - by clearing `~/.cache/ccache` (on Linux). - - type: textarea - id: commit - attributes: - label: Git commit - description: Which commit are you trying to compile? - placeholder: | - $git rev-parse HEAD - 84a07a17b1b08cf2b9747c633a2372782848a27f - validations: - required: true - - type: dropdown - id: operating-system - attributes: - label: Operating systems - description: Which operating systems do you know to be affected? - multiple: true - options: - - Linux - - Mac - - Windows - - BSD - - Other? (Please let us know in description) - validations: - required: true - - type: dropdown - id: backends - attributes: - label: GGML backends - description: Which GGML backends do you know to be affected? - options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan] - multiple: true - validations: - required: true - - type: textarea - id: info - attributes: - label: Problem description & steps to reproduce - description: > - Please give us a summary of the problem and tell us how to reproduce it. - If you can narrow down the bug to specific compile flags, that information would be very much appreciated by us. - placeholder: > - I'm trying to compile llama.cpp with CUDA support on a fresh install of Ubuntu and get error XY. - Here are the exact commands that I used: ... - validations: - required: true - - type: textarea - id: first_bad_commit - attributes: - label: First Bad Commit - description: > - If the bug was not present on an earlier version: when did it start appearing? - If possible, please do a git bisect and identify the exact commit that introduced the bug. 
- validations: - required: false - - type: textarea - id: command - attributes: - label: Compile command - description: > - Please provide the exact command you used to compile llama.cpp. For example: `cmake -B ...`. - This will be automatically formatted into code, so no need for backticks. - render: shell - validations: - required: true - - type: textarea - id: logs - attributes: - label: Relevant log output - description: > - Please copy and paste any relevant log output, including any generated text. - This will be automatically formatted into code, so no need for backticks. - render: shell - validations: - required: true diff --git a/.github/ISSUE_TEMPLATE/011-bug-results.yml b/.github/ISSUE_TEMPLATE/011-bug-results.yml deleted file mode 100644 index 1ccef0793d45e..0000000000000 --- a/.github/ISSUE_TEMPLATE/011-bug-results.yml +++ /dev/null @@ -1,101 +0,0 @@ -name: Bug (model use) -description: Something goes wrong when using a model (in general, not specific to a single llama.cpp module). -title: "Eval bug: " -labels: ["bug-unconfirmed", "model evaluation"] -body: - - type: markdown - attributes: - value: > - Thanks for taking the time to fill out this bug report! - This issue template is intended for bug reports where the model evaluation results - (i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation. - If you encountered the issue while using an external UI (e.g. ollama), - please reproduce your issue using one of the examples/binaries in this repository. - The `llama-cli` binary can be used for simple and reproducible model inference. - - type: textarea - id: version - attributes: - label: Name and Version - description: Which version of our software are you running? (use `--version` to get a version string) - placeholder: | - $./llama-cli --version - version: 2999 (42b4109e) - built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu - validations: - required: true - - type: dropdown - id: operating-system - attributes: - label: Operating systems - description: Which operating systems do you know to be affected? - multiple: true - options: - - Linux - - Mac - - Windows - - BSD - - Other? (Please let us know in description) - validations: - required: true - - type: dropdown - id: backends - attributes: - label: GGML backends - description: Which GGML backends do you know to be affected? - options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan] - multiple: true - validations: - required: true - - type: textarea - id: hardware - attributes: - label: Hardware - description: Which CPUs/GPUs are you using? - placeholder: > - e.g. Ryzen 5950X + 2x RTX 4090 - validations: - required: true - - type: textarea - id: model - attributes: - label: Models - description: > - Which model(s) at which quantization were you using when encountering the bug? - If you downloaded a GGUF file off of Huggingface, please provide a link. - placeholder: > - e.g. Meta LLaMA 3.1 Instruct 8b q4_K_M - validations: - required: false - - type: textarea - id: info - attributes: - label: Problem description & steps to reproduce - description: > - Please give us a summary of the problem and tell us how to reproduce it. - If you can narrow down the bug to specific hardware, compile flags, or command line arguments, - that information would be very much appreciated by us. - placeholder: > - e.g. when I run llama-cli with -ngl 99 I get garbled outputs. - When I use -ngl 0 it works correctly. - Here are the exact commands that I used: ... 
- validations: - required: true - - type: textarea - id: first_bad_commit - attributes: - label: First Bad Commit - description: > - If the bug was not present on an earlier version: when did it start appearing? - If possible, please do a git bisect and identify the exact commit that introduced the bug. - validations: - required: false - - type: textarea - id: logs - attributes: - label: Relevant log output - description: > - Please copy and paste any relevant log output, including the command that you entered and any generated text. - This will be automatically formatted into code, so no need for backticks. - render: shell - validations: - required: true diff --git a/.github/ISSUE_TEMPLATE/019-bug-misc.yml b/.github/ISSUE_TEMPLATE/019-bug-misc.yml deleted file mode 100644 index 1904e31fdc436..0000000000000 --- a/.github/ISSUE_TEMPLATE/019-bug-misc.yml +++ /dev/null @@ -1,91 +0,0 @@ -name: Bug (misc.) -description: Something is not working the way it should (and it's not covered by any of the above cases). -title: "Misc. bug: " -labels: ["bug-unconfirmed"] -body: - - type: markdown - attributes: - value: > - Thanks for taking the time to fill out this bug report! - This issue template is intended for miscellaneous bugs that don't fit into any other category. - If you encountered the issue while using an external UI (e.g. ollama), - please reproduce your issue using one of the examples/binaries in this repository. - - type: textarea - id: version - attributes: - label: Name and Version - description: Which version of our software is affected? (You can use `--version` to get a version string.) - placeholder: | - $./llama-cli --version - version: 2999 (42b4109e) - built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu - validations: - required: true - - type: dropdown - id: operating-system - attributes: - label: Operating systems - description: Which operating systems do you know to be affected? - multiple: true - options: - - Linux - - Mac - - Windows - - BSD - - Other? (Please let us know in description) - validations: - required: false - - type: dropdown - id: module - attributes: - label: Which llama.cpp modules do you know to be affected? - multiple: true - options: - - Documentation/Github - - libllama (core library) - - llama-cli - - llama-server - - llama-bench - - llama-quantize - - Python/Bash scripts - - Test code - - Other (Please specify in the next section) - validations: - required: false - - type: textarea - id: command - attributes: - label: Command line - description: > - Please provide the exact commands you entered, if applicable. For example: `llama-server -m ... -c ...`, `llama-cli -m ...`, etc. - This will be automatically formatted into code, so no need for backticks. - render: shell - validations: - required: false - - type: textarea - id: info - attributes: - label: Problem description & steps to reproduce - description: > - Please give us a summary of the problem and tell us how to reproduce it (if applicable). - validations: - required: true - - type: textarea - id: first_bad_commit - attributes: - label: First Bad Commit - description: > - If the bug was not present on an earlier version and it's not trivial to track down: when did it start appearing? - If possible, please do a git bisect and identify the exact commit that introduced the bug. 
- validations: - required: false - - type: textarea - id: logs - attributes: - label: Relevant log output - description: > - If applicable, please copy and paste any relevant log output, including any generated text. - This will be automatically formatted into code, so no need for backticks. - render: shell - validations: - required: false diff --git a/.github/ISSUE_TEMPLATE/020-enhancement.yml b/.github/ISSUE_TEMPLATE/020-enhancement.yml deleted file mode 100644 index cee1446f5a097..0000000000000 --- a/.github/ISSUE_TEMPLATE/020-enhancement.yml +++ /dev/null @@ -1,51 +0,0 @@ -name: Enhancement -description: Used to request enhancements for llama.cpp. -title: "Feature Request: " -labels: ["enhancement"] -body: - - type: markdown - attributes: - value: | - [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggml-org/llama.cpp/discussions/categories/ideas) - - - type: checkboxes - id: prerequisites - attributes: - label: Prerequisites - description: Please confirm the following before submitting your enhancement request. - options: - - label: I am running the latest code. Mention the version if possible as well. - required: true - - label: I carefully followed the [README.md](https://github.com/ggml-org/llama.cpp/blob/master/README.md). - required: true - - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed). - required: true - - label: I reviewed the [Discussions](https://github.com/ggml-org/llama.cpp/discussions), and have a new and useful enhancement to share. - required: true - - - type: textarea - id: feature-description - attributes: - label: Feature Description - description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement. - placeholder: Detailed description of the enhancement - validations: - required: true - - - type: textarea - id: motivation - attributes: - label: Motivation - description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users. - placeholder: Explanation of why this feature is needed and its benefits - validations: - required: true - - - type: textarea - id: possible-implementation - attributes: - label: Possible Implementation - description: If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better. - placeholder: Detailed description of potential implementation - validations: - required: false diff --git a/.github/ISSUE_TEMPLATE/030-research.yml b/.github/ISSUE_TEMPLATE/030-research.yml deleted file mode 100644 index e774550d5908c..0000000000000 --- a/.github/ISSUE_TEMPLATE/030-research.yml +++ /dev/null @@ -1,52 +0,0 @@ -name: Research -description: Track new technical research area. 
-title: "Research: " -labels: ["research 🔬"] -body: - - type: markdown - attributes: - value: | - Don't forget to check for any [duplicate research issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22) - - - type: checkboxes - id: research-stage - attributes: - label: Research Stage - description: Track general state of this research ticket - options: - - label: Background Research (Let's try to avoid reinventing the wheel) - - label: Hypothesis Formed (How do you think this will work and it's effect?) - - label: Strategy / Implementation Forming - - label: Analysis of results - - label: Debrief / Documentation (So people in the future can learn from us) - - - type: textarea - id: background - attributes: - label: Previous existing literature and research - description: Whats the current state of the art and whats the motivation for this research? - - - type: textarea - id: hypothesis - attributes: - label: Hypothesis - description: How do you think this will work and it's effect? - - - type: textarea - id: implementation - attributes: - label: Implementation - description: Got an approach? e.g. a PR ready to go? - - - type: textarea - id: analysis - attributes: - label: Analysis - description: How does the proposed implementation behave? - - - type: textarea - id: logs - attributes: - label: Relevant log output - description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. - render: shell diff --git a/.github/ISSUE_TEMPLATE/040-refactor.yml b/.github/ISSUE_TEMPLATE/040-refactor.yml deleted file mode 100644 index 2fe94e26c6988..0000000000000 --- a/.github/ISSUE_TEMPLATE/040-refactor.yml +++ /dev/null @@ -1,28 +0,0 @@ -name: Refactor (Maintainers) -description: Used to track refactoring opportunities. -title: "Refactor: " -labels: ["refactor"] -body: - - type: markdown - attributes: - value: | - Don't forget to [check for existing refactor issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered. - Also you may want to check [Pull request refactor label as well](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too. - - - type: textarea - id: background-description - attributes: - label: Background Description - description: Please provide a detailed written description of the pain points you are trying to solve. - placeholder: Detailed description behind your motivation to request refactor - validations: - required: true - - - type: textarea - id: possible-approaches - attributes: - label: Possible Refactor Approaches - description: If you have some idea of possible approaches to solve this problem. You may want to make it a todo list. - placeholder: Your idea of possible refactoring opportunity/approaches - validations: - required: false diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml deleted file mode 100644 index 0d246533c9515..0000000000000 --- a/.github/ISSUE_TEMPLATE/config.yml +++ /dev/null @@ -1,11 +0,0 @@ -blank_issues_enabled: true -contact_links: - - name: Got an idea? - url: https://github.com/ggml-org/llama.cpp/discussions/categories/ideas - about: Pop it there. It may then become an enhancement ticket. - - name: Got a question? - url: https://github.com/ggml-org/llama.cpp/discussions/categories/q-a - about: Ask a question there! - - name: Want to contribute? 
- url: https://github.com/ggml-org/llama.cpp/wiki/contribute - about: Head to the contribution guide page of the wiki for areas you can help with diff --git a/.github/actions/windows-setup-curl/action.yml b/.github/actions/windows-setup-curl/action.yml deleted file mode 100644 index 5d76da3d79ac5..0000000000000 --- a/.github/actions/windows-setup-curl/action.yml +++ /dev/null @@ -1,25 +0,0 @@ -name: 'Windows - Setup CURL' -description: 'Composite action, to be reused in other workflow' -inputs: - curl_version: - description: 'CURL version' - required: false - default: '8.6.0_6' -outputs: - curl_path: - description: "Path to the downloaded libcurl" - value: ${{ steps.get_libcurl.outputs.curl_path }} - -runs: - using: "composite" - steps: - - name: libCURL - id: get_libcurl - shell: powershell - env: - CURL_VERSION: ${{ inputs.curl_version }} - run: | - curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip" - mkdir $env:RUNNER_TEMP/libcurl - tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl - echo "curl_path=$env:RUNNER_TEMP/libcurl" >> $env:GITHUB_OUTPUT diff --git a/.github/labeler.yml b/.github/labeler.yml deleted file mode 100644 index 1b47bc96885c4..0000000000000 --- a/.github/labeler.yml +++ /dev/null @@ -1,86 +0,0 @@ -# https://github.com/actions/labeler -Kompute: - - changed-files: - - any-glob-to-any-file: - - ggml/include/ggml-kompute.h - - ggml/src/ggml-kompute/** - - README-kompute.md -Apple Metal: - - changed-files: - - any-glob-to-any-file: - - ggml/include/ggml-metal.h - - ggml/src/ggml-metal/** - - README-metal.md -SYCL: - - changed-files: - - any-glob-to-any-file: - - ggml/include/ggml-sycl.h - - ggml/src/ggml-sycl/** - - docs/backend/SYCL.md - - examples/sycl/** -Nvidia GPU: - - changed-files: - - any-glob-to-any-file: - - ggml/include/ggml-cuda.h - - ggml/src/ggml-cuda/** -Vulkan: - - changed-files: - - any-glob-to-any-file: - - ggml/include/ggml-vulkan.h - - ggml/src/ggml-vulkan/** -documentation: - - changed-files: - - any-glob-to-any-file: - - docs/** - - media/** -testing: - - changed-files: - - any-glob-to-any-file: - - tests/** -build: - - changed-files: - - any-glob-to-any-file: - - cmake/** - - CMakeLists.txt - - CMakePresets.json -examples: - - changed-files: - - any-glob-to-any-file: examples/** -devops: - - changed-files: - - any-glob-to-any-file: - - .devops/** - - .github/** - - ci/** -python: - - changed-files: - - any-glob-to-any-file: - - "**/*.py" - - requirements/** - - gguf-py/** - - .flake8 -script: - - changed-files: - - any-glob-to-any-file: - - scripts/** -android: - - changed-files: - - any-glob-to-any-file: - - examples/llama.android/** -server: - - changed-files: - - any-glob-to-any-file: - - examples/server/** -ggml: - - changed-files: - - any-glob-to-any-file: - - ggml/** -nix: - - changed-files: - - any-glob-to-any-file: - - "**/*.nix" - - .github/workflows/nix-*.yml - - .devops/nix/nixpkgs-instances.nix -embedding: - - changed-files: - - any-glob-to-any-file: examples/embedding/ diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md deleted file mode 100644 index d0bdd73c4439c..0000000000000 --- a/.github/pull_request_template.md +++ /dev/null @@ -1 +0,0 @@ -*Make sure to read the [contributing guidelines](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR* diff --git a/.github/workflows/bench.yml.disabled b/.github/workflows/bench.yml.disabled deleted file mode 
100644 index 75d2714792891..0000000000000 --- a/.github/workflows/bench.yml.disabled +++ /dev/null @@ -1,304 +0,0 @@ -# TODO: there have been some issues with the workflow, so disabling for now -# https://github.com/ggml-org/llama.cpp/issues/7893 -# -# Benchmark -name: Benchmark - -on: - workflow_dispatch: - inputs: - gpu-series: - description: 'Azure GPU series to run with' - required: true - type: choice - options: - - Standard_NC4as_T4_v3 - - Standard_NC24ads_A100_v4 - - Standard_NC80adis_H100_v5 - sha: - description: 'Commit SHA1 to build' - required: false - type: string - duration: - description: 'Duration of the bench' - type: string - default: 10m - - push: - branches: - - master - paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp'] - pull_request_target: - types: [opened, synchronize, reopened] - paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp'] - schedule: - - cron: '04 2 * * *' - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }} - cancel-in-progress: true - -jobs: - bench-server-baseline: - runs-on: Standard_NC4as_T4_v3 - env: - RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it - N_USERS: 8 - DURATION: 10m - - strategy: - matrix: - model: [phi-2] - ftype: [q4_0, q8_0, f16] - include: - - model: phi-2 - ftype: q4_0 - pr_comment_enabled: "true" - - if: | - inputs.gpu-series == 'Standard_NC4as_T4_v3' - || github.event_name == 'pull_request_target' - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - - - name: Install python env - id: pipenv - run: | - cd examples/server/bench - python3 -m venv venv - source venv/bin/activate - pip install -r requirements.txt - - - name: Prometheus - id: install_prometheus - run: | - wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz - tar xzf prometheus*.tar.gz --strip-components=1 - ./prometheus --config.file=examples/server/bench/prometheus.yml & - while ! 
nc -z localhost 9090; do - sleep 0.1 - done - - - name: Set up Go - uses: actions/setup-go@v5 - with: - go-version: '1.21' - - - name: Install k6 and xk6-sse - id: k6_installation - run: | - cd examples/server/bench - go install go.k6.io/xk6/cmd/xk6@latest - xk6 build master \ - --with github.com/phymbert/xk6-sse - - - name: Build - id: cmake_build - run: | - set -eux - cmake -B build \ - -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DLLAMA_CUBLAS=ON \ - -DCUDAToolkit_ROOT=/usr/local/cuda \ - -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \ - -DCMAKE_CUDA_ARCHITECTURES=75 \ - -DLLAMA_FATAL_WARNINGS=OFF \ - -DLLAMA_ALL_WARNINGS=OFF \ - -DCMAKE_BUILD_TYPE=Release; - cmake --build build --config Release -j $(nproc) --target llama-server - - - name: Download the dataset - id: download_dataset - run: | - cd examples/server/bench - wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json - - - name: Server bench - id: server_bench - env: - HEAD_REF: ${{ github.head_ref || github.ref_name }} - run: | - set -eux - - cd examples/server/bench - source venv/bin/activate - python bench.py \ - --runner-label ${{ env.RUNNER_LABEL }} \ - --name ${{ github.job }} \ - --branch $HEAD_REF \ - --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \ - --scenario script.js \ - --duration ${{ github.event.inputs.duration || env.DURATION }} \ - --hf-repo ggml-org/models \ - --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \ - --model-path-prefix /models \ - --parallel ${{ env.N_USERS }} \ - -ngl 33 \ - --batch-size 2048 \ - --ubatch-size 256 \ - --ctx-size 16384 \ - --n-prompts 1000 \ - --max-prompt-tokens 1024 \ - --max-tokens 2048 - - cat results.github.env >> $GITHUB_ENV - - # Remove dataset as we do not want it in the artefact - rm ShareGPT_V3_unfiltered_cleaned_split.json - - - uses: actions/upload-artifact@v4 - with: - name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }} - compression-level: 9 - path: | - examples/server/bench/*.jpg - examples/server/bench/*.json - examples/server/bench/*.log - - - name: Commit status - uses: Sibz/github-status-action@v1 - with: - authToken: ${{secrets.GITHUB_TOKEN}} - sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }} - context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }} - description: | - ${{ env.BENCH_RESULTS }} - state: 'success' - - - name: Upload benchmark images - uses: devicons/public-upload-to-imgur@v2.2.2 - continue-on-error: true # Important as it looks unstable: 503 - id: imgur_step - with: - client_id: ${{secrets.IMGUR_CLIENT_ID}} - path: | - examples/server/bench/prompt_tokens_seconds.jpg - examples/server/bench/predicted_tokens_seconds.jpg - examples/server/bench/kv_cache_usage_ratio.jpg - examples/server/bench/requests_processing.jpg - - - name: Extract mermaid - id: set_mermaid - run: | - set -eux - - cd examples/server/bench - PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid) - echo "PROMPT_TOKENS_SECONDS<> $GITHUB_ENV - echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV - echo "EOF" >> $GITHUB_ENV - - PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid) - echo "PREDICTED_TOKENS_SECONDS<> $GITHUB_ENV - echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV - echo "EOF" >> $GITHUB_ENV - - KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid) - echo "KV_CACHE_USAGE_RATIO<> 
$GITHUB_ENV - echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV - echo "EOF" >> $GITHUB_ENV - - REQUESTS_PROCESSING=$(cat requests_processing.mermaid) - echo "REQUESTS_PROCESSING<> $GITHUB_ENV - echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV - echo "EOF" >> $GITHUB_ENV - - - name: Extract image url - id: extract_image_url - continue-on-error: true - run: | - set -eux - - echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV - echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV - echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV - echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV - - - name: Comment PR - uses: mshick/add-pr-comment@v2 - id: comment_pr - if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }} - with: - message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }} - message: | -
-            <p align="center">
-
-            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
-
-            </p>
-
-            <details>
-
-            <summary>Expand details for performance related PR only</summary>
-
-            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
-            - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
-            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
-            - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
-            - ${{ env.BENCH_GRAPH_XLABEL }}
-
-            <p align="center">
-
-            <img src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />
-
-            <details>
-
-            <summary>More</summary>
-
-            ```mermaid
-            ${{ env.PROMPT_TOKENS_SECONDS }}
-            ```
-
-            </details>
-
-            <img src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds" />
-
-            <details>
-            <summary>More</summary>
-
-            ```mermaid
-            ${{ env.PREDICTED_TOKENS_SECONDS }}
-            ```
-
-            </details>
-
-            </p>
-
-            <details>
-
-            <summary>Details</summary>
-
-            <p align="center">
-
-            <img src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />
-
-            <details>
-            <summary>More</summary>
-
-            ```mermaid
-            ${{ env.KV_CACHE_USAGE_RATIO }}
-            ```
-
-            </details>
-
-            <img src="${{ env.IMAGE_3 }}" alt="requests_processing" />
-
-            <details>
-            <summary>More</summary>
-
-            ```mermaid
-            ${{ env.REQUESTS_PROCESSING }}
-            ```
-
-            </details>
-
-            </p>
-            </details>
diff --git a/.github/workflows/build-linux-cross.yml b/.github/workflows/build-linux-cross.yml deleted file mode 100644 index e8639913ea3a6..0000000000000 --- a/.github/workflows/build-linux-cross.yml +++ /dev/null @@ -1,124 +0,0 @@ -name: Build on Linux using cross-compiler -on: - workflow_dispatch: - workflow_call: - -jobs: - ubuntu-latest-riscv64-cpu-cross: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - - name: Setup Riscv - run: | - sudo dpkg --add-architecture riscv64 - sudo sed -i 's|http://azure.archive.ubuntu.com/ubuntu|http://ports.ubuntu.com/ubuntu-ports|g' \ - /etc/apt/sources.list /etc/apt/apt-mirrors.txt - sudo apt-get clean - sudo apt-get update - sudo apt-get install -y --no-install-recommends \ - build-essential \ - gcc-14-riscv64-linux-gnu \ - g++-14-riscv64-linux-gnu \ - libcurl4-openssl-dev:riscv64 - - - name: Build - run: | - cmake -B build -DCMAKE_BUILD_TYPE=Release \ - -DGGML_OPENMP=OFF \ - -DLLAMA_BUILD_EXAMPLES=ON \ - -DLLAMA_BUILD_TESTS=OFF \ - -DCMAKE_SYSTEM_NAME=Linux \ - -DCMAKE_SYSTEM_PROCESSOR=riscv64 \ - -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \ - -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \ - -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ - -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \ - -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ - -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ - -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH - - cmake --build build --config Release -j $(nproc) - - ubuntu-latest-riscv64-vulkan-cross: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Setup Riscv - run: | - sudo dpkg --add-architecture riscv64 - sudo sed -i 's|http://azure.archive.ubuntu.com/ubuntu|http://ports.ubuntu.com/ubuntu-ports|g' \ - /etc/apt/sources.list /etc/apt/apt-mirrors.txt - sudo apt-get clean - sudo apt-get update - sudo apt-get install -y --no-install-recommends \ - build-essential \ - glslc \ - gcc-14-riscv64-linux-gnu \ - g++-14-riscv64-linux-gnu \ - libvulkan-dev:riscv64 \ - libcurl4-openssl-dev:riscv64 - - - name: Build - run: | - cmake -B build -DCMAKE_BUILD_TYPE=Release \ - -DGGML_VULKAN=ON \ - -DGGML_OPENMP=OFF \ - -DLLAMA_BUILD_EXAMPLES=ON \ - -DLLAMA_BUILD_TESTS=OFF \ - -DCMAKE_SYSTEM_NAME=Linux \ - -DCMAKE_SYSTEM_PROCESSOR=riscv64 \ - -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \ - -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \ - -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ - -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \ - -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ - -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ - -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH - - cmake --build build --config Release -j $(nproc) - - ubuntu-latest-arm64-vulkan-cross: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Setup Arm64 - run: | - sudo dpkg --add-architecture arm64 - sudo sed -i 's|http://azure.archive.ubuntu.com/ubuntu|http://ports.ubuntu.com/ubuntu-ports|g' \ - /etc/apt/sources.list /etc/apt/apt-mirrors.txt - sudo apt-get clean - sudo apt-get update - sudo apt-get install -y --no-install-recommends \ - build-essential \ - glslc \ - crossbuild-essential-arm64 \ - libvulkan-dev:arm64 \ - libcurl4-openssl-dev:arm64 - - - name: Build - run: | - cmake -B build -DCMAKE_BUILD_TYPE=Release \ - -DGGML_VULKAN=ON \ - -DGGML_OPENMP=OFF \ - -DLLAMA_BUILD_EXAMPLES=ON \ - -DLLAMA_BUILD_TESTS=OFF \ - -DCMAKE_SYSTEM_NAME=Linux \ - -DCMAKE_SYSTEM_PROCESSOR=aarch64 \ - -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc \ - -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ \ - 
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \ - -DCMAKE_FIND_ROOT_PATH=/usr/lib/aarch64-linux-gnu \ - -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ - -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ - -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH - - cmake --build build --config Release -j $(nproc) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml deleted file mode 100644 index bcfcf08ac30b6..0000000000000 --- a/.github/workflows/build.yml +++ /dev/null @@ -1,1797 +0,0 @@ -name: CI - -on: - workflow_dispatch: # allows manual triggering - inputs: - create_release: - description: 'Create new release' - required: true - type: boolean - push: - branches: - - master - paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp'] - pull_request: - types: [opened, synchronize, reopened] - paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp'] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -# Fine-grant permission -# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token -permissions: - contents: write # for creating release - -env: - BRANCH_NAME: ${{ github.head_ref || github.ref_name }} - GGML_NLOOP: 3 - GGML_N_THREADS: 1 - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 - -jobs: - macOS-latest-cmake-arm64: - runs-on: macos-14 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: macOS-latest-cmake-arm64 - evict-old-files: 1d - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - brew install curl - - - name: Build - id: cmake_build - run: | - sysctl -a - cmake -B build \ - -DCMAKE_BUILD_RPATH="@loader_path" \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DGGML_RPC=ON - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L 'main|curl' --verbose --timeout 900 - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Pack artifacts - id: pack_artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - run: | - cp LICENSE ./build/bin/ - zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/* - - - name: Upload artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip - name: llama-bin-macos-arm64.zip - - macOS-latest-cmake-x64: - runs-on: macos-13 - - steps: - - name: Clone - 
id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: macOS-latest-cmake-x64 - evict-old-files: 1d - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - brew install curl - - - name: Build - id: cmake_build - run: | - sysctl -a - # Metal is disabled due to intermittent failures with Github runners not having a GPU: - # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 - cmake -B build \ - -DCMAKE_BUILD_RPATH="@loader_path" \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DGGML_METAL=OFF \ - -DGGML_RPC=ON - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose --timeout 900 - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Pack artifacts - id: pack_artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - run: | - cp LICENSE ./build/bin/ - zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/* - - - name: Upload artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip - name: llama-bin-macos-x64.zip - - ubuntu-cpu-cmake: - strategy: - matrix: - include: - - build: 'x64' - os: ubuntu-22.04 - - build: 'arm64' - os: ubuntu-22.04-arm - - runs-on: ${{ matrix.os }} - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-cpu-cmake - evict-old-files: 1d - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev - - - name: Build - id: cmake_build - run: | - cmake -B build \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DGGML_RPC=ON - cmake --build build --config Release -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L 'main|curl' --verbose --timeout 900 - - - name: Test llama2c conversion - id: llama2c_test - run: | - cd build - echo "Fetch tokenizer" - wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin - echo "Fetch llama2c model" - wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin - ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf - ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256 - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Pack artifacts - 
id: pack_artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - run: | - cp LICENSE ./build/bin/ - zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/* - - - name: Upload artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip - name: llama-bin-ubuntu-${{ matrix.build }}.zip - - ubuntu-latest-cmake-sanitizer: - runs-on: ubuntu-latest - - continue-on-error: true - - strategy: - matrix: - sanitizer: [ADDRESS, THREAD, UNDEFINED] - build_type: [Debug] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-latest-cmake-sanitizer-${{ matrix.sanitizer }} - evict-old-files: 1d - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev - - - name: Build - id: cmake_build - if: ${{ matrix.sanitizer != 'THREAD' }} - run: | - cmake -B build \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) - - - name: Build (no OpenMP) - id: cmake_build_no_openmp - if: ${{ matrix.sanitizer == 'THREAD' }} - run: | - cmake -B build \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DGGML_OPENMP=OFF - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose --timeout 900 - - ubuntu-latest-llguidance: - runs-on: ubuntu-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev - - - name: Build - id: cmake_build - run: | - mkdir build - cd build - cmake .. \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DLLAMA_LLGUIDANCE=ON - cmake --build . 
--config Release -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose --timeout 900 - - ubuntu-latest-cmake-rpc: - runs-on: ubuntu-latest - - continue-on-error: true - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-latest-cmake-rpc - evict-old-files: 1d - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev - - - name: Build - id: cmake_build - run: | - cmake -B build \ - -DGGML_RPC=ON - cmake --build build --config Release -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose - - ubuntu-22-cmake-vulkan: - runs-on: ubuntu-22.04 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-22-cmake-vulkan - evict-old-files: 1d - - - name: Dependencies - id: depends - run: | - wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add - - sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list - sudo apt-get update -y - sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev - - - name: Build - id: cmake_build - run: | - cmake -B build \ - -DGGML_VULKAN=ON - cmake --build build --config Release -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - # This is using llvmpipe and runs slower than other backends - ctest -L main --verbose --timeout 2700 - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Pack artifacts - id: pack_artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - run: | - cp LICENSE ./build/bin/ - zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/* - - - name: Upload artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip - name: llama-bin-ubuntu-vulkan-x64.zip - - ubuntu-22-cmake-hip: - runs-on: ubuntu-22.04 - container: rocm/dev-ubuntu-22.04:6.0.2 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libcurl4-openssl-dev - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-22-cmake-hip - evict-old-files: 1d - - - name: Build with native CMake HIP support - id: cmake_build - run: | - cmake -B build -S . \ - -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \ - -DGGML_HIP_ROCWMMA_FATTN=ON \ - -DGGML_HIP=ON - cmake --build build --config Release -j $(nproc) - - - name: Build with legacy HIP support - id: cmake_build_legacy_hip - run: | - cmake -B build2 -S . 
\ - -DCMAKE_C_COMPILER=hipcc \ - -DCMAKE_CXX_COMPILER=hipcc \ - -DGGML_HIP_ROCWMMA_FATTN=ON \ - -DGGML_HIP=ON - cmake --build build2 --config Release -j $(nproc) - - ubuntu-22-cmake-musa: - runs-on: ubuntu-22.04 - container: mthreads/musa:rc3.1.1-devel-ubuntu22.04 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - run: | - apt-get update - apt-get install -y build-essential git cmake libcurl4-openssl-dev - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-22-cmake-musa - evict-old-files: 1d - - - name: Build with native CMake MUSA support - id: cmake_build - run: | - cmake -B build -S . \ - -DGGML_MUSA=ON - cmake --build build --config Release -j $(nproc) - - ubuntu-22-cmake-sycl: - runs-on: ubuntu-22.04 - - continue-on-error: true - - steps: - - uses: actions/checkout@v4 - - - name: add oneAPI to apt - shell: bash - run: | - cd /tmp - wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main" - - - name: install oneAPI dpcpp compiler - shell: bash - run: | - sudo apt update - sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev - - - name: install oneAPI MKL library - shell: bash - run: | - sudo apt install intel-oneapi-mkl-devel - - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-22-cmake-sycl - evict-old-files: 1d - - - name: Build - id: cmake_build - run: | - source /opt/intel/oneapi/setvars.sh - cmake -B build \ - -DGGML_SYCL=ON \ - -DCMAKE_C_COMPILER=icx \ - -DCMAKE_CXX_COMPILER=icpx - cmake --build build --config Release -j $(nproc) - - ubuntu-22-cmake-sycl-fp16: - runs-on: ubuntu-22.04 - - continue-on-error: true - - steps: - - uses: actions/checkout@v4 - - - name: add oneAPI to apt - shell: bash - run: | - cd /tmp - wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main" - - - name: install oneAPI dpcpp compiler - shell: bash - run: | - sudo apt update - sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev - - - name: install oneAPI MKL library - shell: bash - run: | - sudo apt install intel-oneapi-mkl-devel - - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-22-cmake-sycl-fp16 - evict-old-files: 1d - - - name: Build - id: cmake_build - run: | - source /opt/intel/oneapi/setvars.sh - cmake -B build \ - -DGGML_SYCL=ON \ - -DCMAKE_C_COMPILER=icx \ - -DCMAKE_CXX_COMPILER=icpx \ - -DGGML_SYCL_F16=ON - cmake --build build --config Release -j $(nproc) - - build-linux-cross: - uses: ./.github/workflows/build-linux-cross.yml - - macOS-latest-cmake-ios: - runs-on: macos-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: macOS-latest-cmake-ios - evict-old-files: 1d - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - name: Build - id: cmake_build - run: | - sysctl -a - cmake -B build -G Xcode \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - 
-DLLAMA_BUILD_COMMON=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_SYSTEM_NAME=iOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO - - macOS-latest-cmake-tvos: - runs-on: macos-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: macOS-latest-cmake-tvos - evict-old-files: 1d - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - name: Build - id: cmake_build - run: | - sysctl -a - cmake -B build -G Xcode \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_BUILD_COMMON=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_SYSTEM_NAME=tvOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO - - macOS-latest-cmake-visionos: - runs-on: macos-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - name: Build - id: cmake_build - run: | - sysctl -a - cmake -B build -G Xcode \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_BUILD_COMMON=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_SYSTEM_NAME=visionOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=1.0 \ - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO - - macOS-latest-swift: - runs-on: macos-latest - - strategy: - matrix: - destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS'] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: macOS-latest-swift - evict-old-files: 1d - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - name: Build llama.cpp with CMake - id: cmake_build - run: | - sysctl -a - cmake -B build -G Xcode \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_CURL=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - - - name: xcodebuild for swift package - id: xcodebuild - run: | - ./build-xcframework.sh - - windows-msys2: - runs-on: windows-latest - - strategy: - fail-fast: false - matrix: - include: - - { sys: UCRT64, env: ucrt-x86_64, build: Release } - - { sys: CLANG64, env: clang-x86_64, build: Release } - - steps: - - name: Clone - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: windows-msys2 - variant: sccache - evict-old-files: 1d - - - name: Setup ${{ matrix.sys }} - uses: msys2/setup-msys2@v2 - with: - update: true - msystem: ${{matrix.sys}} - install: >- - base-devel - git - mingw-w64-${{matrix.env}}-toolchain - mingw-w64-${{matrix.env}}-cmake - mingw-w64-${{matrix.env}}-openblas - - - name: Build using CMake - shell: msys2 {0} - run: | - cmake -B build - cmake --build build --config ${{ matrix.build }} -j $(nproc) - - - name: 
Clean after building using CMake - shell: msys2 {0} - run: | - rm -rf build - - - name: Build using CMake w/ OpenBLAS - shell: msys2 {0} - run: | - cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS - cmake --build build --config ${{ matrix.build }} -j $(nproc) - - windows-latest-cmake: - runs-on: windows-latest - - env: - OPENBLAS_VERSION: 0.3.23 - SDE_VERSION: 9.33.0-2024-01-07 - VULKAN_VERSION: 1.4.309.0 - - strategy: - matrix: - include: - - build: 'noavx-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF' - - build: 'avx2-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON' - - build: 'avx-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF' - - build: 'avx512-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON' - - build: 'openblas-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' - - build: 'kompute-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON' - - build: 'vulkan-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON' - - build: 'llvm-arm64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON' - - build: 'msvc-arm64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON' - - build: 'llvm-arm64-opencl-adreno' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON' - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: windows-latest-cmake-${{ matrix.build }} - variant: sccache - evict-old-files: 1d - - - name: Clone Kompute submodule - id: clone_kompute - if: ${{ matrix.build == 'kompute-x64' }} - run: | - git submodule update --init ggml/src/ggml-kompute/kompute - - - name: Download OpenBLAS - id: get_openblas - if: ${{ matrix.build == 'openblas-x64' }} - run: | - curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip" - curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE" - mkdir $env:RUNNER_TEMP/openblas - tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas - $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath) - $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim())) - $lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe') - & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll - - - name: Install Vulkan SDK - id: get_vulkan - if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }} - run: | - curl.exe -o 
$env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe" - & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install - Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}" - Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin" - - - name: Install Ninja - id: install_ninja - run: | - choco install ninja - - - name: Install OpenCL Headers and Libs - id: install_opencl - if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }} - run: | - git clone https://github.com/KhronosGroup/OpenCL-Headers - cd OpenCL-Headers - cmake -B build ` - -DBUILD_TESTING=OFF ` - -DOPENCL_HEADERS_BUILD_TESTING=OFF ` - -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF ` - -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" - cmake --build build --target install - git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader - cd OpenCL-ICD-Loader - cmake -B build-arm64-release ` - -A arm64 ` - -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" ` - -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" - cmake --build build-arm64-release --target install --config release - - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - - - name: Build - id: cmake_build - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} - run: | - cmake -S . -B build ${{ matrix.defines }} ` - -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" - cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} - - - name: Add libopenblas.dll - id: add_libopenblas_dll - if: ${{ matrix.build == 'openblas-x64' }} - run: | - cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll - cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt - - - name: Check AVX512F support - id: check_avx512f - if: ${{ matrix.build == 'avx512-x64' }} - continue-on-error: true - run: | - cd build - $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath) - $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim())) - $cl = $(join-path $msvc 'bin\Hostx64\x64\cl.exe') - echo 'int main(void){unsigned int a[4];__cpuid(a,7);return !(a[1]&65536);}' >> avx512f.c - & $cl /O2 /GS- /kernel avx512f.c /link /nodefaultlib /entry:main - .\avx512f.exe && echo "AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo "AVX512F: NO" - - - name: Test - id: cmake_test - # not all machines have native AVX-512 - if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }} - run: | - cd build - ctest -L main -C Release --verbose --timeout 900 - - - name: Test (Intel SDE) - id: cmake_test_sde - if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation - run: | - curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz" - # for some weird reason windows tar doesn't like sde tar.xz - 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz - 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar - 
$sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe) - cd build - $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1 - & $sde -future -- ctest -L main -C Release --verbose --timeout 900 - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Pack artifacts - id: pack_artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} - run: | - Copy-Item $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\* - - - name: Upload artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip - name: llama-bin-win-${{ matrix.build }}.zip - - ubuntu-latest-cmake-cuda: - runs-on: ubuntu-latest - container: nvidia/cuda:12.6.2-devel-ubuntu24.04 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Install dependencies - env: - DEBIAN_FRONTEND: noninteractive - run: | - apt update - apt install -y cmake build-essential ninja-build libgomp1 git libcurl4-openssl-dev - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-latest-cmake-cuda - evict-old-files: 1d - - - name: Build with CMake - run: | - cmake -S . 
-B build -G Ninja \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_CUDA_ARCHITECTURES=89-real \ - -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DGGML_NATIVE=OFF \ - -DGGML_CUDA=ON - cmake --build build - - windows-2019-cmake-cuda: - runs-on: windows-2019 - - strategy: - matrix: - cuda: ['12.4', '11.7'] - build: ['cuda'] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Install ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ${{ github.job }}-${{ matrix.cuda }}-${{ matrix.build }} - variant: sccache - evict-old-files: 1d - - - name: Install Cuda Toolkit 11.7 - if: ${{ matrix.cuda == '11.7' }} - run: | - mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" - choco install unzip -y - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip" - unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - echo "C:\Program 
Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 - echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 - - - name: Install Cuda Toolkit 12.4 - if: ${{ matrix.cuda == '12.4' }} - run: | - mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" - choco install unzip -y - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip" - unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU 
Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 - echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 - - - name: Install Ninja - id: install_ninja - run: | - choco install ninja - - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - - - name: Build - id: cmake_build - shell: cmd - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} - run: | - call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" - cmake -S . -B build -G "Ninja Multi-Config" ^ - -DLLAMA_BUILD_SERVER=ON ^ - -DGGML_NATIVE=OFF ^ - -DGGML_CUDA=ON ^ - -DGGML_RPC=ON ^ - -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" - set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1 - cmake --build build --config Release -j %NINJA_JOBS% -t ggml - cmake --build build --config Release - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Pack artifacts - id: pack_artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} - run: | - cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\* - - - name: Upload artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip - name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip - - - name: Copy and pack Cuda runtime - if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - run: | - echo "Cuda install location: ${{ env.CUDA_PATH }}" - $dst='.\build\bin\cudart\' - robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll - robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll - 7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\* - - - name: Upload Cuda runtime - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v4 - with: - path: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip - name: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip - - windows-latest-cmake-sycl: - runs-on: 
windows-latest - - defaults: - run: - shell: bash - - env: - WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe - WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel - ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI" - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: windows-latest-cmake-sycl - variant: sccache - evict-old-files: 1d - - - name: Install - run: | - scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL - - # TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args - - - name: Build - id: cmake_build - run: examples/sycl/win-build-sycl.bat - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Build the release package - id: pack_artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - run: | - echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin" - - cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin - - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin - - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin - - cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin - - echo "cp oneAPI running time dll files to ./build/bin done" - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/* - - - name: Upload the release package - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip - name: llama-bin-win-sycl-x64.zip - - windows-latest-cmake-hip: - if: ${{ github.event.inputs.create_release != 'true' }} - runs-on: windows-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Clone rocWMMA repository - id: clone_rocwmma - run: | - git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1 - - - name: Install - id: depends - run: | - $ErrorActionPreference = "Stop" - write-host "Downloading AMD HIP SDK Installer" - Invoke-WebRequest -Uri 
"https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" - write-host "Installing AMD HIP SDK" - Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait - write-host "Completed AMD HIP SDK installation" - - - name: Verify ROCm - id: verify - run: | - & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version - - - name: Install ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ${{ github.job }} - evict-old-files: 1d - - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - - - name: Build - id: cmake_build - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} - run: | - $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) - $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" - cmake -G "Unix Makefiles" -B build -S . ` - -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" ` - -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" ` - -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" ` - -DCMAKE_BUILD_TYPE=Release ` - -DGGML_HIP=ON ` - -DGGML_HIP_ROCWMMA_FATTN=ON ` - -DGGML_RPC=ON ` - -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" - cmake --build build -j ${env:NUMBER_OF_PROCESSORS} - - # TODO: reuse windows-latest-cmake-hip instead of duplicating this job - windows-latest-cmake-hip-release: - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - runs-on: windows-latest - - strategy: - matrix: - gpu_target: [gfx1100, gfx1101, gfx1030] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Clone rocWMMA repository - id: clone_rocwmma - run: | - git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: windows-latest-cmake-hip-release - evict-old-files: 1d - - - name: Install - id: depends - run: | - $ErrorActionPreference = "Stop" - write-host "Downloading AMD HIP SDK Installer" - Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" - write-host "Installing AMD HIP SDK" - Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait - write-host "Completed AMD HIP SDK installation" - - - name: Verify ROCm - id: verify - run: | - & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version - - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - - - name: Build - id: cmake_build - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} - run: | - $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) - $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" - cmake -G "Unix Makefiles" -B build -S . 
` - -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" ` - -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" ` - -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" ` - -DCMAKE_BUILD_TYPE=Release ` - -DAMDGPU_TARGETS=${{ matrix.gpu_target }} ` - -DGGML_HIP_ROCWMMA_FATTN=ON ` - -DGGML_HIP=ON ` - -DGGML_RPC=ON ` - -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" - cmake --build build -j ${env:NUMBER_OF_PROCESSORS} - md "build\bin\rocblas\library\" - cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\" - cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\" - cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\" - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Pack artifacts - id: pack_artifacts - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} - run: | - cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\libcurl-x64.dll - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\* - - - name: Upload artifacts - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip - name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip - - ios-xcode-build: - runs-on: macos-latest - - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Build - id: cmake_build - run: | - sysctl -a - cmake -B build -G Xcode \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_CURL=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_SYSTEM_NAME=iOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO - - - name: xcodebuild for swift package - id: xcodebuild - run: | - ./build-xcframework.sh - - - name: Build Xcode project - run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Pack artifacts - id: pack_artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - run: | - zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework - - - name: Upload artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-xcframework.zip - name: llama-${{ 
steps.tag.outputs.name }}-xcframework - - android-build: - runs-on: ubuntu-latest - - steps: - - name: Clone - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: android-build - evict-old-files: 1d - - - name: Set up JDK - uses: actions/setup-java@v3 - with: - java-version: 17 - distribution: zulu - - - name: Setup Android SDK - uses: android-actions/setup-android@v3 - with: - log-accepted-android-sdk-licenses: false - - - name: Build - run: | - cd examples/llama.android - - ./gradlew build --no-daemon - - release: - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - - runs-on: ubuntu-latest - - needs: - - ubuntu-cpu-cmake - - ubuntu-22-cmake-vulkan - - windows-latest-cmake - - windows-2019-cmake-cuda - - windows-latest-cmake-sycl - - windows-latest-cmake-hip-release - - macOS-latest-cmake-arm64 - - macOS-latest-cmake-x64 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: release - evict-old-files: 1d - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Download artifacts - id: download-artifact - uses: actions/download-artifact@v4 - with: - path: ./artifact - - - name: Move artifacts - id: move_artifacts - run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release - - - name: Create release - id: create_release - uses: ggml-org/action-create-release@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - tag_name: ${{ steps.tag.outputs.name }} - - - name: Upload release - id: upload_release - uses: actions/github-script@v3 - with: - github-token: ${{secrets.GITHUB_TOKEN}} - script: | - const path = require('path'); - const fs = require('fs'); - const release_id = '${{ steps.create_release.outputs.id }}'; - for (let file of await fs.readdirSync('./artifact/release')) { - if (path.extname(file) === '.zip') { - console.log('uploadReleaseAsset', file); - await github.repos.uploadReleaseAsset({ - owner: context.repo.owner, - repo: context.repo.repo, - release_id: release_id, - name: file, - data: await fs.readFileSync(`./artifact/release/${file}`) - }); - } - } - -# ubuntu-latest-gcc: -# runs-on: ubuntu-latest -# -# strategy: -# matrix: -# build: [Debug, Release] -# -# steps: -# - name: Clone -# uses: actions/checkout@v4 -# -# - name: Dependencies -# run: | -# sudo apt-get update -# sudo apt-get install build-essential -# sudo apt-get install cmake -# -# - name: Configure -# run: cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }} -# -# - name: Build -# run: | -# make -# -# ubuntu-latest-clang: -# runs-on: ubuntu-latest -# -# strategy: -# matrix: -# build: [Debug, Release] -# -# steps: -# - name: Clone -# uses: actions/checkout@v4 -# -# - name: Dependencies -# run: | -# sudo apt-get update -# sudo apt-get install build-essential -# sudo apt-get install cmake -# -# - name: Configure -# run: cmake . 
-DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang -# -# - name: Build -# run: | -# make -# -# ubuntu-latest-gcc-sanitized: -# runs-on: ubuntu-latest -# -# strategy: -# matrix: -# sanitizer: [ADDRESS, THREAD, UNDEFINED] -# -# steps: -# - name: Clone -# uses: actions/checkout@v4 -# -# - name: Dependencies -# run: | -# sudo apt-get update -# sudo apt-get install build-essential -# sudo apt-get install cmake -# -# - name: Configure -# run: cmake . -DCMAKE_BUILD_TYPE=Debug -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -# -# - name: Build -# run: | -# make -# -# windows: -# runs-on: windows-latest -# -# strategy: -# matrix: -# build: [Release] -# arch: [Win32, x64] -# include: -# - arch: Win32 -# s2arc: x86 -# - arch: x64 -# s2arc: x64 -# -# steps: -# - name: Clone -# uses: actions/checkout@v4 -# -# - name: Add msbuild to PATH -# uses: microsoft/setup-msbuild@v1 -# -# - name: Configure -# run: > -# cmake -S . -B ./build -A ${{ matrix.arch }} -# -DCMAKE_BUILD_TYPE=${{ matrix.build }} -# -# - name: Build -# run: | -# cd ./build -# msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }} -# -# - name: Upload binaries -# uses: actions/upload-artifact@v4 -# with: -# name: llama-bin-${{ matrix.arch }} -# path: build/bin/${{ matrix.build }} -# -# windows-blas: -# runs-on: windows-latest -# -# strategy: -# matrix: -# build: [Release] -# arch: [Win32, x64] -# blas: [ON] -# include: -# - arch: Win32 -# obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x86.zip -# s2arc: x86 -# - arch: x64 -# obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x64.zip -# s2arc: x64 -# -# steps: -# - name: Clone -# uses: actions/checkout@v4 -# -# - name: Add msbuild to PATH -# uses: microsoft/setup-msbuild@v1 -# -# - name: Fetch OpenBLAS -# if: matrix.blas == 'ON' -# run: | -# C:/msys64/usr/bin/wget.exe -qO blas.zip ${{ matrix.obzip }} -# 7z x blas.zip -oblas -y -# copy blas/include/cblas.h . -# copy blas/include/openblas_config.h . -# echo "blasdir=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV -# -# - name: Configure -# run: > -# cmake -S . -B ./build -A ${{ matrix.arch }} -# -DCMAKE_BUILD_TYPE=${{ matrix.build }} -# -DLLAMA_SUPPORT_OPENBLAS=${{ matrix.blas }} -# -DCMAKE_LIBRARY_PATH="$env:blasdir/lib" -# -# - name: Build -# run: | -# cd ./build -# msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }} -# -# - name: Copy libopenblas.dll -# if: matrix.blas == 'ON' -# run: copy "$env:blasdir/bin/libopenblas.dll" build/bin/${{ matrix.build }} -# -# - name: Upload binaries -# if: matrix.blas == 'ON' -# uses: actions/upload-artifact@v4 -# with: -# name: llama-blas-bin-${{ matrix.arch }} -# path: build/bin/${{ matrix.build }} -# -# emscripten: -# runs-on: ubuntu-latest -# -# strategy: -# matrix: -# build: [Release] -# -# steps: -# - name: Clone -# uses: actions/checkout@v4 -# -# - name: Dependencies -# run: | -# wget -q https://github.com/emscripten-core/emsdk/archive/master.tar.gz -# tar -xvf master.tar.gz -# emsdk-master/emsdk update -# emsdk-master/emsdk install latest -# emsdk-master/emsdk activate latest -# -# - name: Configure -# run: echo "tmp" -# -# - name: Build -# run: | -# pushd emsdk-master -# source ./emsdk_env.sh -# popd -# emcmake cmake . 
-DCMAKE_BUILD_TYPE=${{ matrix.build }} -# make - - openEuler-latest-cmake-cann: - if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }} - defaults: - run: - shell: bash -el {0} - runs-on: ubuntu-24.04-arm - strategy: - matrix: - cann: - - '8.1.RC1.alpha001-910b-openeuler22.03-py3.10' - device: - - 'ascend910b3' - build: - - 'Release' - container: ascendai/cann:${{ matrix.cann }} - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Dependencies - run: | - yum update -y - yum install -y git gcc gcc-c++ make cmake libcurl-devel - - - name: Build - run: | - export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH} - - cmake -S . -B build \ - -DCMAKE_BUILD_TYPE=${{ matrix.build }} \ - -DGGML_CANN=on \ - -DSOC_TYPE=${{ matrix.device }} - cmake --build build -j $(nproc) diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml deleted file mode 100644 index 276a217d45005..0000000000000 --- a/.github/workflows/close-issue.yml +++ /dev/null @@ -1,28 +0,0 @@ -name: Close inactive issues -on: - schedule: - - cron: "42 0 * * *" - -# Fine-grant permission -# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token -permissions: - issues: write - -jobs: - close-issues: - runs-on: ubuntu-latest - permissions: - issues: write - pull-requests: write - steps: - - uses: actions/stale@v5 - with: - exempt-issue-labels: "refactor,help wanted,good first issue,research,bug,roadmap" - days-before-issue-stale: 30 - days-before-issue-close: 14 - stale-issue-label: "stale" - close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale." - days-before-pr-stale: -1 - days-before-pr-close: -1 - operations-per-run: 10000 - repo-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml deleted file mode 100644 index 9eba3f6a42b5e..0000000000000 --- a/.github/workflows/docker.yml +++ /dev/null @@ -1,175 +0,0 @@ -# This workflow uses actions that are not certified by GitHub. -# They are provided by a third-party and are governed by -# separate terms of service, privacy policy, and support -# documentation. - -# GitHub recommends pinning actions to a commit SHA. -# To get a newer version, you will need to update the SHA. -# You can also reference a tag or branch, but the action may change without warning. 
- -name: Publish Docker image - -on: - workflow_dispatch: # allows manual triggering - schedule: - # Rebuild daily rather than on every push because it is expensive - - cron: '12 4 * * *' - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -# Fine-grant permission -# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token -permissions: - packages: write - -jobs: - push_to_registry: - name: Push Docker image to Docker Hub - - runs-on: ubuntu-22.04 - env: - COMMIT_SHA: ${{ github.sha }} - strategy: - fail-fast: false - matrix: - config: - # Multi-stage build - - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false} - - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} - - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: true} - - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} - - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} - # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete - #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: true } - steps: - - name: Check out the repo - uses: actions/checkout@v4 - with: - fetch-depth: 0 # preserve git history, so we can determine the build number - - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - with: - image: tonistiigi/binfmt:qemu-v7.0.0-28 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Log in to Docker Hub - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case - REPO_NAME="${{ github.event.repository.name }}" - - # determine tag name postfix (build number, commit hash) - if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then - TAG_POSTFIX="-b${BUILD_NUMBER}" - else - SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-') - TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}" - fi - # list all tags possible - if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then - TYPE="" - else - TYPE="-${{ matrix.config.tag }}" - fi - PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:" - FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}" - LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}" - SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}" - echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT - echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT - echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT - echo "full_output_tags=$FULLTAGS" # print out for debugging - echo "light_output_tags=$LIGHTTAGS" # print out for debugging - echo 
"server_output_tags=$SERVERTAGS" # print out for debugging - env: - GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }} - GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}' - - - name: Free Disk Space (Ubuntu) - if: ${{ matrix.config.free_disk_space == true }} - uses: ggml-org/free-disk-space@v1.3.1 - with: - # this might remove tools that are actually needed, - # if set to "true" but frees about 6 GB - tool-cache: false - - # all of these default to true, but feel free to set to - # "false" if necessary for your workflow - android: true - dotnet: true - haskell: true - large-packages: true - docker-images: true - swap-storage: true - - - name: Build and push Full Docker image (tagged + versioned) - if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }} - uses: docker/build-push-action@v6 - with: - context: . - push: true - platforms: ${{ matrix.config.platforms }} - # tag list is generated from step above - tags: ${{ steps.tag.outputs.full_output_tags }} - file: ${{ matrix.config.dockerfile }} - target: full - provenance: false - # using github experimental cache - cache-from: type=gha - cache-to: type=gha,mode=max - # return to this if the experimental github cache is having issues - #cache-to: type=local,dest=/tmp/.buildx-cache - #cache-from: type=local,src=/tmp/.buildx-cache - - - name: Build and push Light Docker image (tagged + versioned) - if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }} - uses: docker/build-push-action@v6 - with: - context: . - push: true - platforms: ${{ matrix.config.platforms }} - # tag list is generated from step above - tags: ${{ steps.tag.outputs.light_output_tags }} - file: ${{ matrix.config.dockerfile }} - target: light - provenance: false - # using github experimental cache - cache-from: type=gha - cache-to: type=gha,mode=max - # return to this if the experimental github cache is having issues - #cache-to: type=local,dest=/tmp/.buildx-cache - #cache-from: type=local,src=/tmp/.buildx-cache - - - name: Build and push Server Docker image (tagged + versioned) - if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }} - uses: docker/build-push-action@v6 - with: - context: . 
- push: true - platforms: ${{ matrix.config.platforms }} - # tag list is generated from step above - tags: ${{ steps.tag.outputs.server_output_tags }} - file: ${{ matrix.config.dockerfile }} - target: server - provenance: false - # using github experimental cache - cache-from: type=gha - cache-to: type=gha,mode=max - # return to this if the experimental github cache is having issues - #cache-to: type=local,dest=/tmp/.buildx-cache - #cache-from: type=local,src=/tmp/.buildx-cache diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml deleted file mode 100644 index f02b7c2194bcf..0000000000000 --- a/.github/workflows/editorconfig.yml +++ /dev/null @@ -1,29 +0,0 @@ -name: EditorConfig Checker - -on: - workflow_dispatch: # allows manual triggering - inputs: - create_release: - description: 'Create new release' - required: true - type: boolean - push: - branches: - - master - pull_request: - branches: - - master - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -jobs: - editorconfig: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: editorconfig-checker/action-editorconfig-checker@v2 - with: - version: v3.0.3 - - run: editorconfig-checker diff --git a/.github/workflows/gguf-publish.yml b/.github/workflows/gguf-publish.yml deleted file mode 100644 index 3ca4d30581074..0000000000000 --- a/.github/workflows/gguf-publish.yml +++ /dev/null @@ -1,44 +0,0 @@ -# This workflow will upload a Python Package using Twine when a GGUF release is created -# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries - -# See `gguf-py/README.md` for how to make a release. - -# This workflow uses actions that are not certified by GitHub. -# They are provided by a third-party and are governed by -# separate terms of service, privacy policy, and support -# documentation. 
- -name: Upload Python Package - -on: - workflow_dispatch: - push: - # Pattern matched against refs/tags - tags: - - 'gguf-v*' # Push events to every version tag - - -jobs: - deploy: - - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.9.x' - - name: Install dependencies - run: | - cd gguf-py - python -m pip install poetry - poetry install - - - name: Build package - run: cd gguf-py && poetry build - - name: Publish package - uses: pypa/gh-action-pypi-publish@release/v1 - with: - password: ${{ secrets.PYPI_API_TOKEN }} - packages-dir: gguf-py/dist diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml deleted file mode 100644 index 0b0f300aa402a..0000000000000 --- a/.github/workflows/labeler.yml +++ /dev/null @@ -1,17 +0,0 @@ -name: "Pull Request Labeler" -on: -- pull_request_target - -jobs: - labeler: - permissions: - contents: read - pull-requests: write - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - repository: "ggml-org/llama.cpp" - - uses: actions/labeler@v5 - with: - configuration-path: '.github/labeler.yml' diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml deleted file mode 100644 index 46e80aecd0a0c..0000000000000 --- a/.github/workflows/python-check-requirements.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: Python check requirements.txt - -on: - push: - paths: - - '.github/workflows/python-check-requirements.yml' - - 'scripts/check-requirements.sh' - - 'convert*.py' - - '**/requirements*.txt' - pull_request: - paths: - - '.github/workflows/python-check-requirements.yml' - - 'scripts/check-requirements.sh' - - 'convert*.py' - - '**/requirements*.txt' - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -jobs: - python-check-requirements: - runs-on: ubuntu-latest - name: check-requirements - steps: - - name: Check out source repository - uses: actions/checkout@v4 - - name: Set up Python environment - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - name: Run check-requirements.sh script - run: bash scripts/check-requirements.sh diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml deleted file mode 100644 index ddfdf73b8fce2..0000000000000 --- a/.github/workflows/python-lint.yml +++ /dev/null @@ -1,30 +0,0 @@ -name: flake8 Lint - -on: - push: - branches: - - master - paths: ['.github/workflows/python-lint.yml', '**/*.py'] - pull_request: - types: [opened, synchronize, reopened] - paths: ['.github/workflows/python-lint.yml', '**/*.py'] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -jobs: - flake8-lint: - runs-on: ubuntu-latest - name: Lint - steps: - - name: Check out source repository - uses: actions/checkout@v4 - - name: Set up Python environment - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - name: flake8 Lint - uses: py-actions/flake8@v2 - with: - plugins: "flake8-no-print" diff --git a/.github/workflows/python-type-check.yml b/.github/workflows/python-type-check.yml deleted file mode 100644 index 373bb601020b2..0000000000000 --- a/.github/workflows/python-type-check.yml +++ /dev/null @@ -1,40 +0,0 @@ -name: Python Type-Check - -on: - push: - paths: - - '.github/workflows/python-type-check.yml' - - 'pyrightconfig.json' - - '**.py' - - 
'**/requirements*.txt' - pull_request: - paths: - - '.github/workflows/python-type-check.yml' - - 'pyrightconfig.json' - - '**.py' - - '**/requirements*.txt' - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -jobs: - python-type-check: - runs-on: ubuntu-latest - name: pyright type-check - steps: - - name: Check out source repository - uses: actions/checkout@v4 - - name: Set up Python environment - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - name: Install Python dependencies - # TODO: use a venv - run: pip install -r requirements/requirements-all.txt - - name: Type-check with Pyright - uses: jakebailey/pyright-action@v2 - with: - version: 1.1.382 - level: warning - warnings: true diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml deleted file mode 100644 index 6c9b5132276fe..0000000000000 --- a/.github/workflows/server.yml +++ /dev/null @@ -1,237 +0,0 @@ -# Server build and tests -name: Server - -on: - workflow_dispatch: # allows manual triggering - inputs: - sha: - description: 'Commit SHA1 to build' - required: false - type: string - slow_tests: - description: 'Run slow tests' - required: true - type: boolean - push: - branches: - - master - paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*'] - pull_request: - types: [opened, synchronize, reopened] - paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*'] - -env: - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 - LLAMA_LOG_VERBOSITY: 10 - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - server: - runs-on: ubuntu-latest - - strategy: - matrix: - sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken - build_type: [RelWithDebInfo] - include: - - build_type: Release - sanitizer: "" - fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken - - steps: - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get -y install \ - build-essential \ - xxd \ - git \ - cmake \ - curl \ - wget \ - language-pack-en \ - libcurl4-openssl-dev - - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - - - name: Python setup - id: setup_python - uses: actions/setup-python@v5 - with: - python-version: '3.11' - - - name: Tests dependencies - id: test_dependencies - run: | - pip install -r examples/server/tests/requirements.txt - - # Setup nodejs (to be used for verifying bundled index.html) - - uses: actions/setup-node@v4 - with: - node-version: '22.11.0' - - - name: WebUI - Install dependencies - id: webui_lint - run: | - cd examples/server/webui - npm ci - - - name: WebUI - Check code format - id: webui_format - run: | - git config --global --add safe.directory $(realpath .) - cd examples/server/webui - git status - - npm run format - git status - modified_files="$(git status -s)" - echo "Modified files: ${modified_files}" - if [ -n "${modified_files}" ]; then - echo "Files do not follow coding style. 
To fix: npm run format" - echo "${modified_files}" - exit 1 - fi - - - name: Verify bundled index.html - id: verify_server_index_html - run: | - git config --global --add safe.directory $(realpath .) - cd examples/server/webui - git status - - npm run build - git status - modified_files="$(git status -s)" - echo "Modified files: ${modified_files}" - if [ -n "${modified_files}" ]; then - echo "Repository is dirty or server/webui is not built as expected" - echo "Hint: You may need to follow Web UI build guide in server/README.md" - echo "${modified_files}" - exit 1 - fi - - - name: Build (no OpenMP) - id: cmake_build_no_openmp - if: ${{ matrix.sanitizer == 'THREAD' }} - run: | - cmake -B build \ - -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ - -DGGML_OPENMP=OFF ; - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server - - - name: Build (sanitizers) - id: cmake_build_sanitizers - if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }} - run: | - cmake -B build \ - -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ; - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server - - - name: Build (sanitizers) - id: cmake_build - if: ${{ matrix.sanitizer == '' }} - run: | - cmake -B build \ - -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ; - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server - - - name: Tests - id: server_integration_tests - if: ${{ matrix.sanitizer == '' }} - env: - GITHUB_ACTIONS: "true" - run: | - cd examples/server/tests - ./tests.sh - - - name: Tests (sanitizers) - id: server_integration_tests_sanitizers - if: ${{ matrix.sanitizer != '' }} - run: | - cd examples/server/tests - LLAMA_SANITIZE=1 ./tests.sh - - - name: Slow tests - id: server_integration_tests_slow - if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} - run: | - cd examples/server/tests - SLOW_TESTS=1 ./tests.sh - - - server-windows: - runs-on: windows-2019 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - - - name: Build - id: cmake_build - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} - run: | - cmake -B build -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" - cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server - - - name: Python setup - id: setup_python - uses: actions/setup-python@v5 - with: - python-version: '3.11' - - - name: Tests dependencies - id: test_dependencies - run: | - pip install -r examples/server/tests/requirements.txt - - - name: Copy Libcurl - id: prepare_libcurl - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} - run: | - cp $env:CURL_PATH/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll - - - name: Tests - id: server_integration_tests - if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }} - run: | - cd examples/server/tests - $env:PYTHONIOENCODING = ":replace" - pytest -v -x -m "not slow" - - - name: Slow 
tests - id: server_integration_tests_slow - if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} - run: | - cd examples/server/tests - $env:SLOW_TESTS = "1" - pytest -v -x From b4b90abe597de9bdde0f933f58b3cd25b0a9510d Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 10 Apr 2025 12:27:14 +0200 Subject: [PATCH 006/117] CMakeLists: add the ggml files and include Mesa files --- ggml/src/ggml-remotingfrontend/CMakeLists.txt | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/CMakeLists.txt b/ggml/src/ggml-remotingfrontend/CMakeLists.txt index 4ab2aaa0ac340..63098a431b0a5 100644 --- a/ggml/src/ggml-remotingfrontend/CMakeLists.txt +++ b/ggml/src/ggml-remotingfrontend/CMakeLists.txt @@ -12,9 +12,27 @@ cmake_policy(SET CMP0114 NEW) message(STATUS "Enable API Remoting frontend found") ggml_add_backend_library(ggml-remotingfrontend - ggml-remoting-frontend.cpp + ggml-backend-buffer.cpp + ggml-backend.cpp + ggml-backend-device.cpp + ggml-backend-reg.cpp + ggml-buffer-type.cpp + ggml-host-buffer-type.cpp + virtgpu.cpp ../../include/ggml-remoting-frontend.h ) -#target_link_libraries(ggml-remotingfrontend PRIVATE remotingfrontend) +target_link_libraries(ggml-remotingfrontend PUBLIC drm) +target_include_directories(ggml-remotingfrontend PUBLIC /usr/include/libdrm/) + +set(REMOTING_PROJECT /Users/kevinpouget/remoting) +set(MESA_PROJECT_HOME ${REMOTING_PROJECT}/mesa) +set(MESA_PROJECT_SRC ${MESA_PROJECT_HOME}/src) + +target_include_directories(ggml-remotingfrontend PUBLIC ${MESA_PROJECT_SRC}/virtio/virtio-gpu/) +target_include_directories(ggml-remotingfrontend PUBLIC ${MESA_PROJECT_HOME}/include) target_include_directories(ggml-remotingfrontend PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) + +target_compile_options(ggml-remotingfrontend PRIVATE -std=c++20) + +# dnf install -y libdrm-devel From 53b42a8a0ea52752a8569689722ebcd9adf9d163 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 10 Apr 2025 12:28:03 +0200 Subject: [PATCH 007/117] ggml-*: move the ggml interfaces to a dedicated file --- .../ggml-backend-buffer.cpp | 39 ++ .../ggml-backend-device.cpp | 81 +++ .../ggml-backend-reg.cpp | 69 +++ .../ggml-remotingfrontend/ggml-backend.cpp | 54 ++ .../ggml-buffer-type.cpp | 158 ++++++ .../ggml-host-buffer-type.cpp | 55 ++ .../ggml-remoting-frontend.cpp | 485 +----------------- .../src/ggml-remotingfrontend/ggml-remoting.h | 61 +++ 8 files changed, 523 insertions(+), 479 deletions(-) create mode 100644 ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp create mode 100644 ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp create mode 100644 ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp create mode 100644 ggml/src/ggml-remotingfrontend/ggml-backend.cpp create mode 100644 ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp create mode 100644 ggml/src/ggml-remotingfrontend/ggml-host-buffer-type.cpp create mode 100644 ggml/src/ggml-remotingfrontend/ggml-remoting.h diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp new file mode 100644 index 0000000000000..638203252a86d --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -0,0 +1,39 @@ +#include + +#include "ggml-remoting.h" + +void ggml_remoting_destroy_buffer(remoting_buffer& buf) { + UNUSED(buf); +} + +static void ggml_remoting_buffer_write(remoting_buffer& dst, size_t offset, const void * src, size_t size) { + UNUSED(dst); + 
UNUSED(offset); + UNUSED(src); + UNUSED(size); +} + +static void ggml_remoting_buffer_read(remoting_buffer& src, size_t offset, void * dst, size_t size) { + UNUSED(src); + UNUSED(offset); + UNUSED(dst); + UNUSED(size); +} + +static void ggml_remoting_buffer_copy_async(remoting_context& ctx, remoting_buffer& dst, size_t dst_offset, remoting_buffer& src, size_t src_offset, size_t size) { + UNUSED(ctx); + UNUSED(dst); + UNUSED(dst_offset); + UNUSED(src); + UNUSED(src_offset); + UNUSED(size); +} + +static void * const remoting_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT + +static uint64_t remoting_tensor_offset(const ggml_tensor * tensor) { + if (tensor->view_src) { + return (uint8_t *) tensor->view_src->data - (uint8_t *) remoting_ptr_base; + } + return (uint8_t *) tensor->data - (uint8_t *) remoting_ptr_base; +} diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp new file mode 100644 index 0000000000000..b18ce03a37121 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -0,0 +1,81 @@ +#include "ggml-remoting.h" + +static const char * ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) { + UNUSED(dev); + return "API Remoting"; +} + +static const char * ggml_backend_remoting_device_get_description(ggml_backend_dev_t dev) { + UNUSED(dev); + return "API Remoting device"; +} + +static enum ggml_backend_dev_type ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) { + UNUSED(dev); + return GGML_BACKEND_DEVICE_TYPE_GPU; +} + +static void ggml_backend_remoting_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) { + UNUSED(device); + *total = 1024*1024*1024; + *free = *total; +} + +static bool ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { + UNUSED(dev); + UNUSED(op); + + return true; +} + +static bool ggml_backend_remoting_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + UNUSED(dev); + UNUSED(buft); + return true; +} + +static bool ggml_backend_remoting_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { + const int min_batch_size = 32; + + return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) || + (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID); + + UNUSED(dev); +} + +static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_host_buffer_type(ggml_backend_dev_t dev) { + UNUSED(dev); + return ggml_backend_remoting_host_buffer_type(); +} + + +static void ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { + props->name = ggml_backend_remoting_device_get_name(dev); + props->description = ggml_backend_remoting_device_get_description(dev); + props->type = ggml_backend_remoting_device_get_type(dev); + ggml_backend_remoting_device_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* .async = */ false, + /* .host_buffer = */ true, + /* .buffer_from_host_ptr = */ false, + /* .events = */ false, + }; +} + +const struct ggml_backend_device_i ggml_backend_remoting_device_i = { + /* .get_name = */ ggml_backend_remoting_device_get_name, + /* .get_description = */ ggml_backend_remoting_device_get_description, + /* .get_memory = */ ggml_backend_remoting_device_get_memory, + /* .get_type = */ ggml_backend_remoting_device_get_type, + /* .get_props = */ ggml_backend_remoting_device_get_props, + /* .init_backend = */ ggml_backend_remoting_device_init, + /* 
.get_buffer_type = */ ggml_backend_remoting_device_get_buffer_type, + /* .get_host_buffer_type = */ ggml_backend_remoting_device_get_host_buffer_type, + /* .buffer_from_host_ptr = */ NULL, + /* .supports_op = */ ggml_backend_remoting_device_supports_op, + /* .supports_buft = */ ggml_backend_remoting_device_supports_buft, + /* .offload_op = */ ggml_backend_remoting_device_offload_op, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, +}; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp new file mode 100644 index 0000000000000..00dddf23f2898 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp @@ -0,0 +1,69 @@ +#include +#include + +#include "ggml-remoting.h" + +static int ggml_backend_remoting_get_device_count() { + return 1; +} + +static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) { + UNUSED(reg); + return ggml_backend_remoting_get_device_count(); +} + +static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) { + static std::vector devices; + + static bool initialized = false; + + { + static std::mutex mutex; + std::lock_guard lock(mutex); + if (!initialized) { + + create_virtgpu(); + + for (size_t i = 0; i < ggml_backend_remoting_reg_get_device_count(reg); i++) { + ggml_backend_remoting_device_context * ctx = new ggml_backend_remoting_device_context; + char desc[256] = "API Remoting device"; + + ctx->device = i; + ctx->name = GGML_REMOTING_NAME + std::to_string(i); + ctx->description = desc; + devices.push_back(new ggml_backend_device { + /* .iface = */ ggml_backend_remoting_device_i, + /* .reg = */ reg, + /* .context = */ ctx, + }); + } + initialized = true; + } + } + + GGML_ASSERT(device < devices.size()); + return devices[device]; +} + +static const char * ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) { + UNUSED(reg); + return GGML_REMOTING_NAME; +} + +static const struct ggml_backend_reg_i ggml_backend_remoting_reg_i = { + /* .get_name = */ ggml_backend_remoting_reg_get_name, + /* .get_device_count = */ ggml_backend_remoting_reg_get_device_count, + /* .get_device = */ ggml_backend_remoting_reg_get_device, + /* .get_proc_address = */ NULL, +}; + +ggml_backend_reg_t ggml_backend_remoting_reg() { + static ggml_backend_reg reg = { + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_remoting_reg_i, + /* .context = */ nullptr, + }; + + RMT_LOG_DEBUG("ggml_backend_remoting_frontend_reg() hello :wave:"); + return ® +} diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp new file mode 100644 index 0000000000000..2618e48929cba --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp @@ -0,0 +1,54 @@ +#include "ggml-remoting.h" + +static const char * ggml_backend_remoting_get_name(ggml_backend_t backend) { + UNUSED(backend); + + return "API Remoting backend"; +} + +static void ggml_backend_remoting_free(ggml_backend_t backend) { + UNUSED(backend); +} + +static ggml_status ggml_backend_remoting_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + UNUSED(backend); + UNUSED(cgraph); + + return GGML_STATUS_SUCCESS; +} + +static ggml_backend_i ggml_backend_remoting_interface = { + /* .get_name = */ ggml_backend_remoting_get_name, + /* .free = */ ggml_backend_remoting_free, + /* .set_tensor_async = */ NULL, // ggml_backend_remoting_set_tensor_async, + /* .get_tensor_async = */ NULL, 
// ggml_backend_remoting_get_tensor_async, + /* .cpy_tensor_async = */ NULL, // ggml_backend_remoting_cpy_tensor_async, + /* .synchronize = */ NULL, // ggml_backend_remoting_synchronize, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_remoting_graph_compute, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, +}; + +static ggml_guid_t ggml_backend_remoting_guid() { + static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b }; + return &guid; +} + + +ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params) { + UNUSED(params); + ggml_backend_remoting_device_context * ctx = (ggml_backend_remoting_device_context *)dev->context; + + ggml_backend_t remoting_backend = new ggml_backend { + /* .guid = */ ggml_backend_remoting_guid(), + /* .interface = */ ggml_backend_remoting_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_remoting_reg(), ctx->device), + /* .context = */ ctx, + }; + + return remoting_backend; +} diff --git a/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp new file mode 100644 index 0000000000000..3d882110b9962 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp @@ -0,0 +1,158 @@ +#include "ggml-remoting.h" + +extern ggml_backend_buffer_i ggml_backend_remoting_buffer_interface; + +struct ggml_backend_remoting_buffer_type_context { + std::string name; +}; + + +static const char * ggml_backend_remoting_buffer_type_name(ggml_backend_buffer_type_t buft) { + UNUSED(buft); + + return "Remoting buffer"; +} + +static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + ggml_backend_remoting_buffer_type_context * ctx = (ggml_backend_remoting_buffer_type_context *) buft->context; + + + return ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, ctx, size); +} + +static size_t ggml_backend_remoting_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + UNUSED(buft); + return 4096; +} + +static size_t ggml_backend_remoting_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + UNUSED(buft); + return 40960; +} + +static size_t ggml_backend_remoting_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { + UNUSED(buft); + UNUSED(tensor); + return ggml_nbytes(tensor); +} + +static ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = { + /* .get_name = */ ggml_backend_remoting_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_remoting_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_remoting_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_remoting_buffer_type_get_max_size, + /* .get_alloc_size = */ ggml_backend_remoting_buffer_type_get_alloc_size, + /* .is_host = */ NULL, +}; + +ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { + + static struct ggml_backend_buffer_type buft { + /* .iface = */ ggml_backend_remoting_buffer_type_interface, + /* .device = */ dev, + /* .context = */ new ggml_backend_remoting_buffer_type_context{ "device_name"}, + }; + + return & buft; +} + +static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_remoting_buffer_context * ctx = (ggml_backend_remoting_buffer_context 
*)buffer->context; + ggml_remoting_destroy_buffer(ctx->dev_buffer); + delete ctx; +} + +static enum ggml_status ggml_backend_remoting_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + if (tensor->view_src != nullptr) { + GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); + } + return GGML_STATUS_SUCCESS; +} + +static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) { + return (void *) 4096; + + UNUSED(buffer); +} + +static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { + UNUSED(buffer); + UNUSED(tensor); + UNUSED(value); + UNUSED(offset); + UNUSED(size); +} + + +static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +#if 0 + ggml_backend_remoting_buffer_context * buf_ctx = (ggml_backend_remoting_buffer_context *)buffer->context; + remoting_buffer buf = buf_ctx->dev_buffer; + + ggml_remoting_buffer_write(buf, remoting_tensor_offset(tensor) + tensor->view_offs + offset, data, size); +#else + UNUSED(buffer); + UNUSED(tensor); + UNUSED(data); + UNUSED(offset); + UNUSED(size); +#endif +} + +static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { +#if 0 + ggml_backend_remoting_buffer_context * buf_ctx = (ggml_backend_remoting_buffer_context *)buffer->context; + + remoting_buffer buf = buf_ctx->dev_buffer; + + ggml_remoting_buffer_read(buf, remoting_tensor_offset(tensor) + tensor->view_offs + offset, data, size); +#else + UNUSED(buffer); + UNUSED(tensor); + UNUSED(data); + UNUSED(offset); + UNUSED(size); +#endif +} + + +static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { + return true; + + UNUSED(buffer); + UNUSED(src); + UNUSED(dst); +} + +static void ggml_remoting_buffer_memset(remoting_buffer& dst, size_t offset, uint32_t c, size_t size) { + UNUSED(dst); + UNUSED(c); + UNUSED(size); + UNUSED(offset); +} + +static void ggml_remoting_buffer_memset_async(remoting_context& ctx, remoting_buffer& dst, size_t offset, uint32_t c, size_t size) { + UNUSED(ctx); + UNUSED(dst); + UNUSED(c); + UNUSED(size); + UNUSED(offset); +} + +static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_remoting_buffer_context * ctx = (ggml_backend_remoting_buffer_context *)buffer->context; + + ggml_remoting_buffer_memset(ctx->dev_buffer, 0, value, buffer->size); +} + +ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { + /* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer, + /* .get_base = */ ggml_backend_remoting_buffer_get_base, + /* .init_tensor = */ ggml_backend_remoting_buffer_init_tensor, + /* .memset_tensor = */ ggml_backend_remoting_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_remoting_buffer_cpy_tensor, + /* .clear = */ ggml_backend_remoting_buffer_clear, + /* .reset = */ NULL, +}; diff --git a/ggml/src/ggml-remotingfrontend/ggml-host-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-host-buffer-type.cpp new file mode 100644 index 0000000000000..b40c72b8d1e8b --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-host-buffer-type.cpp @@ -0,0 +1,55 @@ +#include "ggml-remoting.h" 
+ +// host buffer type + +static const char * ggml_backend_remoting_host_buffer_type_name(ggml_backend_buffer_type_t buft) { + return GGML_REMOTING_NAME "_Host"; + + UNUSED(buft); +} + +static void ggml_backend_remoting_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { +# if 0 + ggml_remoting_host_free(remoting_instance.devices[0], buffer->context); +#endif + UNUSED(buffer); +} + +static ggml_backend_buffer_t ggml_backend_remoting_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + + void *ptr = nullptr; + ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); + buffer->buft = buft; + buffer->iface.free_buffer = ggml_backend_remoting_host_buffer_free_buffer; + + return buffer; + UNUSED(buft); +} + +static size_t ggml_backend_remoting_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + UNUSED(buft); + return 4096; +} + +// Should be changed to return device-specific host buffer type +// but that probably requires changes in llama.cpp +ggml_backend_buffer_type_t ggml_backend_remoting_host_buffer_type() { + static struct ggml_backend_buffer_type ggml_backend_remoting_buffer_type_host = { + /* .iface = */ { + /* .get_name = */ ggml_backend_remoting_host_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_remoting_host_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_remoting_host_buffer_type_get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, + /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_remoting_reg(), 0), + /* .context = */ nullptr, + }; + + // Make sure device 0 is initialized + //ggml_remoting_instance_init(); + //ggml_remoting_get_device(0); + + return &ggml_backend_remoting_buffer_type_host; +} diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.cpp b/ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.cpp index 4c7c1f1dc8f95..87679fe59a8d3 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.cpp @@ -1,499 +1,26 @@ -#include "ggml-remoting-frontend.h" - #include #include #include #include #include #include +#include +#include +#include + +#include "ggml-remoting-frontend.h" +#include "remoting.h" #include "ggml-impl.h" #include "ggml-backend-impl.h" -#define RMT_LOG_DEBUG(msg) std::cerr << msg << std::endl -#define UNUSED GGML_UNUSED int ggml_backend_remoting_get_device_count(); -ggml_backend_buffer_type_t ggml_backend_remoting_host_buffer_type(); - -static void * const remoting_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT - - -struct ggml_backend_remoting_buffer_type_context { - std::string name; -}; - -struct remoting_context_struct { - int i; -}; -typedef std::shared_ptr remoting_context; -typedef std::weak_ptr remoting_context_ref; - -static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) { - UNUSED(reg); - return ggml_backend_remoting_get_device_count(); -} - -static const char * ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) { - UNUSED(reg); - return GGML_REMOTING_NAME; -} - -struct ggml_backend_remoting_device_context { - size_t device; - std::string name; - std::string description; -}; - -static const char * ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) { - UNUSED(dev); - return "API Remoting"; -} -static const char * 
ggml_backend_remoting_device_get_description(ggml_backend_dev_t dev) { - UNUSED(dev); - return "API Remoting device"; -} -static enum ggml_backend_dev_type ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) { - UNUSED(dev); - return GGML_BACKEND_DEVICE_TYPE_GPU; -} -static void ggml_backend_remoting_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) { - UNUSED(device); - *total = 1024*1024*1024; - *free = *total; -} struct remoting_device_struct { std::mutex mutex; }; - -struct remoting_device_struct; -typedef std::shared_ptr remoting_device; -typedef std::weak_ptr remoting_device_ref; - -struct remoting_buffer_struct; -typedef std::shared_ptr remoting_buffer; -typedef std::weak_ptr remoting_buffer_ref; - -// vk buffer type -static const char * ggml_backend_remoting_buffer_type_name(ggml_backend_buffer_type_t buft) { - UNUSED(buft); - - return "Remoting buffer"; -} - -static void ggml_remoting_destroy_buffer(remoting_buffer& buf) { - UNUSED(buf); -} - - -static void ggml_remoting_buffer_memset(remoting_buffer& dst, size_t offset, uint32_t c, size_t size) { - UNUSED(dst); - UNUSED(c); - UNUSED(size); - UNUSED(offset); -} - -static void ggml_remoting_buffer_memset_async(remoting_context& ctx, remoting_buffer& dst, size_t offset, uint32_t c, size_t size) { - UNUSED(ctx); - UNUSED(dst); - UNUSED(c); - UNUSED(size); - UNUSED(offset); -} - - -static uint64_t remoting_tensor_offset(const ggml_tensor * tensor) { - if (tensor->view_src) { - return (uint8_t *) tensor->view_src->data - (uint8_t *) remoting_ptr_base; - } - return (uint8_t *) tensor->data - (uint8_t *) remoting_ptr_base; -} - -struct ggml_backend_remoting_buffer_context { - remoting_device_ref device; - remoting_buffer dev_buffer; - std::string name; - - ggml_backend_remoting_buffer_context(remoting_device_ref device, remoting_buffer&& dev_buffer, std::string& name) : - name(name) { - UNUSED(device); - UNUSED(dev_buffer); - } - - ~ggml_backend_remoting_buffer_context() { - ggml_remoting_destroy_buffer(dev_buffer); - } -}; - -static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_backend_remoting_buffer_context * ctx = (ggml_backend_remoting_buffer_context *)buffer->context; - ggml_remoting_destroy_buffer(ctx->dev_buffer); - delete ctx; -} - -static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) { - return (void *) 4096; - - UNUSED(buffer); -} - -static enum ggml_status ggml_backend_remoting_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - if (tensor->view_src != nullptr) { - GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); - } - return GGML_STATUS_SUCCESS; -} - -static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { - UNUSED(buffer); - UNUSED(tensor); - UNUSED(value); - UNUSED(offset); - UNUSED(size); -} - -static void ggml_remoting_buffer_write(remoting_buffer& dst, size_t offset, const void * src, size_t size) { - UNUSED(dst); - UNUSED(offset); - UNUSED(src); - UNUSED(size); -} - -static void ggml_remoting_buffer_read(remoting_buffer& src, size_t offset, void * dst, size_t size) { - UNUSED(src); - UNUSED(offset); - UNUSED(dst); - UNUSED(size); -} - -static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { -#if 0 - ggml_backend_remoting_buffer_context * buf_ctx = (ggml_backend_remoting_buffer_context 
*)buffer->context; - remoting_buffer buf = buf_ctx->dev_buffer; - - ggml_remoting_buffer_write(buf, remoting_tensor_offset(tensor) + tensor->view_offs + offset, data, size); -#else - UNUSED(buffer); - UNUSED(tensor); - UNUSED(data); - UNUSED(offset); - UNUSED(size); -#endif -} - -static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { -#if 0 - ggml_backend_remoting_buffer_context * buf_ctx = (ggml_backend_remoting_buffer_context *)buffer->context; - - remoting_buffer buf = buf_ctx->dev_buffer; - - ggml_remoting_buffer_read(buf, remoting_tensor_offset(tensor) + tensor->view_offs + offset, data, size); -#else - UNUSED(buffer); - UNUSED(tensor); - UNUSED(data); - UNUSED(offset); - UNUSED(size); -#endif -} - -static void ggml_remoting_buffer_copy_async(remoting_context& ctx, remoting_buffer& dst, size_t dst_offset, remoting_buffer& src, size_t src_offset, size_t size) { - UNUSED(ctx); - UNUSED(dst); - UNUSED(dst_offset); - UNUSED(src); - UNUSED(src_offset); - UNUSED(size); -} - -static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { - return true; - - UNUSED(buffer); - UNUSED(src); - UNUSED(dst); -} - -static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - ggml_backend_remoting_buffer_context * ctx = (ggml_backend_remoting_buffer_context *)buffer->context; - - ggml_remoting_buffer_memset(ctx->dev_buffer, 0, value, buffer->size); -} - -static ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { - /* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer, - /* .get_base = */ ggml_backend_remoting_buffer_get_base, - /* .init_tensor = */ ggml_backend_remoting_buffer_init_tensor, - /* .memset_tensor = */ ggml_backend_remoting_buffer_memset_tensor, - /* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor, - /* .cpy_tensor = */ ggml_backend_remoting_buffer_cpy_tensor, - /* .clear = */ ggml_backend_remoting_buffer_clear, - /* .reset = */ NULL, -}; - -static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - ggml_backend_remoting_buffer_type_context * ctx = (ggml_backend_remoting_buffer_type_context *) buft->context; - - - return ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, ctx, size); -} - -static size_t ggml_backend_remoting_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - UNUSED(buft); - return 4096; -} - -static size_t ggml_backend_remoting_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { - UNUSED(buft); - return 40960; -} - -static size_t ggml_backend_remoting_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { - UNUSED(buft); - UNUSED(tensor); - return ggml_nbytes(tensor); -} - -static ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = { - /* .get_name = */ ggml_backend_remoting_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_remoting_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_remoting_buffer_type_get_alignment, - /* .get_max_size = */ ggml_backend_remoting_buffer_type_get_max_size, - /* .get_alloc_size = */ ggml_backend_remoting_buffer_type_get_alloc_size, - /* .is_host = */ NULL, -}; - -static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { - - static 
struct ggml_backend_buffer_type buft { - /* .iface = */ ggml_backend_remoting_buffer_type_interface, - /* .device = */ dev, - /* .context = */ new ggml_backend_remoting_buffer_type_context{ "device_name"}, - }; - - return & buft; -} - -static bool ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { - UNUSED(dev); - UNUSED(op); - - return true; -} - -static bool ggml_backend_remoting_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - UNUSED(dev); - UNUSED(buft); - return true; -} - - -static bool ggml_backend_remoting_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { - const int min_batch_size = 32; - - return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) || - (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID); - - UNUSED(dev); -} - -static const char * ggml_backend_remoting_name(ggml_backend_t backend) { - UNUSED(backend); - - return "API Remoting backend"; -} - -static void ggml_backend_remoting_free(ggml_backend_t backend) { - UNUSED(backend); -} - -static ggml_status ggml_backend_remoting_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { - UNUSED(backend); - UNUSED(cgraph); - - return GGML_STATUS_SUCCESS; -} - -static ggml_backend_i ggml_backend_remoting_interface = { - /* .get_name = */ ggml_backend_remoting_name, - /* .free = */ ggml_backend_remoting_free, - /* .set_tensor_async = */ NULL, // ggml_backend_remoting_set_tensor_async, - /* .get_tensor_async = */ NULL, // ggml_backend_remoting_get_tensor_async, - /* .cpy_tensor_async = */ NULL, // ggml_backend_remoting_cpy_tensor_async, - /* .synchronize = */ NULL, // ggml_backend_remoting_synchronize, - /* .graph_plan_create = */ NULL, - /* .graph_plan_free = */ NULL, - /* .graph_plan_update = */ NULL, - /* .graph_plan_compute = */ NULL, - /* .graph_compute = */ ggml_backend_remoting_graph_compute, - /* .event_record = */ NULL, - /* .event_wait = */ NULL, -}; - -static void ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { - props->name = ggml_backend_remoting_device_get_name(dev); - props->description = ggml_backend_remoting_device_get_description(dev); - props->type = ggml_backend_remoting_device_get_type(dev); - ggml_backend_remoting_device_get_memory(dev, &props->memory_free, &props->memory_total); - props->caps = { - /* .async = */ false, - /* .host_buffer = */ true, - /* .buffer_from_host_ptr = */ false, - /* .events = */ false, - }; -} - -static ggml_guid_t ggml_backend_remoting_guid() { - static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b }; - return &guid; -} - - -static ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params) { - UNUSED(params); - ggml_backend_remoting_device_context * ctx = (ggml_backend_remoting_device_context *)dev->context; - - ggml_backend_t remoting_backend = new ggml_backend { - /* .guid = */ ggml_backend_remoting_guid(), - /* .interface = */ ggml_backend_remoting_interface, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_remoting_reg(), ctx->device), - /* .context = */ ctx, - }; - - return remoting_backend; -} - -// host buffer type - -static const char * ggml_backend_remoting_host_buffer_type_name(ggml_backend_buffer_type_t buft) { - return GGML_REMOTING_NAME "_Host"; - - UNUSED(buft); -} - -static void ggml_backend_remoting_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { -# if 0 - 
ggml_remoting_host_free(remoting_instance.devices[0], buffer->context); -#endif - UNUSED(buffer); -} - -static ggml_backend_buffer_t ggml_backend_remoting_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - - void *ptr = nullptr; - ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); - buffer->buft = buft; - buffer->iface.free_buffer = ggml_backend_remoting_host_buffer_free_buffer; - - return buffer; - UNUSED(buft); -} - -static size_t ggml_backend_remoting_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - UNUSED(buft); - return 4096; -} - -// Should be changed to return device-specific host buffer type -// but that probably requires changes in llama.cpp -ggml_backend_buffer_type_t ggml_backend_remoting_host_buffer_type() { - static struct ggml_backend_buffer_type ggml_backend_remoting_buffer_type_host = { - /* .iface = */ { - /* .get_name = */ ggml_backend_remoting_host_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_remoting_host_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_remoting_host_buffer_type_get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX - /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, - /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, - }, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_remoting_reg(), 0), - /* .context = */ nullptr, - }; - - // Make sure device 0 is initialized - //ggml_remoting_instance_init(); - //ggml_remoting_get_device(0); - - return &ggml_backend_remoting_buffer_type_host; -} - -static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_host_buffer_type(ggml_backend_dev_t dev) { - UNUSED(dev); - return ggml_backend_remoting_host_buffer_type(); -} - -static const struct ggml_backend_device_i ggml_backend_remoting_device_i = { - /* .get_name = */ ggml_backend_remoting_device_get_name, - /* .get_description = */ ggml_backend_remoting_device_get_description, - /* .get_memory = */ ggml_backend_remoting_device_get_memory, - /* .get_type = */ ggml_backend_remoting_device_get_type, - /* .get_props = */ ggml_backend_remoting_device_get_props, - /* .init_backend = */ ggml_backend_remoting_device_init, - /* .get_buffer_type = */ ggml_backend_remoting_device_get_buffer_type, - /* .get_host_buffer_type = */ ggml_backend_remoting_device_get_host_buffer_type, - /* .buffer_from_host_ptr = */ NULL, - /* .supports_op = */ ggml_backend_remoting_device_supports_op, - /* .supports_buft = */ ggml_backend_remoting_device_supports_buft, - /* .offload_op = */ ggml_backend_remoting_device_offload_op, - /* .event_new = */ NULL, - /* .event_free = */ NULL, - /* .event_synchronize = */ NULL, -}; - -static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) { - static std::vector devices; - - static bool initialized = false; - - { - static std::mutex mutex; - std::lock_guard lock(mutex); - if (!initialized) { - for (size_t i = 0; i < ggml_backend_remoting_reg_get_device_count(reg); i++) { - ggml_backend_remoting_device_context * ctx = new ggml_backend_remoting_device_context; - char desc[256] = "API Remoting device"; - - ctx->device = i; - ctx->name = GGML_REMOTING_NAME + std::to_string(i); - ctx->description = desc; - devices.push_back(new ggml_backend_device { - /* .iface = */ ggml_backend_remoting_device_i, - /* .reg = */ reg, - /* .context = */ ctx, - }); - } - initialized = true; - } - } - - GGML_ASSERT(device < devices.size()); - return devices[device]; -} - 
-int ggml_backend_remoting_get_device_count() { - return 1; -} - -static const struct ggml_backend_reg_i ggml_backend_remoting_reg_i = { - /* .get_name = */ ggml_backend_remoting_reg_get_name, - /* .get_device_count = */ ggml_backend_remoting_reg_get_device_count, - /* .get_device = */ ggml_backend_remoting_reg_get_device, - /* .get_proc_address = */ NULL, -}; - -ggml_backend_reg_t ggml_backend_remoting_reg() { - static ggml_backend_reg reg = { - /* .api_version = */ GGML_BACKEND_API_VERSION, - /* .iface = */ ggml_backend_remoting_reg_i, - /* .context = */ nullptr, - }; - - RMT_LOG_DEBUG("ggml_backend_remoting_frontend_reg() hello :wave:"); - return ® -} diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h new file mode 100644 index 0000000000000..c6acdf6cfe1c8 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -0,0 +1,61 @@ +#pragma once + +#include +#include + +#include "ggml-remoting-frontend.h" + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" +#include "virtgpu.h" + +#define UNUSED GGML_UNUSED + +#define RMT_LOG_DEBUG(msg) std::cerr << msg << std::endl + +struct ggml_backend_remoting_device_context { + size_t device; + std::string name; + std::string description; +}; + +extern const struct ggml_backend_device_i ggml_backend_remoting_device_i; + +ggml_backend_buffer_type_t ggml_backend_remoting_host_buffer_type(); +ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params); +ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev); +ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params); + +struct remoting_buffer_struct; +typedef std::shared_ptr remoting_buffer; +typedef std::weak_ptr remoting_buffer_ref; + +void ggml_remoting_destroy_buffer(remoting_buffer& buf); + +struct remoting_device_struct; +typedef std::shared_ptr remoting_device; +typedef std::weak_ptr remoting_device_ref; + +struct ggml_backend_remoting_buffer_context { + remoting_device_ref device; + remoting_buffer dev_buffer; + std::string name; + + ggml_backend_remoting_buffer_context(remoting_device_ref device, remoting_buffer&& dev_buffer, std::string& name) : + name(name) { + UNUSED(device); + UNUSED(dev_buffer); + } + + ~ggml_backend_remoting_buffer_context() { + ggml_remoting_destroy_buffer(dev_buffer); + } +}; + + +struct remoting_context_struct { + int i; +}; +typedef std::shared_ptr remoting_context; +typedef std::weak_ptr remoting_context_ref; From 5049b2fe4b0c6d39ad99d677addd5919e79ec7f1 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 10 Apr 2025 12:28:17 +0200 Subject: [PATCH 008/117] run.vulkan.sh: allow running with GDB --- run.vulkan.sh | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/run.vulkan.sh b/run.vulkan.sh index 7f44334290bbf..1cd38ea58ef52 100755 --- a/run.vulkan.sh +++ b/run.vulkan.sh @@ -1 +1,10 @@ -../build.vulkan/bin/llama-run --ngl 99 --verbose ~/models/llama3.2 "say nothing" +#! 
/bin/bash + +if [[ ${1:-} == "gdb" ]]; then + prefix="gdb --args" +else + prefix="" +fi + +export VN_DEBUG=init +$prefix ../build.vulkan/bin/llama-run --ngl 99 --verbose ~/models/llama3.2 "say nothing" From ffa659f27d76d22a4896b33dac3bc7b80dc76b1e Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 10 Apr 2025 12:28:37 +0200 Subject: [PATCH 009/117] virtgpu: start integrating virt-gpu code --- ggml/src/ggml-remotingfrontend/virtgpu.cpp | 330 +++++++++++++++++++++ ggml/src/ggml-remotingfrontend/virtgpu.h | 171 +++++++++++ 2 files changed, 501 insertions(+) create mode 100644 ggml/src/ggml-remotingfrontend/virtgpu.cpp create mode 100644 ggml/src/ggml-remotingfrontend/virtgpu.h diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp new file mode 100644 index 0000000000000..f73be2767527d --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -0,0 +1,330 @@ +#include +#include +#include + +#include "virtgpu.h" + +static inline void +virtgpu_init_shmem_blob_mem(struct virtgpu *gpu) +{ + /* VIRTGPU_BLOB_MEM_GUEST allocates from the guest system memory. They are + * logically contiguous in the guest but are sglists (iovecs) in the host. + * That makes them slower to process in the host. With host process + * isolation, it also becomes impossible for the host to access sglists + * directly. + * + * While there are ideas (and shipped code in some cases) such as creating + * udmabufs from sglists, or having a dedicated guest heap, it seems the + * easiest way is to reuse VIRTGPU_BLOB_MEM_HOST3D. That is, when the + * renderer sees a request to export a blob where + * + * - blob_mem is VIRTGPU_BLOB_MEM_HOST3D + * - blob_flags is VIRTGPU_BLOB_FLAG_USE_MAPPABLE + * - blob_id is 0 + * + * it allocates a host shmem. + * + * supports_blob_id_0 has been enforced by mandated render server config. 
+ */ + assert(gpu->capset.data.supports_blob_id_0); + gpu->shmem_blob_mem = VIRTGPU_BLOB_MEM_HOST3D; +} + +void +create_virtgpu() { + struct virtgpu *gpu = new struct virtgpu(); + + VkResult result = virtgpu_open(gpu); + GGML_ASSERT(result == VK_SUCCESS); + + result = virtgpu_init_params(gpu); + GGML_ASSERT(result == VK_SUCCESS); + + result = virtgpu_init_capset(gpu); + GGML_ASSERT(result == VK_SUCCESS); + + result = virtgpu_init_context(gpu); + GGML_ASSERT(result == VK_SUCCESS); + + virtgpu_init_shmem_blob_mem(gpu); +} + +static VkResult +virtgpu_open(struct virtgpu *gpu) +{ + drmDevicePtr devs[8]; + int count = drmGetDevices2(0, devs, ARRAY_SIZE(devs)); + if (count < 0) { + INFO("failed to enumerate DRM devices"); + return VK_ERROR_INITIALIZATION_FAILED; + } + + VkResult result = VK_ERROR_INITIALIZATION_FAILED; + for (int i = 0; i < count; i++) { + result = virtgpu_open_device(gpu, devs[i]); + if (result == VK_SUCCESS) + break; + } + + drmFreeDevices(devs, count); + + return result; +} + +static VkResult +virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev) +{ + bool supported_bus = false; + + switch (dev->bustype) { + case DRM_BUS_PCI: + if (dev->deviceinfo.pci->vendor_id == VIRTGPU_PCI_VENDOR_ID && + dev->deviceinfo.pci->device_id == VIRTGPU_PCI_DEVICE_ID) + supported_bus = true; + break; + case DRM_BUS_PLATFORM: + supported_bus = true; + break; + default: + break; + } + + if (!supported_bus || !(dev->available_nodes & (1 << DRM_NODE_RENDER))) { + if (VN_DEBUG(INIT)) { + const char *name = "unknown"; + for (uint32_t i = 0; i < DRM_NODE_MAX; i++) { + if (dev->available_nodes & (1 << i)) { + name = dev->nodes[i]; + break; + } + } + vn_log(gpu->instance, "skipping DRM device %s", name); + } + return VK_ERROR_INITIALIZATION_FAILED; + } + + const char *primary_path = dev->nodes[DRM_NODE_PRIMARY]; + const char *node_path = dev->nodes[DRM_NODE_RENDER]; + + int fd = open(node_path, O_RDWR | O_CLOEXEC); + if (fd < 0) { + if (VN_DEBUG(INIT)) + vn_log(gpu->instance, "failed to open %s", node_path); + return VK_ERROR_INITIALIZATION_FAILED; + } + + drmVersionPtr version = drmGetVersion(fd); + if (!version || strcmp(version->name, "virtio_gpu") || + version->version_major != 0) { + if (VN_DEBUG(INIT)) { + if (version) { + vn_log(gpu->instance, "unknown DRM driver %s version %d", + version->name, version->version_major); + } else { + vn_log(gpu->instance, "failed to get DRM driver version"); + } + } + if (version) + drmFreeVersion(version); + close(fd); + return VK_ERROR_INITIALIZATION_FAILED; + } + + gpu->fd = fd; + + struct stat st; + if (stat(primary_path, &st) == 0) { + gpu->has_primary = true; + gpu->primary_major = major(st.st_rdev); + gpu->primary_minor = minor(st.st_rdev); + } else { + gpu->has_primary = false; + gpu->primary_major = 0; + gpu->primary_minor = 0; + } + stat(node_path, &st); + gpu->render_major = major(st.st_rdev); + gpu->render_minor = minor(st.st_rdev); + + gpu->bustype = dev->bustype; + if (dev->bustype == DRM_BUS_PCI) + gpu->pci_bus_info = *dev->businfo.pci; + + drmFreeVersion(version); + + if (VN_DEBUG(INIT)) + vn_log(gpu->instance, "using DRM device %s", node_path); + + return VK_SUCCESS; +} + +void +vn_log(struct remoting_dev_instance *instance, const char *format, ...) 
+{ + if (instance) { + printf(""); + } + + va_list ap; + + va_start(ap, format); + vprintf(format, ap); + va_end(ap); + + /* instance may be NULL or partially initialized */ +} + + + +static VkResult +virtgpu_init_context(struct virtgpu *gpu) +{ + assert(!gpu->capset.version); + const int ret = virtgpu_ioctl_context_init(gpu, gpu->capset.id); + if (ret) { + if (VN_DEBUG(INIT)) { + vn_log(gpu->instance, "failed to initialize context: %s", + strerror(errno)); + } + return VK_ERROR_INITIALIZATION_FAILED; + } + + return VK_SUCCESS; +} + +static VkResult +virtgpu_init_capset(struct virtgpu *gpu) +{ + gpu->capset.id = VIRGL_RENDERER_CAPSET_VENUS; + gpu->capset.version = 0; + + const int ret = + virtgpu_ioctl_get_caps(gpu, gpu->capset.id, gpu->capset.version, + &gpu->capset.data, sizeof(gpu->capset.data)); + if (ret) { + if (VN_DEBUG(INIT)) { + vn_log(gpu->instance, "failed to get venus v%d capset: %s", + gpu->capset.version, strerror(errno)); + } + return VK_ERROR_INITIALIZATION_FAILED; + } + + return VK_SUCCESS; +} + +static VkResult +virtgpu_init_params(struct virtgpu *gpu) +{ + const uint64_t required_params[] = { + VIRTGPU_PARAM_3D_FEATURES, VIRTGPU_PARAM_CAPSET_QUERY_FIX, + VIRTGPU_PARAM_RESOURCE_BLOB, VIRTGPU_PARAM_CONTEXT_INIT, + }; + uint64_t val; + for (uint32_t i = 0; i < ARRAY_SIZE(required_params); i++) { + val = virtgpu_ioctl_getparam(gpu, required_params[i]); + if (!val) { + if (VN_DEBUG(INIT)) { + vn_log(gpu->instance, "required kernel param %d is missing", + (int)required_params[i]); + } + return VK_ERROR_INITIALIZATION_FAILED; + } + } + + val = virtgpu_ioctl_getparam(gpu, VIRTGPU_PARAM_HOST_VISIBLE); + if (val) { + gpu->bo_blob_mem = VIRTGPU_BLOB_MEM_HOST3D; + } else { + val = virtgpu_ioctl_getparam(gpu, VIRTGPU_PARAM_GUEST_VRAM); + if (val) { + gpu->bo_blob_mem = VIRTGPU_BLOB_MEM_GUEST_VRAM; + } + } + + if (!val) { + vn_log(gpu->instance, + "one of required kernel params (%d or %d) is missing", + (int)VIRTGPU_PARAM_HOST_VISIBLE, (int)VIRTGPU_PARAM_GUEST_VRAM); + return VK_ERROR_INITIALIZATION_FAILED; + } + + /* Cross-device feature is optional. It enables sharing dma-bufs + * with other virtio devices, like virtio-wl or virtio-video used + * by ChromeOS VMs. Qemu doesn't support cross-device sharing. 
+ */ + val = virtgpu_ioctl_getparam(gpu, VIRTGPU_PARAM_CROSS_DEVICE); + if (val) + gpu->supports_cross_device = true; + + /* implied by CONTEXT_INIT uapi */ + gpu->max_timeline_count = 64; + + return VK_SUCCESS; +} + + +static int +virtgpu_ioctl(struct virtgpu *gpu, unsigned long request, void *args) +{ + return drmIoctl(gpu->fd, request, args); +} + +static int +virtgpu_ioctl_context_init(struct virtgpu *gpu, + enum virgl_renderer_capset capset_id) +{ + struct drm_virtgpu_context_set_param ctx_set_params[3] = { + { + .param = VIRTGPU_CONTEXT_PARAM_CAPSET_ID, + .value = capset_id, + }, + { + .param = VIRTGPU_CONTEXT_PARAM_NUM_RINGS, + .value = 64, + }, + { + .param = VIRTGPU_CONTEXT_PARAM_POLL_RINGS_MASK, + .value = 0, /* don't generate drm_events on fence signaling */ + }, + }; + + struct drm_virtgpu_context_init args = { + .num_params = ARRAY_SIZE(ctx_set_params), + .pad = 0, + .ctx_set_params = (uintptr_t)&ctx_set_params, + }; + + return virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_CONTEXT_INIT, &args); +} + +static int +virtgpu_ioctl_get_caps(struct virtgpu *gpu, + enum virgl_renderer_capset id, + uint32_t version, + void *capset, + size_t capset_size) +{ + struct drm_virtgpu_get_caps args = { + .cap_set_id = id, + .cap_set_ver = version, + .addr = (uintptr_t)capset, + .size = (__u32) capset_size, + .pad = 0, + }; + + return virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_GET_CAPS, &args); +} + +static uint64_t +virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param) +{ + /* val must be zeroed because kernel only writes the lower 32 bits */ + uint64_t val = 0; + struct drm_virtgpu_getparam args = { + .param = param, + .value = (uintptr_t)&val, + }; + + const int ret = virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_GETPARAM, &args); + return ret ? 0 : val; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.h b/ggml/src/ggml-remotingfrontend/virtgpu.h new file mode 100644 index 0000000000000..618fc5dc6e3b6 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu.h @@ -0,0 +1,171 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ggml-remoting-frontend.h" +#define VIRGL_RENDERER_UNSTABLE_APIS 1 +#include "drm-uapi/virtgpu_drm.h" +#include "virglrenderer_hw.h" +#include "venus_hw.h" + +/* from src/virtio/vulkan/vn_renderer_virtgpu.c */ +#define VIRTGPU_PCI_VENDOR_ID 0x1af4 +#define VIRTGPU_PCI_DEVICE_ID 0x1050 +#define VIRTGPU_BLOB_MEM_GUEST_VRAM 0x0004 +#define VIRTGPU_PARAM_GUEST_VRAM 9 + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) + +#define VN_DEBUG(what) true + +typedef enum VkResult { + VK_SUCCESS = 0, + VK_NOT_READY = 1, + VK_TIMEOUT = 2, + VK_EVENT_SET = 3, + VK_EVENT_RESET = 4, + VK_INCOMPLETE = 5, + VK_ERROR_OUT_OF_HOST_MEMORY = -1, + VK_ERROR_OUT_OF_DEVICE_MEMORY = -2, + VK_ERROR_INITIALIZATION_FAILED = -3, + VK_ERROR_DEVICE_LOST = -4, + VK_ERROR_MEMORY_MAP_FAILED = -5, + VK_ERROR_LAYER_NOT_PRESENT = -6, + VK_ERROR_EXTENSION_NOT_PRESENT = -7, + VK_ERROR_FEATURE_NOT_PRESENT = -8, + VK_ERROR_INCOMPATIBLE_DRIVER = -9, + VK_ERROR_TOO_MANY_OBJECTS = -10, + VK_ERROR_FORMAT_NOT_SUPPORTED = -11, + VK_ERROR_FRAGMENTED_POOL = -12, + VK_ERROR_UNKNOWN = -13, + VK_ERROR_OUT_OF_POOL_MEMORY = -1000069000, + VK_ERROR_INVALID_EXTERNAL_HANDLE = -1000072003, + VK_ERROR_FRAGMENTATION = -1000161000, + VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS = -1000257000, + VK_PIPELINE_COMPILE_REQUIRED = 1000297000, + VK_ERROR_SURFACE_LOST_KHR = -1000000000, + VK_ERROR_NATIVE_WINDOW_IN_USE_KHR = -1000000001, + VK_SUBOPTIMAL_KHR = 1000001003, + 
VK_ERROR_OUT_OF_DATE_KHR = -1000001004, + VK_ERROR_INCOMPATIBLE_DISPLAY_KHR = -1000003001, + VK_ERROR_VALIDATION_FAILED_EXT = -1000011001, + VK_ERROR_INVALID_SHADER_NV = -1000012000, + VK_ERROR_IMAGE_USAGE_NOT_SUPPORTED_KHR = -1000023000, + VK_ERROR_VIDEO_PICTURE_LAYOUT_NOT_SUPPORTED_KHR = -1000023001, + VK_ERROR_VIDEO_PROFILE_OPERATION_NOT_SUPPORTED_KHR = -1000023002, + VK_ERROR_VIDEO_PROFILE_FORMAT_NOT_SUPPORTED_KHR = -1000023003, + VK_ERROR_VIDEO_PROFILE_CODEC_NOT_SUPPORTED_KHR = -1000023004, + VK_ERROR_VIDEO_STD_VERSION_NOT_SUPPORTED_KHR = -1000023005, + VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT = -1000158000, + VK_ERROR_NOT_PERMITTED_KHR = -1000174001, + VK_ERROR_FULL_SCREEN_EXCLUSIVE_MODE_LOST_EXT = -1000255000, + VK_THREAD_IDLE_KHR = 1000268000, + VK_THREAD_DONE_KHR = 1000268001, + VK_OPERATION_DEFERRED_KHR = 1000268002, + VK_OPERATION_NOT_DEFERRED_KHR = 1000268003, + VK_ERROR_INVALID_VIDEO_STD_PARAMETERS_KHR = -1000299000, + VK_ERROR_COMPRESSION_EXHAUSTED_EXT = -1000338000, + VK_INCOMPATIBLE_SHADER_BINARY_EXT = 1000482000, + VK_ERROR_OUT_OF_POOL_MEMORY_KHR = VK_ERROR_OUT_OF_POOL_MEMORY, + VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR = VK_ERROR_INVALID_EXTERNAL_HANDLE, + VK_ERROR_FRAGMENTATION_EXT = VK_ERROR_FRAGMENTATION, + VK_ERROR_NOT_PERMITTED_EXT = VK_ERROR_NOT_PERMITTED_KHR, + VK_ERROR_INVALID_DEVICE_ADDRESS_EXT = VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS, + VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS_KHR = VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS, + VK_PIPELINE_COMPILE_REQUIRED_EXT = VK_PIPELINE_COMPILE_REQUIRED, + VK_ERROR_PIPELINE_COMPILE_REQUIRED_EXT = VK_PIPELINE_COMPILE_REQUIRED, + VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT = VK_INCOMPATIBLE_SHADER_BINARY_EXT, + VK_RESULT_MAX_ENUM = 0x7FFFFFFF +} VkResult; + + +struct remoting_dev_instance { + int yes; +}; + +#define PRINTFLIKE(f, a) __attribute__ ((format(__printf__, f, a))) + +inline void +vn_log(struct remoting_dev_instance *instance, const char *format, ...) + PRINTFLIKE(2, 3); + + +inline void +INFO(const char *format, ...) 
{ + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); +} + + +struct virtgpu { + //struct vn_renderer base; + + struct remoting_dev_instance *instance; + + int fd; + + bool has_primary; + int primary_major; + int primary_minor; + int render_major; + int render_minor; + + int bustype; + drmPciBusInfo pci_bus_info; + + uint32_t max_timeline_count; + + struct { + enum virgl_renderer_capset id; + uint32_t version; + struct virgl_renderer_capset_venus data; + } capset; + + uint32_t shmem_blob_mem; + uint32_t bo_blob_mem; + + /* note that we use gem_handle instead of res_id to index because + * res_id is monotonically increasing by default (see + * virtio_gpu_resource_id_get) + */ + //struct util_sparse_array shmem_array; + // struct util_sparse_array bo_array; + + mtx_t dma_buf_import_mutex; + +// struct vn_renderer_shmem_cache shmem_cache; + + bool supports_cross_device; +}; + + +void create_virtgpu(); +static VkResult virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev); +static VkResult virtgpu_open(struct virtgpu *gpu); + + +static VkResult virtgpu_init_params(struct virtgpu *gpu); +static VkResult virtgpu_init_capset(struct virtgpu *gpu); +static VkResult virtgpu_init_context(struct virtgpu *gpu); + +static int virtgpu_ioctl_context_init(struct virtgpu *gpu, + enum virgl_renderer_capset capset_id); +static int +virtgpu_ioctl_get_caps(struct virtgpu *gpu, + enum virgl_renderer_capset id, + uint32_t version, + void *capset, + size_t capset_size); +static uint64_t virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param); +static void virtgpu_init_renderer_info(struct virtgpu *gpu); From 3ba78a5d3337eb93c0d1500f5bd4e071ea30094e Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 10 Apr 2025 16:49:40 +0200 Subject: [PATCH 010/117] virtgpu: allocate a shared page with the host --- ggml/src/ggml-remotingfrontend/CMakeLists.txt | 2 + .../src/ggml-remotingfrontend/virtgpu-shm.cpp | 107 +++++++++ ggml/src/ggml-remotingfrontend/virtgpu-shm.h | 37 ++++ .../ggml-remotingfrontend/virtgpu-utils.cpp | 186 ++++++++++++++++ .../src/ggml-remotingfrontend/virtgpu-utils.h | 50 +++++ ggml/src/ggml-remotingfrontend/virtgpu.cpp | 35 ++- ggml/src/ggml-remotingfrontend/virtgpu.h | 20 +- .../src/ggml-remotingfrontend/virtgpu_venus.c | 209 ++++++++++++++++++ 8 files changed, 628 insertions(+), 18 deletions(-) create mode 100644 ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp create mode 100644 ggml/src/ggml-remotingfrontend/virtgpu-shm.h create mode 100644 ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp create mode 100644 ggml/src/ggml-remotingfrontend/virtgpu-utils.h create mode 100644 ggml/src/ggml-remotingfrontend/virtgpu_venus.c diff --git a/ggml/src/ggml-remotingfrontend/CMakeLists.txt b/ggml/src/ggml-remotingfrontend/CMakeLists.txt index 63098a431b0a5..778fddd89a164 100644 --- a/ggml/src/ggml-remotingfrontend/CMakeLists.txt +++ b/ggml/src/ggml-remotingfrontend/CMakeLists.txt @@ -19,6 +19,8 @@ ggml_add_backend_library(ggml-remotingfrontend ggml-buffer-type.cpp ggml-host-buffer-type.cpp virtgpu.cpp + virtgpu-shm.cpp + virtgpu-utils.cpp ../../include/ggml-remoting-frontend.h ) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp new file mode 100644 index 0000000000000..f027860407a4e --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp @@ -0,0 +1,107 @@ +#include + +#include "virtgpu-shm.h" + +static uint32_t +virtgpu_ioctl_resource_create_blob(struct 
virtgpu *gpu, + uint32_t blob_mem, + uint32_t blob_flags, + size_t blob_size, + uint64_t blob_id, + uint32_t *res_id) +{ +#ifdef SIMULATE_BO_SIZE_FIX + blob_size = align64(blob_size, 4096); +#endif + + struct drm_virtgpu_resource_create_blob args = { + .blob_mem = blob_mem, + .blob_flags = blob_flags, + .bo_handle = 0, + .res_handle = 0, + .size = blob_size, + .pad = 0, + .cmd_size = 0, + .cmd = 0, + .blob_id = blob_id, + }; + + if (virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_RESOURCE_CREATE_BLOB, &args)) + return 0; + + *res_id = args.res_handle; + return args.bo_handle; +} + +static void +virtgpu_ioctl_gem_close(struct virtgpu *gpu, uint32_t gem_handle) +{ + struct drm_gem_close args = { + .handle = gem_handle, + .pad = 0, + }; + + const int ret = virtgpu_ioctl(gpu, DRM_IOCTL_GEM_CLOSE, &args); + assert(!ret); +} + +static void * +virtgpu_ioctl_map(struct virtgpu *gpu, uint32_t gem_handle, size_t size) +{ + struct drm_virtgpu_map args = { + .offset = 0, + .handle = gem_handle, + .pad = 0, + }; + printf("Say hello world\n"); + if (virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_MAP, &args)) + return NULL; + + void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, gpu->fd, + args.offset); + if (ptr == MAP_FAILED) + return NULL; + + return ptr; +} + +void +virtgpu_shmem_destroy(struct virtgpu *gpu, + struct virtgpu_shmem *shmem) +{ + munmap(shmem->base.mmap_ptr, shmem->base.mmap_size); + virtgpu_ioctl_gem_close(gpu, shmem->gem_handle); +} + +struct vn_renderer_shmem * +virtgpu_shmem_create(struct virtgpu *gpu, size_t size) +{ + size = align64(size, 16384); + + uint32_t res_id; + uint32_t gem_handle = virtgpu_ioctl_resource_create_blob( + gpu, gpu->shmem_blob_mem, VIRTGPU_BLOB_FLAG_USE_MAPPABLE, size, 0, + &res_id); + if (!gem_handle) + return NULL; + + void *ptr = virtgpu_ioctl_map(gpu, gem_handle, size); + if (!ptr) { + virtgpu_ioctl_gem_close(gpu, gem_handle); + return NULL; + } + if (gpu->shmem_array.elem_size == 0) { + INFO("gpu->shmem_array.elem_size == 0 | Not working :/\n"); + assert(false); + } + struct virtgpu_shmem *shmem = (struct virtgpu_shmem *) util_sparse_array_get(&gpu->shmem_array, gem_handle); + + shmem->gem_handle = gem_handle; + shmem->base.res_id = res_id; + shmem->base.mmap_size = size; + shmem->base.mmap_ptr = ptr; + shmem->base.refcount.count = 1; + shmem->base.gem_handle = gem_handle; + + return &shmem->base; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-shm.h b/ggml/src/ggml-remotingfrontend/virtgpu-shm.h new file mode 100644 index 0000000000000..3bdc5ca700f1b --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-shm.h @@ -0,0 +1,37 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "virtgpu.h" +#include "virtgpu-utils.h" + +struct vn_refcount { + int count; //atomic_int +}; + + +struct vn_renderer_shmem { + struct vn_refcount refcount; + + uint32_t res_id; + size_t mmap_size; /* for internal use only (i.e., munmap) */ + void *mmap_ptr; + + struct list_head cache_head; + int64_t cache_timestamp; + + uint32_t gem_handle; +}; + +struct vn_renderer_shmem *virtgpu_shmem_create(struct virtgpu *gpu, size_t size); +void virtgpu_shmem_destroy(struct virtgpu *gpu, struct virtgpu_shmem *shmem); + + +struct virtgpu_shmem { + struct vn_renderer_shmem base; + uint32_t gem_handle; +}; diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp new file mode 100644 index 0000000000000..100f495add1bc --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp @@ -0,0 +1,186 @@ 
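+/*
+ * Lock-free sparse array (same design as Mesa's util_sparse_array): a radix
+ * tree whose nodes are allocated on demand with compare-and-swap, used by
+ * the virtgpu code to map GEM handles to per-buffer bookkeeping structs
+ * without taking a lock.  Leaf nodes are zeroed when allocated, so a slot
+ * returned by util_sparse_array_get() starts out zero-initialized.
+ */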
+#include "virtgpu-utils.h" +#include +#include +#include + +#define NODE_ALLOC_ALIGN 64 +#define NODE_PTR_MASK (~((uintptr_t)NODE_ALLOC_ALIGN - 1)) +#define NODE_LEVEL_MASK ((uintptr_t)NODE_ALLOC_ALIGN - 1) +#define NULL_NODE 0 + +#define os_malloc_aligned(_size, _align) _aligned_malloc(_size, _align) +#define os_free_aligned(_ptr) free(_ptr) +#define p_atomic_cmpxchg(v, old, _new) \ + __sync_val_compare_and_swap((v), (old), (_new)) + +static inline uint64_t +util_logbase2_64(uint64_t n) +{ +#if defined(HAVE___BUILTIN_CLZLL) + return ((sizeof(uint64_t) * 8 - 1) - __builtin_clzll(n | 1)); +#else + uint64_t pos = 0ull; + if (n >= 1ull<<32) { n >>= 32; pos += 32; } + if (n >= 1ull<<16) { n >>= 16; pos += 16; } + if (n >= 1ull<< 8) { n >>= 8; pos += 8; } + if (n >= 1ull<< 4) { n >>= 4; pos += 4; } + if (n >= 1ull<< 2) { n >>= 2; pos += 2; } + if (n >= 1ull<< 1) { pos += 1; } + return pos; +#endif +} + +void +util_sparse_array_init(struct util_sparse_array *arr, + size_t elem_size, size_t node_size) +{ + memset(arr, 0, sizeof(*arr)); + arr->elem_size = elem_size; + arr->node_size_log2 = util_logbase2_64(node_size); + assert(node_size >= 2 && node_size == (1ull << arr->node_size_log2)); +} + +static inline void * +os_malloc_aligned(size_t size, size_t alignment) +{ + void *ptr; + alignment = (alignment + sizeof(void*) - 1) & ~(sizeof(void*) - 1); + if(posix_memalign(&ptr, alignment, size) != 0) + return NULL; + return ptr; +} + +static inline void * +_util_sparse_array_node_data(uintptr_t handle) +{ + return (void *)(handle & NODE_PTR_MASK); +} + +static inline unsigned +_util_sparse_array_node_level(uintptr_t handle) +{ + return handle & NODE_LEVEL_MASK; +} + +static inline void +_util_sparse_array_node_finish(struct util_sparse_array *arr, + uintptr_t node) +{ + if (_util_sparse_array_node_level(node) > 0) { + uintptr_t *children = (uintptr_t *) _util_sparse_array_node_data(node); + size_t node_size = 1ull << arr->node_size_log2; + for (size_t i = 0; i < node_size; i++) { + if (children[i]) + _util_sparse_array_node_finish(arr, children[i]); + } + } + + os_free_aligned(_util_sparse_array_node_data(node)); +} + +static inline uintptr_t +_util_sparse_array_node(void *data, unsigned level) +{ + assert(data != NULL); + assert(((uintptr_t)data & NODE_LEVEL_MASK) == 0); + assert((level & NODE_PTR_MASK) == 0); + return (uintptr_t)data | level; +} + +inline uintptr_t +_util_sparse_array_node_alloc(struct util_sparse_array *arr, + unsigned level) +{ + size_t size; + if (level == 0) { + size = arr->elem_size << arr->node_size_log2; + } else { + size = sizeof(uintptr_t) << arr->node_size_log2; + } + + void *data = os_malloc_aligned(size, NODE_ALLOC_ALIGN); + memset(data, 0, size); + + return _util_sparse_array_node(data, level); +} + +static inline uintptr_t +_util_sparse_array_set_or_free_node(uintptr_t *node_ptr, + uintptr_t cmp_node, + uintptr_t node) +{ + uintptr_t prev_node = p_atomic_cmpxchg(node_ptr, cmp_node, node); + + if (prev_node != cmp_node) { + /* We lost the race. Free this one and return the one that was already + * allocated. 
+ */ + os_free_aligned(_util_sparse_array_node_data(node)); + return prev_node; + } else { + return node; + } +} + +void * +util_sparse_array_get(struct util_sparse_array *arr, uint64_t idx) +{ + const unsigned node_size_log2 = arr->node_size_log2; + uintptr_t root = p_atomic_read(&arr->root); + if (unlikely(!root)) { + unsigned root_level = 0; + uint64_t idx_iter = idx >> node_size_log2; + while (idx_iter) { + idx_iter >>= node_size_log2; + root_level++; + } + uintptr_t new_root = _util_sparse_array_node_alloc(arr, root_level); + root = _util_sparse_array_set_or_free_node(&arr->root, + NULL_NODE, new_root); + } + + while (1) { + unsigned root_level = _util_sparse_array_node_level(root); + uint64_t root_idx = idx >> (root_level * node_size_log2); + if (likely(root_idx < (1ull << node_size_log2))) + break; + + /* In this case, we have a root but its level is low enough that the + * requested index is out-of-bounds. + */ + uintptr_t new_root = _util_sparse_array_node_alloc(arr, root_level + 1); + + uintptr_t *new_root_children = (uintptr_t *) _util_sparse_array_node_data(new_root); + new_root_children[0] = root; + + /* We only add one at a time instead of the whole tree because it's + * easier to ensure correctness of both the tree building and the + * clean-up path. Because we're only adding one node we never have to + * worry about trying to free multiple things without freeing the old + * things. + */ + root = _util_sparse_array_set_or_free_node(&arr->root, root, new_root); + } + + void *node_data = _util_sparse_array_node_data(root); + unsigned node_level = _util_sparse_array_node_level(root); + while (node_level > 0) { + uint64_t child_idx = (idx >> (node_level * node_size_log2)) & + ((1ull << node_size_log2) - 1); + + uintptr_t *children = (uintptr_t *) node_data; + uintptr_t child = p_atomic_read(&children[child_idx]); + + if (unlikely(!child)) { + child = _util_sparse_array_node_alloc(arr, node_level - 1); + child = _util_sparse_array_set_or_free_node(&children[child_idx], + NULL_NODE, child); + } + + node_data = _util_sparse_array_node_data(child); + node_level = _util_sparse_array_node_level(child); + } + + uint64_t elem_idx = idx & ((1ull << node_size_log2) - 1); + return (void *)((char *)node_data + (elem_idx * arr->elem_size)); +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h new file mode 100644 index 0000000000000..b094b7b6347c6 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h @@ -0,0 +1,50 @@ +#pragma once + +#include +#include +#include + +#define unlikely(x) __builtin_expect(!!(x), 0) +#define likely(x) __builtin_expect(!!(x), 1) + +/** Checks is a value is a power of two. Does not handle zero. */ +#define IS_POT(v) (((v) & ((v) - 1)) == 0) + +/** Checks is a value is a power of two. Zero handled. 
*/ +#define IS_POT_NONZERO(v) ((v) != 0 && IS_POT(v)) + +/** Align a value to a power of two */ +#define ALIGN_POT(x, pot_align) (((x) + (pot_align) - 1) & ~((pot_align) - 1)) + +#define p_atomic_read(_v) __atomic_load_n((_v), __ATOMIC_ACQUIRE) + + +static inline bool +util_is_power_of_two_nonzero64(uint64_t v) +{ + return IS_POT_NONZERO(v); +} + +static inline uint64_t +align64(uint64_t value, uint64_t alignment) +{ + assert(util_is_power_of_two_nonzero64(alignment)); + return ALIGN_POT(value, alignment); +} + +struct list_head +{ + struct list_head *prev; + struct list_head *next; +}; + +struct util_sparse_array { + size_t elem_size; + unsigned node_size_log2; + + uintptr_t root; +}; + +void *util_sparse_array_get(struct util_sparse_array *arr, uint64_t idx); +void util_sparse_array_init(struct util_sparse_array *arr, + size_t elem_size, size_t node_size); diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp index f73be2767527d..408b34cba75e2 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include "virtgpu.h" @@ -30,23 +31,42 @@ virtgpu_init_shmem_blob_mem(struct virtgpu *gpu) gpu->shmem_blob_mem = VIRTGPU_BLOB_MEM_HOST3D; } +void breakpoint() { + // break here + INFO("BREAKPOINT HERE"); +} + void create_virtgpu() { struct virtgpu *gpu = new struct virtgpu(); + util_sparse_array_init(&gpu->shmem_array, sizeof(struct virtgpu_shmem), + 1024); + VkResult result = virtgpu_open(gpu); - GGML_ASSERT(result == VK_SUCCESS); + assert(result == VK_SUCCESS); result = virtgpu_init_params(gpu); - GGML_ASSERT(result == VK_SUCCESS); + assert(result == VK_SUCCESS); result = virtgpu_init_capset(gpu); - GGML_ASSERT(result == VK_SUCCESS); + assert(result == VK_SUCCESS); result = virtgpu_init_context(gpu); - GGML_ASSERT(result == VK_SUCCESS); + assert(result == VK_SUCCESS); virtgpu_init_shmem_blob_mem(gpu); + + struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, 16384); + + if (!shmem) { + INFO("failed to enumerate DRM devices"); + assert(false); + } else { + INFO("Created shm at %p", shmem); + } + + breakpoint(); } static VkResult @@ -262,13 +282,6 @@ virtgpu_init_params(struct virtgpu *gpu) return VK_SUCCESS; } - -static int -virtgpu_ioctl(struct virtgpu *gpu, unsigned long request, void *args) -{ - return drmIoctl(gpu->fd, request, args); -} - static int virtgpu_ioctl_context_init(struct virtgpu *gpu, enum virgl_renderer_capset capset_id) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.h b/ggml/src/ggml-remotingfrontend/virtgpu.h index 618fc5dc6e3b6..f7da4feaab08e 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu.h @@ -3,14 +3,17 @@ #include #include #include -#include #include #include #include #include #include -#include "ggml-remoting-frontend.h" +void breakpoint(); + +#include "virtgpu-shm.h" +#include "virtgpu-utils.h" + #define VIRGL_RENDERER_UNSTABLE_APIS 1 #include "drm-uapi/virtgpu_drm.h" #include "virglrenderer_hw.h" @@ -107,10 +110,7 @@ INFO(const char *format, ...) 
{ va_end(argptr); } - struct virtgpu { - //struct vn_renderer base; - struct remoting_dev_instance *instance; int fd; @@ -139,17 +139,23 @@ struct virtgpu { * res_id is monotonically increasing by default (see * virtio_gpu_resource_id_get) */ - //struct util_sparse_array shmem_array; + struct util_sparse_array shmem_array; // struct util_sparse_array bo_array; mtx_t dma_buf_import_mutex; -// struct vn_renderer_shmem_cache shmem_cache; + // struct virtgpu_shmem_cache shmem_cache; bool supports_cross_device; }; +static inline int +virtgpu_ioctl(struct virtgpu *gpu, unsigned long request, void *args) +{ + return drmIoctl(gpu->fd, request, args); +} + void create_virtgpu(); static VkResult virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev); static VkResult virtgpu_open(struct virtgpu *gpu); diff --git a/ggml/src/ggml-remotingfrontend/virtgpu_venus.c b/ggml/src/ggml-remotingfrontend/virtgpu_venus.c new file mode 100644 index 0000000000000..fc401c13d3003 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu_venus.c @@ -0,0 +1,209 @@ +static inline void vn_encode_vkEnumeratePhysicalDevices(struct vn_cs_encoder *enc, VkCommandFlagsEXT cmd_flags, VkInstance instance, uint32_t* pPhysicalDeviceCount, VkPhysicalDevice* pPhysicalDevices) +{ + const VkCommandTypeEXT cmd_type = VK_COMMAND_TYPE_vkEnumeratePhysicalDevices_EXT; + + vn_encode_VkCommandTypeEXT(enc, &cmd_type); + vn_encode_VkFlags(enc, &cmd_flags); + + vn_encode_VkInstance(enc, &instance); + if (vn_encode_simple_pointer(enc, pPhysicalDeviceCount)) + vn_encode_uint32_t(enc, pPhysicalDeviceCount); + if (pPhysicalDevices) { + vn_encode_array_size(enc, (pPhysicalDeviceCount ? *pPhysicalDeviceCount : 0)); + for (uint32_t i = 0; i < (pPhysicalDeviceCount ? *pPhysicalDeviceCount : 0); i++) + vn_encode_VkPhysicalDevice(enc, &pPhysicalDevices[i]); + } else { + vn_encode_array_size(enc, 0); + } +} + +static inline struct vn_cs_encoder * +vn_ring_submit_command_init(struct vn_ring *ring, + struct vn_ring_submit_command *submit, + void *cmd_data, + size_t cmd_size, + size_t reply_size) +{ + submit->buffer = VN_CS_ENCODER_BUFFER_INITIALIZER(cmd_data); + submit->command = VN_CS_ENCODER_INITIALIZER(&submit->buffer, cmd_size); + + submit->reply_size = reply_size; + submit->reply_shmem = NULL; + + submit->ring_seqno_valid = false; + + return &submit->command; +} + +static inline void vn_submit_vkEnumeratePhysicalDevices(struct vn_ring *vn_ring, VkCommandFlagsEXT cmd_flags, VkInstance instance, uint32_t* pPhysicalDeviceCount, VkPhysicalDevice* pPhysicalDevices, struct vn_ring_submit_command *submit) +{ + uint8_t local_cmd_data[VN_SUBMIT_LOCAL_CMD_SIZE]; + void *cmd_data = local_cmd_data; + size_t cmd_size = vn_sizeof_vkEnumeratePhysicalDevices(instance, pPhysicalDeviceCount, pPhysicalDevices); + if (cmd_size > sizeof(local_cmd_data)) { + cmd_data = malloc(cmd_size); + if (!cmd_data) + cmd_size = 0; + } + const size_t reply_size = cmd_flags & VK_COMMAND_GENERATE_REPLY_BIT_EXT ? 
vn_sizeof_vkEnumeratePhysicalDevices_reply(instance, pPhysicalDeviceCount, pPhysicalDevices) : 0; + + struct vn_cs_encoder *enc = vn_ring_submit_command_init(vn_ring, submit, cmd_data, cmd_size, reply_size); + if (cmd_size) { + vn_encode_vkEnumeratePhysicalDevices(enc, cmd_flags, instance, pPhysicalDeviceCount, pPhysicalDevices); + vn_ring_submit_command(vn_ring, submit); + if (cmd_data != local_cmd_data) + free(cmd_data); + } +} + +VkResult vn_call_vkEnumeratePhysicalDevices(struct vn_ring *vn_ring, VkInstance instance, uint32_t* pPhysicalDeviceCount, VkPhysicalDevice* pPhysicalDevices) +{ + VN_TRACE_FUNC(); + + struct vn_ring_submit_command submit; + vn_submit_vkEnumeratePhysicalDevices(vn_ring, VK_COMMAND_GENERATE_REPLY_BIT_EXT, instance, pPhysicalDeviceCount, pPhysicalDevices, &submit); + struct vn_cs_decoder *dec = vn_ring_get_command_reply(vn_ring, &submit); + if (dec) { + const VkResult ret = vn_decode_vkEnumeratePhysicalDevices_reply(dec, instance, pPhysicalDeviceCount, pPhysicalDevices); + vn_ring_free_command_reply(vn_ring, &submit); + return ret; + } else { + return VK_ERROR_OUT_OF_HOST_MEMORY; + } +} + +VkResult +vn_ring_submit_command_simple(struct vn_ring *ring, + const struct vn_cs_encoder *cs) +{ + mtx_lock(&ring->mutex); + VkResult result = vn_ring_submit_locked(ring, cs, NULL, NULL); + mtx_unlock(&ring->mutex); + + return result; +} + +static VkResult +vn_ring_submit_locked(struct vn_ring *ring, + const struct vn_cs_encoder *cs, + struct vn_renderer_shmem *extra_shmem, + uint32_t *ring_seqno) +{ + const bool direct = vn_ring_submission_can_direct(ring, cs); + if (!direct && cs->storage_type == VN_CS_ENCODER_STORAGE_POINTER) { + cs = vn_ring_cs_upload_locked(ring, cs); + if (!cs) + return VK_ERROR_OUT_OF_HOST_MEMORY; + assert(cs->storage_type != VN_CS_ENCODER_STORAGE_POINTER); + } + + struct vn_ring_submission submit; + VkResult result = + vn_ring_submission_prepare(ring, &submit, cs, extra_shmem, direct); + if (result != VK_SUCCESS) + return result; + + uint32_t seqno; + const bool notify = + vn_ring_submit_internal(ring, submit.submit, submit.cs, &seqno); + if (notify) { + uint32_t notify_ring_data[8]; + struct vn_cs_encoder local_enc = VN_CS_ENCODER_INITIALIZER_LOCAL( + notify_ring_data, sizeof(notify_ring_data)); + vn_encode_vkNotifyRingMESA(&local_enc, 0, ring->id, seqno, 0); + vn_renderer_submit_simple(ring->instance->renderer, notify_ring_data, + vn_cs_encoder_get_len(&local_enc)); + } + + vn_ring_submission_cleanup(&submit); + + if (ring_seqno) + *ring_seqno = seqno; + + return VK_SUCCESS; +} + +static VkResult +vn_ring_submission_prepare(struct vn_ring *ring, + struct vn_ring_submission *submit, + const struct vn_cs_encoder *cs, + struct vn_renderer_shmem *extra_shmem, + bool direct) +{ + submit->cs = vn_ring_submission_get_cs(submit, cs, direct); + if (!submit->cs) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + submit->submit = + vn_ring_submission_get_ring_submit(ring, cs, extra_shmem, direct); + if (!submit->submit) { + vn_ring_submission_cleanup(submit); + return VK_ERROR_OUT_OF_HOST_MEMORY; + } + + return VK_SUCCESS; +} + +static bool +vn_ring_submit_internal(struct vn_ring *ring, + struct vn_ring_submit *submit, + const struct vn_cs_encoder *cs, + uint32_t *seqno) +{ + /* write cs to the ring */ + assert(!vn_cs_encoder_is_empty(cs)); + + /* avoid -Wmaybe-unitialized */ + uint32_t cur_seqno = 0; + + for (uint32_t i = 0; i < cs->buffer_count; i++) { + const struct vn_cs_encoder_buffer *buf = &cs->buffers[i]; + cur_seqno = vn_ring_wait_space(ring, 
buf->committed_size); + vn_ring_write_buffer(ring, buf->base, buf->committed_size); + } + + vn_ring_store_tail(ring); + const VkRingStatusFlagsMESA status = vn_ring_load_status(ring); + if (status & VK_RING_STATUS_FATAL_BIT_MESA) { + vn_log(NULL, "vn_ring_submit abort on fatal"); + abort(); + } + + vn_ring_retire_submits(ring, cur_seqno); + + submit->seqno = ring->cur; + list_addtail(&submit->head, &ring->submits); + + *seqno = submit->seqno; + + /* Notify renderer to wake up idle ring if at least VN_RING_IDLE_TIMEOUT_NS + * has passed since the last sent notification to avoid excessive wake up + * calls (non-trivial since submitted via virtio-gpu kernel). + */ + if (status & VK_RING_STATUS_IDLE_BIT_MESA) { + const int64_t now = os_time_get_nano(); + if (os_time_timeout(ring->last_notify, ring->next_notify, now)) { + ring->last_notify = now; + ring->next_notify = now + VN_RING_IDLE_TIMEOUT_NS; + return true; + } + } + return false; +} + +static void +vn_ring_write_buffer(struct vn_ring *ring, const void *data, uint32_t size) +{ + assert(ring->cur + size - vn_ring_load_head(ring) <= ring->buffer_size); + + const uint32_t offset = ring->cur & ring->buffer_mask; + if (offset + size <= ring->buffer_size) { + memcpy(ring->shared.buffer + offset, data, size); + } else { + const uint32_t s = ring->buffer_size - offset; + memcpy(ring->shared.buffer + offset, data, s); + memcpy(ring->shared.buffer, data + s, size - s); + } + + ring->cur += size; +} From a87e39888c4c58e2dc00e721bdb4acea2de20684 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Fri, 11 Apr 2025 09:19:24 +0200 Subject: [PATCH 011/117] run.remoting: cleanup the screen before running --- run.remoting.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run.remoting.sh b/run.remoting.sh index c6fbdaac435a5..b7175a78aab4c 100755 --- a/run.remoting.sh +++ b/run.remoting.sh @@ -1,5 +1,5 @@ #! /bin/bash - +clear if [[ ${1:-} == "gdb" ]]; then prefix="gdb --args" else From 3d7b19d64e12a0cc3c732904671a251aff1f8ac8 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Fri, 11 Apr 2025 09:19:40 +0200 Subject: [PATCH 012/117] Reduce the verbose logging --- src/llama-context.cpp | 4 ++-- src/llama-model-loader.cpp | 5 +++-- src/llama-model.cpp | 5 +++-- src/llama-vocab.cpp | 2 ++ 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 4735e98ea040f..8144ba4ebeae7 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -96,7 +96,7 @@ llama_context::llama_context( cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? 
params.n_batch : params.n_ubatch); const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; - +/* LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max); LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq); @@ -106,7 +106,7 @@ llama_context::llama_context( LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn); LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); - +*/ if (n_ctx_per_seq < hparams.n_ctx_train) { LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n", __func__, n_ctx_per_seq, hparams.n_ctx_train); diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 36f8d1cbf0323..bb8b090950072 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -651,7 +651,7 @@ llama_model_loader::llama_model_loader( } } - LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); + //LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); for (int i = 0; i < n_kv; i++) { const char * name = gguf_get_key(meta.get(), i); @@ -677,7 +677,7 @@ llama_model_loader::llama_model_loader( continue; } - LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); + //LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); } } @@ -1119,6 +1119,7 @@ std::string llama_model_loader::ftype_name() const { } void llama_model_loader::print_info() const { + return; LLAMA_LOG_INFO("%s: file format = %s\n", __func__, llama_file_version_name(fver)); LLAMA_LOG_INFO("%s: file type = %s\n", __func__, llama_model_ftype_name(ftype).c_str()); if (n_bytes < GiB) { diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 9e4166a71c641..a431c81996bc9 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1458,12 +1458,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) { auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev { const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il); if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) { - LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa); + //LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa); return {cpu_dev, &pimpl->cpu_buft_list}; } const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin(); auto * dev = devices.at(layer_gpu); - LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa); + //LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa); return {dev, &pimpl->gpu_buft_list.at(dev)}; }; @@ -4144,6 +4144,7 @@ uint64_t llama_model::n_elements() const { } void llama_model::print_info() const { + return; const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train); auto print_f = [](const std::function & f, uint32_t n) { diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index a9c24e78812ac..f454e2aa895b5 100644 --- a/src/llama-vocab.cpp +++ 
b/src/llama-vocab.cpp @@ -2731,6 +2731,7 @@ int32_t llama_vocab::impl::detokenize( } void llama_vocab::impl::print_info() const { + return; LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, type_name().c_str()); LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, vocab.n_tokens()); LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size()); @@ -3055,6 +3056,7 @@ std::string llama_vocab::detokenize(const std::vector & tokens, boo } void llama_vocab::print_info() const { + return; pimpl->print_info(); } From 4419c955deeb2708e988f93974d67b9b32779f1b Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Fri, 2 May 2025 09:27:46 +0200 Subject: [PATCH 013/117] Trace the executionpath --- ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp index f027860407a4e..bd1568add1752 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp @@ -53,7 +53,7 @@ virtgpu_ioctl_map(struct virtgpu *gpu, uint32_t gem_handle, size_t size) .handle = gem_handle, .pad = 0, }; - printf("Say hello world\n"); + printf("virtgpu_ioctl_map(%ld)\n", size); if (virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_MAP, &args)) return NULL; @@ -61,7 +61,7 @@ virtgpu_ioctl_map(struct virtgpu *gpu, uint32_t gem_handle, size_t size) args.offset); if (ptr == MAP_FAILED) return NULL; - + printf("virtgpu_ioctl_map(%ld) --> %p | %p\n", size, ptr, *(void **)ptr); return ptr; } From a25b672a5b737c43f7bda05865ca9fe560119ab4 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Fri, 2 May 2025 09:28:42 +0200 Subject: [PATCH 014/117] virtgpu: abort early --- ggml/src/ggml-remotingfrontend/virtgpu.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp index 408b34cba75e2..c8be37bc57301 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -3,6 +3,8 @@ #include #include +#include + #include "virtgpu.h" static inline void @@ -31,9 +33,13 @@ virtgpu_init_shmem_blob_mem(struct virtgpu *gpu) gpu->shmem_blob_mem = VIRTGPU_BLOB_MEM_HOST3D; } -void breakpoint() { +void *something = NULL; +void breakpoint () { // break here INFO("BREAKPOINT HERE"); + if (!something) { // avoid the [[noreturn]] detection mechanism + exit(0); + } } void From 5febf22ee49e55a2d56c43c3420e4fbdc231c6da Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Fri, 2 May 2025 09:29:10 +0200 Subject: [PATCH 015/117] virtgpu: add the virtgpu_submit to kick a command on the host --- .../src/ggml-remotingfrontend/virtgpu-utils.h | 12 +++ ggml/src/ggml-remotingfrontend/virtgpu.cpp | 86 +++++++++++++++++++ ggml/src/ggml-remotingfrontend/virtgpu.h | 1 + 3 files changed, 99 insertions(+) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h index b094b7b6347c6..7bea1798f0ebb 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h @@ -3,6 +3,9 @@ #include #include #include +#include +#include +#include #define unlikely(x) __builtin_expect(!!(x), 0) #define likely(x) __builtin_expect(!!(x), 1) @@ -48,3 +51,12 @@ struct util_sparse_array { void *util_sparse_array_get(struct util_sparse_array *arr, uint64_t idx); void util_sparse_array_init(struct util_sparse_array *arr, size_t elem_size, size_t node_size); + +inline 
void +os_time_sleep(int64_t usecs) +{ + struct timespec time; + time.tv_sec = usecs / 1000000; + time.tv_nsec = (usecs % 1000000) * 1000; + while (clock_nanosleep(CLOCK_MONOTONIC, 0, &time, &time) == EINTR); +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp index c8be37bc57301..55722e6eb8fa0 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -72,6 +72,8 @@ create_virtgpu() { INFO("Created shm at %p", shmem); } + virtgpu_submit(gpu, shmem); + breakpoint(); } @@ -347,3 +349,87 @@ virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param) const int ret = virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_GETPARAM, &args); return ret ? 0 : val; } + + + +#define PK_COMMAND_TYPE_pkCreateThread 255 + +static int virtgpu_submit(struct virtgpu *gpu, struct vn_renderer_shmem *shmem) +{ + + /* + * Data passed to the host + */ + int32_t command[3]; + // command identifier + command[0] = PK_COMMAND_TYPE_pkCreateThread; + command[1] = 0; // ? + // arguments + command[2] = shmem->res_id; + + /* + * Reply notification pointer + */ + + volatile std::atomic_uint *atomic_reply_notif = (volatile std::atomic_uint *) shmem->mmap_ptr; + *atomic_reply_notif = 0; + + /* + * Trigger the execbuf ioctl + */ + + struct drm_virtgpu_execbuffer args = { + .flags = VIRTGPU_EXECBUF_RING_IDX, + .size = sizeof(command), + .command = (uintptr_t) &command, + + .bo_handles = 0, + .num_bo_handles = 0, + + .fence_fd = 0, + .ring_idx = 0, + .syncobj_stride = 0, + .num_in_syncobjs = 0, + .num_out_syncobjs = 0, + .in_syncobjs = 0, + .out_syncobjs = 0, + }; + + int ret = drmIoctl(gpu->fd, DRM_IOCTL_VIRTGPU_EXECBUFFER, &args); + + /* + * Wait for the response notification + */ + + int resp = std::atomic_load_explicit(atomic_reply_notif, std::memory_order_acquire); + printf("waiting for the response ... 
| %d | %p\n", resp, (void*) atomic_reply_notif); + + while (std::atomic_load_explicit(atomic_reply_notif, std::memory_order_acquire) == 0) { + int64_t base_sleep_us = 160; + + os_time_sleep(base_sleep_us); + } + printf("got the response!\n"); + /* + * Read the reply + */ + + printf("virtgpu_submit() --> 0x%x\n", ((uint32_t *)shmem->mmap_ptr)[1]); + printf("virtgpu_submit() --> 0x%x\n", ((uint32_t *)shmem->mmap_ptr)[2]); + printf("virtgpu_submit() --> 0x%x\n", ((uint32_t *)shmem->mmap_ptr)[3]); + +#if 0 + VkCommandTypeEXT command_type; + vn_decode_VkCommandTypeEXT(dec, &command_type); + assert(command_type == VK_COMMAND_TYPE_vkEnumerateInstanceVersion_EXT); + VkResult ret; + vn_decode_VkResult(dec, &ret); + if (vn_decode_simple_pointer(dec)) { + vn_decode_uint32_t(dec, pApiVersion); + } else { + pApiVersion = NULL; + } +#endif + + return ret; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.h b/ggml/src/ggml-remotingfrontend/virtgpu.h index f7da4feaab08e..66c40a05b4909 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu.h @@ -175,3 +175,4 @@ virtgpu_ioctl_get_caps(struct virtgpu *gpu, size_t capset_size); static uint64_t virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param); static void virtgpu_init_renderer_info(struct virtgpu *gpu); +static int virtgpu_submit(struct virtgpu *gpu, struct vn_renderer_shmem *shmem); From 2ee2a4d4acf1b02e226e1c2dc01c91ce7860fd8f Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Fri, 2 May 2025 12:09:58 +0200 Subject: [PATCH 016/117] podman_compile.sh: add compile helper --- podman_compile.sh | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100755 podman_compile.sh diff --git a/podman_compile.sh b/podman_compile.sh new file mode 100755 index 0000000000000..47e4baee07037 --- /dev/null +++ b/podman_compile.sh @@ -0,0 +1,34 @@ +#! /bin/bash + + +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace + +opts="" +opts="$opts --device /dev/dri " +echo "Running with the GPU passthrough" + +image=localhost/pytorch:remoting + +what=${1:-} +if [[ -z "$what" ]]; then + what=remoting +fi + +cmd="bash ./build.$what.sh" + +set -x +podman run \ +--name mac_ai_compiling \ +--user root:root \ +--cgroupns host \ +--security-opt label=disable \ +--env HOME="$HOME" \ +-v "$HOME":"$HOME":Z \ +-w "$PWD" \ +-it --rm \ +$opts \ +$image \ +$cmd From 847a2adff36abbc6ecdabeff5ae01145c8a26985 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 5 May 2025 16:07:57 +0200 Subject: [PATCH 017/117] virtgpu: move the logging functions to virtgpu-utils --- .../src/ggml-remotingfrontend/virtgpu-utils.h | 23 +++++++++++++++++++ ggml/src/ggml-remotingfrontend/virtgpu.h | 9 -------- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h index 7bea1798f0ebb..9d1589c9128ab 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h @@ -6,6 +6,9 @@ #include #include #include +#include +#include +#include #define unlikely(x) __builtin_expect(!!(x), 0) #define likely(x) __builtin_expect(!!(x), 1) @@ -21,6 +24,26 @@ #define p_atomic_read(_v) __atomic_load_n((_v), __ATOMIC_ACQUIRE) +inline void +INFO(const char *format, ...) { + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); +} + +inline void +FATAL(const char *format, ...) 
{ + fprintf(stderr, "FATAL: "); + + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); + exit(1); +} static inline bool util_is_power_of_two_nonzero64(uint64_t v) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.h b/ggml/src/ggml-remotingfrontend/virtgpu.h index 66c40a05b4909..03e9b97b84173 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu.h @@ -101,15 +101,6 @@ vn_log(struct remoting_dev_instance *instance, const char *format, ...) PRINTFLIKE(2, 3); -inline void -INFO(const char *format, ...) { - va_list argptr; - va_start(argptr, format); - vfprintf(stderr, format, argptr); - fprintf(stderr, "\n"); - va_end(argptr); -} - struct virtgpu { struct remoting_dev_instance *instance; From 3270cf9c399f1f9dfdf4a0d1bbbb552f9bbb45ea Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 5 May 2025 16:08:31 +0200 Subject: [PATCH 018/117] virtgpu: use venus CS functions --- .../src/ggml-remotingfrontend/virtgpu-types.h | 298 ++++++++++++++++++ ggml/src/ggml-remotingfrontend/virtgpu.cpp | 74 +++-- 2 files changed, 343 insertions(+), 29 deletions(-) create mode 100644 ggml/src/ggml-remotingfrontend/virtgpu-types.h diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-types.h b/ggml/src/ggml-remotingfrontend/virtgpu-types.h new file mode 100644 index 0000000000000..b0802ad634bcb --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-types.h @@ -0,0 +1,298 @@ +#pragma once +#include "virtgpu.h" + +struct vn_cs_encoder { + char* cur; + const char* end; +}; + +struct vn_cs_decoder { + const char* cur; + const char* end; +}; + +/* + * encode peek + */ + +static inline bool +vn_cs_decoder_peek_internal(const struct vn_cs_decoder *dec, + size_t size, + void *val, + size_t val_size) +{ + assert(val_size <= size); + + if (unlikely(size > (size_t) (dec->end - dec->cur))) { + FATAL("DECODER IS FULL :/"); + //vn_cs_decoder_set_fatal(dec); + memset(val, 0, val_size); + return false; + } + + /* we should not rely on the compiler to optimize away memcpy... */ + memcpy(val, dec->cur, val_size); + return true; +} + +static inline void +vn_cs_decoder_peek(const struct vn_cs_decoder *dec, + size_t size, + void *val, + size_t val_size) +{ + vn_cs_decoder_peek_internal(dec, size, val, val_size); +} + +/* + * read/write + */ + +static inline void +vn_cs_decoder_read(struct vn_cs_decoder *dec, + size_t size, + void *val, + size_t val_size) +{ + if (vn_cs_decoder_peek_internal(dec, size, val, val_size)) + dec->cur += size; +} + +static inline void +vn_cs_encoder_write(struct vn_cs_encoder *enc, + size_t size, + const void *val, + size_t val_size) +{ + assert(val_size <= size); + assert(size <= ((size_t) (enc->end - enc->cur))); + + /* we should not rely on the compiler to optimize away memcpy... 
*/ + memcpy(enc->cur, val, val_size); + enc->cur += size; +} + +/* + * encode/decode + */ + +static inline void +vn_decode(struct vn_cs_decoder *dec, size_t size, void *data, size_t data_size) +{ + assert(size % 4 == 0); + vn_cs_decoder_read(dec, size, data, data_size); +} + +static inline void +vn_encode(struct vn_cs_encoder *enc, size_t size, const void *data, size_t data_size) +{ + assert(size % 4 == 0); + /* TODO check if the generated code is optimal */ + vn_cs_encoder_write(enc, size, data, data_size); +} + +/* + * typed encode/decode + */ + +/* uint64_t */ + +static inline size_t +vn_sizeof_uint64_t(const uint64_t *val) +{ + assert(sizeof(*val) == 8); + return 8; +} + +static inline void +vn_encode_uint64_t(struct vn_cs_encoder *enc, const uint64_t *val) +{ + vn_encode(enc, 8, val, sizeof(*val)); +} + +static inline void +vn_decode_uint64_t(struct vn_cs_decoder *dec, uint64_t *val) +{ + vn_decode(dec, 8, val, sizeof(*val)); +} + +static inline size_t +vn_sizeof_uint64_t_array(const uint64_t *val, uint32_t count) +{ + assert(sizeof(*val) == 8); + const size_t size = sizeof(*val) * count; + assert(size >= count); + return size; +} + +static inline void +vn_encode_uint64_t_array(struct vn_cs_encoder *enc, const uint64_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_encode(enc, size, val, size); +} + +static inline void +vn_decode_uint64_t_array(struct vn_cs_decoder *dec, uint64_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_decode(dec, size, val, size); +} + +/* int32_t */ + +static inline size_t +vn_sizeof_int32_t(const int32_t *val) +{ + assert(sizeof(*val) == 4); + return 4; +} + +static inline void +vn_encode_int32_t(struct vn_cs_encoder *enc, const int32_t *val) +{ + vn_encode(enc, 4, val, sizeof(*val)); +} + +static inline void +vn_decode_int32_t(struct vn_cs_decoder *dec, int32_t *val) +{ + vn_decode(dec, 4, val, sizeof(*val)); +} + +static inline size_t +vn_sizeof_int32_t_array(const int32_t *val, uint32_t count) +{ + assert(sizeof(*val) == 4); + const size_t size = sizeof(*val) * count; + assert(size >= count); + return size; +} + +static inline void +vn_encode_int32_t_array(struct vn_cs_encoder *enc, const int32_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_encode(enc, size, val, size); +} + +static inline void +vn_decode_int32_t_array(struct vn_cs_decoder *dec, int32_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_decode(dec, size, val, size); +} + +/* array size (uint64_t) */ + +static inline size_t +vn_sizeof_array_size(uint64_t size) +{ + return vn_sizeof_uint64_t(&size); +} + +static inline void +vn_encode_array_size(struct vn_cs_encoder *enc, uint64_t size) +{ + vn_encode_uint64_t(enc, &size); +} + +static inline uint64_t +vn_decode_array_size(struct vn_cs_decoder *dec, uint64_t expected_size) +{ + uint64_t size; + vn_decode_uint64_t(dec, &size); + if (size != expected_size) { + FATAL("ENCODER IS FULL :/"); + //vn_cs_decoder_set_fatal(dec); + size = 0; + } + return size; +} + +static inline uint64_t +vn_decode_array_size_unchecked(struct vn_cs_decoder *dec) +{ + uint64_t size; + vn_decode_uint64_t(dec, &size); + return size; +} + +static inline uint64_t +vn_peek_array_size(struct vn_cs_decoder *dec) +{ + uint64_t size; + vn_cs_decoder_peek(dec, sizeof(size), &size, sizeof(size)); + return size; +} + +/* non-array pointer */ + +static inline size_t 
+vn_sizeof_simple_pointer(const void *val) +{ + return vn_sizeof_array_size(val ? 1 : 0); +} + +static inline bool +vn_encode_simple_pointer(struct vn_cs_encoder *enc, const void *val) +{ + vn_encode_array_size(enc, val ? 1 : 0); + return val; +} + +static inline bool +vn_decode_simple_pointer(struct vn_cs_decoder *dec) +{ + return vn_decode_array_size_unchecked(dec); +} + +/* uint32_t */ + +static inline size_t +vn_sizeof_uint32_t(const uint32_t *val) +{ + assert(sizeof(*val) == 4); + return 4; +} + +static inline void +vn_encode_uint32_t(struct vn_cs_encoder *enc, const uint32_t *val) +{ + vn_encode(enc, 4, val, sizeof(*val)); +} + +static inline void +vn_decode_uint32_t(struct vn_cs_decoder *dec, uint32_t *val) +{ + vn_decode(dec, 4, val, sizeof(*val)); +} + +static inline size_t +vn_sizeof_uint32_t_array(const uint32_t *val, uint32_t count) +{ + assert(sizeof(*val) == 4); + const size_t size = sizeof(*val) * count; + assert(size >= count); + return size; +} + +static inline void +vn_encode_uint32_t_array(struct vn_cs_encoder *enc, const uint32_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_encode(enc, size, val, size); +} + +static inline void +vn_decode_uint32_t_array(struct vn_cs_decoder *dec, uint32_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_decode(dec, size, val, size); +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp index 55722e6eb8fa0..37bf98a6e8bb5 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -6,6 +6,7 @@ #include #include "virtgpu.h" +#include "virtgpu-types.h" static inline void virtgpu_init_shmem_blob_mem(struct virtgpu *gpu) @@ -358,14 +359,31 @@ static int virtgpu_submit(struct virtgpu *gpu, struct vn_renderer_shmem *shmem) { /* - * Data passed to the host + * Prepare the command encoder buffer */ - int32_t command[3]; - // command identifier - command[0] = PK_COMMAND_TYPE_pkCreateThread; - command[1] = 0; // ? - // arguments - command[2] = shmem->res_id; + + char encoder_buffer[4096]; + + struct vn_cs_encoder _encoder = { + encoder_buffer, + encoder_buffer + sizeof(encoder_buffer), + }; + struct vn_cs_encoder *encoder = &_encoder; + + /* + * Fill the command encoder buffer + */ + + /* VkCommandTypeEXT is int32_t */ + int32_t cmd_type = PK_COMMAND_TYPE_pkCreateThread; + vn_encode_int32_t(encoder, &cmd_type); + int32_t cmd_flags = 0x0; + vn_encode_int32_t(encoder, &cmd_flags); + + uint32_t reply_res_id = shmem->res_id; + vn_encode_uint32_t(encoder, &reply_res_id); + + printf("call pkCreateThread(flags=0x%x, reply_buf=%d)\n", cmd_flags, reply_res_id); /* * Reply notification pointer @@ -380,8 +398,8 @@ static int virtgpu_submit(struct virtgpu *gpu, struct vn_renderer_shmem *shmem) struct drm_virtgpu_execbuffer args = { .flags = VIRTGPU_EXECBUF_RING_IDX, - .size = sizeof(command), - .command = (uintptr_t) &command, + .size = sizeof(encoder_buffer), + .command = (uintptr_t) encoder_buffer, .bo_handles = 0, .num_bo_handles = 0, @@ -401,35 +419,33 @@ static int virtgpu_submit(struct virtgpu *gpu, struct vn_renderer_shmem *shmem) * Wait for the response notification */ - int resp = std::atomic_load_explicit(atomic_reply_notif, std::memory_order_acquire); - printf("waiting for the response ... 
| %d | %p\n", resp, (void*) atomic_reply_notif); - while (std::atomic_load_explicit(atomic_reply_notif, std::memory_order_acquire) == 0) { int64_t base_sleep_us = 160; os_time_sleep(base_sleep_us); } - printf("got the response!\n"); + /* * Read the reply */ - printf("virtgpu_submit() --> 0x%x\n", ((uint32_t *)shmem->mmap_ptr)[1]); - printf("virtgpu_submit() --> 0x%x\n", ((uint32_t *)shmem->mmap_ptr)[2]); - printf("virtgpu_submit() --> 0x%x\n", ((uint32_t *)shmem->mmap_ptr)[3]); - -#if 0 - VkCommandTypeEXT command_type; - vn_decode_VkCommandTypeEXT(dec, &command_type); - assert(command_type == VK_COMMAND_TYPE_vkEnumerateInstanceVersion_EXT); - VkResult ret; - vn_decode_VkResult(dec, &ret); - if (vn_decode_simple_pointer(dec)) { - vn_decode_uint32_t(dec, pApiVersion); - } else { - pApiVersion = NULL; - } -#endif + struct vn_cs_decoder _dec = { + .cur = (char *) shmem->mmap_ptr + sizeof(*atomic_reply_notif), + .end = (char *) shmem->mmap_ptr + shmem->mmap_size, + }; + struct vn_cs_decoder *dec = &_dec; + + uint32_t apiVersion; + vn_decode_uint32_t(dec, &apiVersion); + printf("pkCreateThread() --> 0x%x\n", apiVersion); + vn_decode_uint32_t(dec, &apiVersion); + printf("pkCreateThread() --> 0x%x\n", apiVersion); + vn_decode_uint32_t(dec, &apiVersion); + printf("pkCreateThread() --> 0x%x\n", apiVersion); + + int32_t vk_ret; + vn_decode_int32_t(dec, &vk_ret); + printf("pkCreateThread() --> ret=%d\n", vk_ret); return ret; } From 151c0ae893564b88a512ec21cfc6fd41cc7e3d46 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 6 May 2025 10:02:22 +0200 Subject: [PATCH 019/117] virtgpu: make more generic --- ggml/src/ggml-remotingfrontend/virtgpu.cpp | 63 +++++++++++----------- ggml/src/ggml-remotingfrontend/virtgpu.h | 19 ++++++- 2 files changed, 49 insertions(+), 33 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp index 37bf98a6e8bb5..e251b29577616 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -35,9 +35,9 @@ virtgpu_init_shmem_blob_mem(struct virtgpu *gpu) } void *something = NULL; -void breakpoint () { +void thks_bye () { // break here - INFO("BREAKPOINT HERE"); + INFO("thks bye, stopping early."); if (!something) { // avoid the [[noreturn]] detection mechanism exit(0); } @@ -64,18 +64,17 @@ create_virtgpu() { virtgpu_init_shmem_blob_mem(gpu); - struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, 16384); + gpu->reply_shmem = virtgpu_shmem_create(gpu, 16384); - if (!shmem) { - INFO("failed to enumerate DRM devices"); + if (!gpu->reply_shmem) { + FATAL("%s: failed to create the reply shared memory page :/", __func__); assert(false); - } else { - INFO("Created shm at %p", shmem); } - virtgpu_submit(gpu, shmem); + remote_call(gpu, PK_COMMAND_TYPE_LoadLibrary, 0); + remote_call(gpu, PK_COMMAND_TYPE_SayHello, 0); - breakpoint(); + thks_bye(); } static VkResult @@ -352,10 +351,11 @@ virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param) } - -#define PK_COMMAND_TYPE_pkCreateThread 255 - -static int virtgpu_submit(struct virtgpu *gpu, struct vn_renderer_shmem *shmem) +static int remote_call( + struct virtgpu *gpu, + int32_t cmd_type, + int32_t cmd_flags + ) { /* @@ -375,21 +375,25 @@ static int virtgpu_submit(struct virtgpu *gpu, struct vn_renderer_shmem *shmem) */ /* VkCommandTypeEXT is int32_t */ - int32_t cmd_type = PK_COMMAND_TYPE_pkCreateThread; vn_encode_int32_t(encoder, &cmd_type); - int32_t cmd_flags = 0x0; vn_encode_int32_t(encoder, &cmd_flags); - uint32_t 
reply_res_id = shmem->res_id;
+  if (!gpu->reply_shmem) {
+    FATAL("%s: the reply shmem page can't be null", __func__);
+  }
+
+  uint32_t reply_res_id = gpu->reply_shmem->res_id;
   vn_encode_uint32_t(encoder, &reply_res_id);
 
-  printf("call pkCreateThread(flags=0x%x, reply_buf=%d)\n", cmd_flags, reply_res_id);
+  printf("%s: call %s(flags=0x%x, reply_buf=%d)\n", __func__,
+	 command_name(cmd_type),
+	 cmd_flags, reply_res_id);
 
   /*
    * Reply notification pointer
    */
 
-  volatile std::atomic_uint *atomic_reply_notif = (volatile std::atomic_uint *) shmem->mmap_ptr;
+  volatile std::atomic_uint *atomic_reply_notif = (volatile std::atomic_uint *) gpu->reply_shmem->mmap_ptr;
   *atomic_reply_notif = 0;
 
   /*
@@ -415,6 +419,9 @@ static int remote_call(
 
   int ret = drmIoctl(gpu->fd, DRM_IOCTL_VIRTGPU_EXECBUFFER, &args);
 
+  if (ret != 0) {
+    FATAL("%s: the virtgpu EXECBUFFER ioctl failed (%d) :/ \n", __func__, ret);
+  }
   /*
    * Wait for the response notification
    */
@@ -430,22 +437,16 @@ static int remote_call(
    */
 
   struct vn_cs_decoder _dec = {
-    .cur = (char *) shmem->mmap_ptr + sizeof(*atomic_reply_notif),
-    .end = (char *) shmem->mmap_ptr + shmem->mmap_size,
+    .cur = (char *) gpu->reply_shmem->mmap_ptr + sizeof(*atomic_reply_notif),
+    .end = (char *) gpu->reply_shmem->mmap_ptr + gpu->reply_shmem->mmap_size,
   };
   struct vn_cs_decoder *dec = &_dec;
 
-  uint32_t apiVersion;
-  vn_decode_uint32_t(dec, &apiVersion);
-  printf("pkCreateThread() --> 0x%x\n", apiVersion);
-  vn_decode_uint32_t(dec, &apiVersion);
-  printf("pkCreateThread() --> 0x%x\n", apiVersion);
-  vn_decode_uint32_t(dec, &apiVersion);
-  printf("pkCreateThread() --> 0x%x\n", apiVersion);
+  int32_t rmt_call_ret;
+  vn_decode_int32_t(dec, &rmt_call_ret);
 
-  int32_t vk_ret;
-  vn_decode_int32_t(dec, &vk_ret);
-  printf("pkCreateThread() --> ret=%d\n", vk_ret);
+  printf("%s: call %s() --> %d\n", __func__,
+	 command_name(cmd_type), rmt_call_ret);
 
-  return ret;
+  return rmt_call_ret;
 }
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.h b/ggml/src/ggml-remotingfrontend/virtgpu.h
index 03e9b97b84173..f3249207d85ad 100644
--- a/ggml/src/ggml-remotingfrontend/virtgpu.h
+++ b/ggml/src/ggml-remotingfrontend/virtgpu.h
@@ -9,7 +9,7 @@
 #include 
 #include 
 
-void breakpoint();
+void thks_bye();
 
 #include "virtgpu-shm.h"
 #include "virtgpu-utils.h"
@@ -138,6 +138,9 @@ struct virtgpu {
   // struct virtgpu_shmem_cache shmem_cache;
 
   bool supports_cross_device;
+
+  /* KP */
+  struct vn_renderer_shmem *reply_shmem;
 };
 
 
@@ -166,4 +169,16 @@ virtgpu_ioctl_get_caps(struct virtgpu *gpu,
 		       size_t capset_size);
 static uint64_t virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param);
 static void virtgpu_init_renderer_info(struct virtgpu *gpu);
-static int virtgpu_submit(struct virtgpu *gpu, struct vn_renderer_shmem *shmem);
+static int remote_call(struct virtgpu *gpu, int32_t cmd_type, int32_t cmd_flags);
+
+#define PK_COMMAND_TYPE_LoadLibrary 255
+#define PK_COMMAND_TYPE_SayHello 256
+
+static inline const char *command_name(int32_t type)
+{
+  switch (type) {
+  case PK_COMMAND_TYPE_LoadLibrary: return "LoadLibrary";
+  case PK_COMMAND_TYPE_SayHello: return "SayHello";
+  default: return "unknown";
+  }
+}

From 52d8e4220cf5b2d2489e55b1ba3bf79efd03a063 Mon Sep 17 00:00:00 2001
From: Kevin Pouget 
Date: Tue, 6 May 2025 10:28:28 +0200
Subject: [PATCH 020/117] ggml-remotingfrontend: fix and make more generic

---
 ggml/CMakeLists.txt                                 |  2 +-
 ggml/include/ggml-remoting-frontend.h               |  4 ++--
 ggml/src/ggml-backend-reg.cpp                       | 
2 +- ggml/src/ggml-remotingfrontend/CMakeLists.txt | 10 +--------- ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp | 6 +++--- ggml/src/ggml-remotingfrontend/ggml-backend.cpp | 2 +- .../ggml-remotingfrontend/ggml-host-buffer-type.cpp | 4 ++-- 7 files changed, 11 insertions(+), 19 deletions(-) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 24c47aea122a2..6db2c2ee3f2f5 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -270,7 +270,7 @@ set(GGML_PUBLIC_HEADERS include/ggml-rpc.h include/ggml-sycl.h include/ggml-vulkan.h - include/ggml-remoting-frontend.h + ggml/include/ggml-remoting-frontend.h include/gguf.h) set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") diff --git a/ggml/include/ggml-remoting-frontend.h b/ggml/include/ggml-remoting-frontend.h index c32c283820dea..4c7cd585ea4af 100644 --- a/ggml/include/ggml-remoting-frontend.h +++ b/ggml/include/ggml-remoting-frontend.h @@ -7,9 +7,9 @@ extern "C" { #endif -#define GGML_REMOTING_NAME "RemotingFrontend" +#define GGML_REMOTING_FRONTEND_NAME "RemotingFrontend" -GGML_BACKEND_API ggml_backend_reg_t ggml_backend_remoting_reg(); +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_remoting_frontend_reg(); #ifdef __cplusplus } diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 8ed3c36362bcd..45843e5ad190a 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -177,7 +177,7 @@ struct ggml_backend_registry { register_backend(ggml_backend_vk_reg()); #endif #ifdef GGML_USE_REMOTINGFRONTEND - register_backend(ggml_backend_remoting_reg()); + register_backend(ggml_backend_remoting_frontend_reg()); #endif #ifdef GGML_USE_OPENCL register_backend(ggml_backend_opencl_reg()); diff --git a/ggml/src/ggml-remotingfrontend/CMakeLists.txt b/ggml/src/ggml-remotingfrontend/CMakeLists.txt index 778fddd89a164..678623f972fc1 100644 --- a/ggml/src/ggml-remotingfrontend/CMakeLists.txt +++ b/ggml/src/ggml-remotingfrontend/CMakeLists.txt @@ -1,15 +1,7 @@ cmake_minimum_required(VERSION 3.19) cmake_policy(SET CMP0114 NEW) -# function(detect_host_compiler) -# find_program(HOST_C_COMPILER NAMES gcc clang NO_CMAKE_FIND_ROOT_PATH) -# find_program(HOST_CXX_COMPILER NAMES g++ clang++ NO_CMAKE_FIND_ROOT_PATH) - -# set(HOST_C_COMPILER "${HOST_C_COMPILER}" PARENT_SCOPE) -# set(HOST_CXX_COMPILER "${HOST_CXX_COMPILER}" PARENT_SCOPE) -# endfunction() - -message(STATUS "Enable API Remoting frontend found") +message(STATUS "Enable API Remoting frontend") ggml_add_backend_library(ggml-remotingfrontend ggml-backend-buffer.cpp diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp index 00dddf23f2898..cb77a31a037c8 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp @@ -29,7 +29,7 @@ static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_ char desc[256] = "API Remoting device"; ctx->device = i; - ctx->name = GGML_REMOTING_NAME + std::to_string(i); + ctx->name = GGML_REMOTING_FRONTEND_NAME + std::to_string(i); ctx->description = desc; devices.push_back(new ggml_backend_device { /* .iface = */ ggml_backend_remoting_device_i, @@ -47,7 +47,7 @@ static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_ static const char * ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) { UNUSED(reg); - return GGML_REMOTING_NAME; + return GGML_REMOTING_FRONTEND_NAME; } static const struct ggml_backend_reg_i 
ggml_backend_remoting_reg_i = { @@ -57,7 +57,7 @@ static const struct ggml_backend_reg_i ggml_backend_remoting_reg_i = { /* .get_proc_address = */ NULL, }; -ggml_backend_reg_t ggml_backend_remoting_reg() { +ggml_backend_reg_t ggml_backend_remoting_frontend_reg() { static ggml_backend_reg reg = { /* .api_version = */ GGML_BACKEND_API_VERSION, /* .iface = */ ggml_backend_remoting_reg_i, diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp index 2618e48929cba..aac17a762ff9b 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp @@ -46,7 +46,7 @@ ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const c ggml_backend_t remoting_backend = new ggml_backend { /* .guid = */ ggml_backend_remoting_guid(), /* .interface = */ ggml_backend_remoting_interface, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_remoting_reg(), ctx->device), + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_remoting_frontend_reg(), ctx->device), /* .context = */ ctx, }; diff --git a/ggml/src/ggml-remotingfrontend/ggml-host-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-host-buffer-type.cpp index b40c72b8d1e8b..fbf5569788c40 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-host-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-host-buffer-type.cpp @@ -3,7 +3,7 @@ // host buffer type static const char * ggml_backend_remoting_host_buffer_type_name(ggml_backend_buffer_type_t buft) { - return GGML_REMOTING_NAME "_Host"; + return GGML_REMOTING_FRONTEND_NAME "_Host"; UNUSED(buft); } @@ -43,7 +43,7 @@ ggml_backend_buffer_type_t ggml_backend_remoting_host_buffer_type() { /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, }, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_remoting_reg(), 0), + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_remoting_frontend_reg(), 0), /* .context = */ nullptr, }; From d118515e0600a4cbb38d61cc6201d59e7d3f933f Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 6 May 2025 10:29:30 +0200 Subject: [PATCH 021/117] prepare.backend.sh: helper script --- prepare.backend.sh | 5 +++++ 1 file changed, 5 insertions(+) create mode 100755 prepare.backend.sh diff --git a/prepare.backend.sh b/prepare.backend.sh new file mode 100755 index 0000000000000..a51f2465b6733 --- /dev/null +++ b/prepare.backend.sh @@ -0,0 +1,5 @@ +cmake -S . -B ../build.remoting-backend \ + -DGGML_REMOTINGBACKEND=ON \ + -DGGML_NATIVE=OFF \ + -DCMAKE_BUILD_TYPE=Debug \ + "$@" From a54229d40b8f675f03a639036447f01adf1a2796 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 6 May 2025 10:30:00 +0200 Subject: [PATCH 022/117] build.backend.sh: helper script --- build.backend.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100755 build.backend.sh diff --git a/build.backend.sh b/build.backend.sh new file mode 100755 index 0000000000000..b32c24b9ba035 --- /dev/null +++ b/build.backend.sh @@ -0,0 +1,13 @@ +# force isatty-->true, so that $0 |& head -50 has colors ... +rm -f READY_backend FAILED_backend + +echo "int isatty(int fd) { return 1; }" | gcc -O2 -fpic -shared -ldl -o /tmp/isatty.so -xc - +export LD_PRELOAD=/tmp/isatty.so + +cmake --build ../build.remoting-backend --parallel 8 --target llama-cli "$@" + +if [[ $? 
== 0 ]]; then + touch READY_backend +else + touch FAILED_backend +fi From 78d16d047b985c3062e4fe045b2491bd695b5e0b Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 6 May 2025 10:28:10 +0200 Subject: [PATCH 023/117] build: integrate the remoting-backend skeleton --- CMakePresets.json | 1 + Makefile | 8 ++ ggml/CMakeLists.txt | 2 + ggml/include/ggml-remoting-backend.h | 16 ++++ ggml/src/CMakeLists.txt | 1 + ggml/src/ggml-backend-reg.cpp | 7 ++ ggml/src/ggml-remotingbackend/CMakeLists.txt | 11 +++ .../ggml-remotingbackend/backend-internal.h | 30 +++++++ ggml/src/ggml-remotingbackend/backend.cpp | 78 +++++++++++++++++++ 9 files changed, 154 insertions(+) create mode 100644 ggml/include/ggml-remoting-backend.h create mode 100644 ggml/src/ggml-remotingbackend/CMakeLists.txt create mode 100644 ggml/src/ggml-remotingbackend/backend-internal.h create mode 100644 ggml/src/ggml-remotingbackend/backend.cpp diff --git a/CMakePresets.json b/CMakePresets.json index c5369a47f6bf9..5296aae76e74a 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -31,6 +31,7 @@ { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } }, { "name": "vulkan", "hidden": true, "cacheVariables": { "GGML_VULKAN": "ON" } }, { "name": "remoting_frontend", "hidden": true, "cacheVariables": { "GGML_REMOTING_FRONTEND": "ON" } }, + { "name": "remoting_backend", "hidden": true, "cacheVariables": { "GGML_REMOTING_BACKEND": "ON" } }, { "name": "x64-windows-llvm", "hidden": true, diff --git a/Makefile b/Makefile index ebf9f79ed5598..18d73ae9de685 100644 --- a/Makefile +++ b/Makefile @@ -721,6 +721,11 @@ ifdef GGML_REMOTING_FRONTEND OBJ_GGML_EXT += ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.o endif +ifdef GGML_REMOTING_BACKEND + MK_CPPFLAGS += -DGGML_USE_REMOTINGBACKEND + OBJ_GGML_EXT += ggml/src/ggml-remotingbackend/ggml-remoting-backend.o +endif + ifdef GGML_VULKAN MK_CPPFLAGS += -DGGML_USE_VULKAN MK_LDFLAGS += $(shell pkg-config --libs vulkan) @@ -763,6 +768,9 @@ ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan/ggml-vulkan.cpp ggml/include/ggml-v ggml/src/ggml-remotingfrontend/frontend.o: ggml/src/ggml-remotingfrontend/frontend.cpp $(CXX) $(CXXFLAGS) -c $< -o $@ +ggml/src/ggml-remotingbackend/backend.o: ggml/src/ggml-remotingbackend/backend.cpp + $(CXX) $(CXXFLAGS) -c $< -o $@ + $(_ggml_vk_header): $(_ggml_vk_source) $(_ggml_vk_source): $(_ggml_vk_shader_deps) vulkan-shaders-gen diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 6db2c2ee3f2f5..9d7576c911635 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -180,6 +180,7 @@ option(GGML_VULKAN_PERF "ggml: enable Vulkan perf output" option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF) option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF) option(GGML_REMOTING_FRONTEND "ggml: use the API Remoting frontend" OFF) +option(GGML_REMOTING_BACKEND "ggml: use the API Remoting backend" OFF) option(GGML_KOMPUTE "ggml: use Kompute" OFF) option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT}) option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF) @@ -271,6 +272,7 @@ set(GGML_PUBLIC_HEADERS include/ggml-sycl.h include/ggml-vulkan.h ggml/include/ggml-remoting-frontend.h + ggml/include/ggml-remoting-backend.h include/gguf.h) set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") diff --git a/ggml/include/ggml-remoting-backend.h b/ggml/include/ggml-remoting-backend.h new file mode 100644 index 0000000000000..25a9dc269c957 --- /dev/null +++ b/ggml/include/ggml-remoting-backend.h @@ -0,0 +1,16 
@@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define GGML_REMOTING_BACKEND_NAME "RemotingBackend" + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_remoting_backend_reg(); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 76c3f3d27fc16..63f36e67a00bb 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -310,6 +310,7 @@ ggml_add_backend(RPC) ggml_add_backend(SYCL) ggml_add_backend(Vulkan) ggml_add_backend(RemotingFrontend) +ggml_add_backend(RemotingBackend) ggml_add_backend(OpenCL) foreach (target ggml-base ggml) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 45843e5ad190a..7e6d4f8c36f67 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -49,6 +49,10 @@ #include "ggml-remoting-frontend.h" #endif +#ifdef GGML_USE_REMOTINGBACKEND +#include "ggml-remoting-backend.h" +#endif + #ifdef GGML_USE_OPENCL #include "ggml-opencl.h" #endif @@ -179,6 +183,9 @@ struct ggml_backend_registry { #ifdef GGML_USE_REMOTINGFRONTEND register_backend(ggml_backend_remoting_frontend_reg()); #endif +#ifdef GGML_USE_REMOTINGBACKEND + register_backend(ggml_backend_remoting_backend_reg()); +#endif #ifdef GGML_USE_OPENCL register_backend(ggml_backend_opencl_reg()); #endif diff --git a/ggml/src/ggml-remotingbackend/CMakeLists.txt b/ggml/src/ggml-remotingbackend/CMakeLists.txt new file mode 100644 index 0000000000000..70b8d3a1b7fef --- /dev/null +++ b/ggml/src/ggml-remotingbackend/CMakeLists.txt @@ -0,0 +1,11 @@ +cmake_minimum_required(VERSION 3.19) +cmake_policy(SET CMP0114 NEW) + +message(STATUS "Enable API Remoting backend") + +ggml_add_backend_library(ggml-remotingbackend + backend.cpp + ../../include/ggml-remoting-backend.h + ) + +target_compile_options(ggml-remotingbackend PRIVATE -std=c++20) diff --git a/ggml/src/ggml-remotingbackend/backend-internal.h b/ggml/src/ggml-remotingbackend/backend-internal.h new file mode 100644 index 0000000000000..97e9605b0dadb --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-internal.h @@ -0,0 +1,30 @@ +#include +#include + +static inline void LOG(const char* fmt, ...) { + va_list args; + va_start(args, fmt); + vprintf(fmt, args); + va_end(args); + + printf("\n"); +} + +static inline void FATAL(const char* fmt, ...) 
{ + printf("FATAL: "); + va_list args; + va_start(args, fmt); + vprintf(fmt, args); + va_end(args); + + printf("\n"); + + if (!fmt) + return; // avoid the noreturn attribute + + exit(1); +} + +extern "C" { + void ggml_backend_remoting_backend_say_hello(); +} diff --git a/ggml/src/ggml-remotingbackend/backend.cpp b/ggml/src/ggml-remotingbackend/backend.cpp new file mode 100644 index 0000000000000..ccc3b3a3aa136 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend.cpp @@ -0,0 +1,78 @@ +#include +#include + +#include "ggml-remoting-backend.h" + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" + +#include "backend-internal.h" + +#define UNUSED GGML_UNUSED + +static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) { + UNUSED(reg); + return 0; +} + +static const char * ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) { + UNUSED(reg); + return GGML_REMOTING_BACKEND_NAME; +} + +static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) { + UNUSED(reg); + UNUSED(device); + + return NULL; +} + +static const struct ggml_backend_reg_i ggml_backend_remoting_reg_i = { + /* .get_name = */ ggml_backend_remoting_reg_get_name, + /* .get_device_count = */ ggml_backend_remoting_reg_get_device_count, + /* .get_device = */ ggml_backend_remoting_reg_get_device, + /* .get_proc_address = */ NULL, +}; + +ggml_backend_reg_t ggml_backend_remoting_backend_reg() { + static ggml_backend_reg reg = { + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_remoting_reg_i, + /* .context = */ nullptr, + }; + + LOG("%s, hello :wave:", __func__); + + return ® +} + +typedef ggml_backend_reg_t (*backend_reg_fct_t)(void); + +#define METAL_LIBRARY_PATH "/Users/kevinpouget/remoting/llama_cpp/build.remoting-backend/bin/libggml-metal.dylib" +#define ENTRYPOINT_FCT_NAME "ggml_backend_metal_reg" + +extern "C" { + void ggml_backend_remoting_backend_say_hello() { + LOG("%s: hello :wave: \\o/", __func__); + + void * library_handle = dlopen(METAL_LIBRARY_PATH, RTLD_LAZY); + + if (!library_handle) { + FATAL("Cannot open library: %s\n", dlerror()); + return; + } + + backend_reg_fct_t entrypoint_fct = (backend_reg_fct_t) dlsym(library_handle, ENTRYPOINT_FCT_NAME); + const char* dlsym_error = dlerror(); + if (dlsym_error) { + FATAL("Cannot load symbol: %s\n", dlsym_error); + return; + } + + ggml_backend_reg_t reg = entrypoint_fct(); + LOG("%s: --> %s", __func__, reg->iface.get_name(reg)); + + dlclose(library_handle); + } +} From 022ddceaf7ded722b6c2d3afadf3163abbcb9ff8 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 6 May 2025 15:07:22 +0200 Subject: [PATCH 024/117] remoting: start using shared header files --- .../src/ggml-remotingbackend/shared/api_remoting.h | 13 +++++++++++++ ggml/src/ggml-remotingfrontend/virtgpu.cpp | 8 ++++---- ggml/src/ggml-remotingfrontend/virtgpu.h | 14 ++------------ 3 files changed, 19 insertions(+), 16 deletions(-) create mode 100644 ggml/src/ggml-remotingbackend/shared/api_remoting.h diff --git a/ggml/src/ggml-remotingbackend/shared/api_remoting.h b/ggml/src/ggml-remotingbackend/shared/api_remoting.h new file mode 100644 index 0000000000000..0cac78cccdfda --- /dev/null +++ b/ggml/src/ggml-remotingbackend/shared/api_remoting.h @@ -0,0 +1,13 @@ + +#define VIRGL_VK_COMMAND_TYPE_LoadLibrary 255 +#define VIRGL_VK_COMMAND_TYPE_Forward 256 + + +static inline const char *api_remoting_command_name(int32_t type) +{ + switch (type) { + case VIRGL_VK_COMMAND_TYPE_LoadLibrary: 
return "LoadLibrary"; + case VIRGL_VK_COMMAND_TYPE_Forward: return "Forward"; + default: return "unknown"; + } +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp index e251b29577616..bc20c90cb36c2 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -71,8 +71,8 @@ create_virtgpu() { assert(false); } - remote_call(gpu, PK_COMMAND_TYPE_LoadLibrary, 0); - remote_call(gpu, PK_COMMAND_TYPE_SayHello, 0); + remote_call(gpu, VIRGL_VK_COMMAND_TYPE_LoadLibrary, 0); + remote_call(gpu, VIRGL_VK_COMMAND_TYPE_Forward, 12346); thks_bye(); } @@ -386,7 +386,7 @@ static int remote_call( vn_encode_uint32_t(encoder, &reply_res_id); printf("%s: call %s(flags=0x%x, reply_buf=%d)\n", __func__, - command_name(cmd_type), + api_remoting_command_name(cmd_type), cmd_flags, reply_res_id); /* @@ -446,7 +446,7 @@ static int remote_call( vn_decode_int32_t(dec, &rmt_call_ret); printf("%s: call %s() --> %d\n", __func__, - command_name(cmd_type), rmt_call_ret); + api_remoting_command_name(cmd_type), rmt_call_ret); return rmt_call_ret; } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.h b/ggml/src/ggml-remotingfrontend/virtgpu.h index f3249207d85ad..bfd0dc9c82b15 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu.h @@ -9,6 +9,8 @@ #include #include +#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/api_remoting.h" + void thks_bye(); #include "virtgpu-shm.h" @@ -170,15 +172,3 @@ virtgpu_ioctl_get_caps(struct virtgpu *gpu, static uint64_t virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param); static void virtgpu_init_renderer_info(struct virtgpu *gpu); static int remote_call(struct virtgpu *gpu, int32_t cmd_type, int32_t cmd_flags); - -#define PK_COMMAND_TYPE_LoadLibrary 255 -#define PK_COMMAND_TYPE_SayHello 256 - -static inline const char *command_name(int32_t type) -{ - switch (type) { - case PK_COMMAND_TYPE_LoadLibrary: return "LoadLibrary"; - case PK_COMMAND_TYPE_SayHello: return "SayHello"; - default: return "unknown"; - } -} From 9bde80bab7e1d19b61cce3646c1cf0bb779a2146 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 6 May 2025 17:54:58 +0200 Subject: [PATCH 025/117] remotingbackend/CMakeLists: add header dependencies --- ggml/src/ggml-remotingbackend/CMakeLists.txt | 3 + .../shared/apir_backend.h | 21 ++ .../ggml-remotingbackend/shared/venus_cs.h | 301 ++++++++++++++++++ 3 files changed, 325 insertions(+) create mode 100644 ggml/src/ggml-remotingbackend/shared/apir_backend.h create mode 100644 ggml/src/ggml-remotingbackend/shared/venus_cs.h diff --git a/ggml/src/ggml-remotingbackend/CMakeLists.txt b/ggml/src/ggml-remotingbackend/CMakeLists.txt index 70b8d3a1b7fef..420e283fc8359 100644 --- a/ggml/src/ggml-remotingbackend/CMakeLists.txt +++ b/ggml/src/ggml-remotingbackend/CMakeLists.txt @@ -5,6 +5,9 @@ message(STATUS "Enable API Remoting backend") ggml_add_backend_library(ggml-remotingbackend backend.cpp + shared/api_remoting.h + shared/apir_backend.h + shared/venus_cs.h ../../include/ggml-remoting-backend.h ) diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h new file mode 100644 index 0000000000000..8506ffa46b759 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -0,0 +1,21 @@ +#pragma once + +#define APIR_LIBRARY_PATH "/Users/kevinpouget/remoting/llama_cpp/build.remoting-backend/bin/libggml-remotingbackend.dylib" +#define 
APIR_INITIALIZE_FCT_NAME "apir_backend_initialize" +#define APIR_DEINIT_FCT_NAME "apir_backend_deinit" +#define APIR_DISPATCH_FCT_NAME "apir_backend_dispatcher" + +#define APIR_BACKEND_INITIALIZE_SUCCESSS 0 +#define APIR_BACKEND_INITIALIZE_CANNOT_OPEN_BACKEND_LIBRARY 1 +#define APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY 2 +#define APIR_BACKEND_INITIALIZE_MISSING_BACKEND_SYMBOLS 3 +#define APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS 4 + +typedef uint32_t (*apir_backend_initialize_t)(void); +typedef void (*apir_backend_deinit_t)(void); + +typedef uint32_t (*apir_backend_dispatch_t)(uint32_t cmd_type, + char *dec_cur, const char *dec_end, + char *enc_cur, const char *enc_end, + char **enc_cur_after + ); diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h new file mode 100644 index 0000000000000..d9397c6d5d647 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -0,0 +1,301 @@ +#pragma once + +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +struct vn_cs_encoder { + char* cur; + const char *start; + const char* end; +}; + +struct vn_cs_decoder { + const char* cur; + const char* end; +}; + +/* + * encode peek + */ + +static inline bool +vn_cs_decoder_peek_internal(const struct vn_cs_decoder *dec, + size_t size, + void *val, + size_t val_size) +{ + assert(val_size <= size); + + if (unlikely(size > (size_t) (dec->end - dec->cur))) { + FATAL("DECODER IS FULL :/"); + //vn_cs_decoder_set_fatal(dec); + memset(val, 0, val_size); + return false; + } + + /* we should not rely on the compiler to optimize away memcpy... */ + memcpy(val, dec->cur, val_size); + return true; +} + +static inline void +vn_cs_decoder_peek(const struct vn_cs_decoder *dec, + size_t size, + void *val, + size_t val_size) +{ + vn_cs_decoder_peek_internal(dec, size, val, val_size); +} + +/* + * read/write + */ + +static inline void +vn_cs_decoder_read(struct vn_cs_decoder *dec, + size_t size, + void *val, + size_t val_size) +{ + if (vn_cs_decoder_peek_internal(dec, size, val, val_size)) + dec->cur += size; +} + +static inline void +vn_cs_encoder_write(struct vn_cs_encoder *enc, + size_t size, + const void *val, + size_t val_size) +{ + assert(val_size <= size); + assert(size <= ((size_t) (enc->end - enc->cur))); + + /* we should not rely on the compiler to optimize away memcpy... 
*/ + memcpy(enc->cur, val, val_size); + enc->cur += size; +} + +/* + * encode/decode + */ + +static inline void +vn_decode(struct vn_cs_decoder *dec, size_t size, void *data, size_t data_size) +{ + assert(size % 4 == 0); + vn_cs_decoder_read(dec, size, data, data_size); +} + +static inline void +vn_encode(struct vn_cs_encoder *enc, size_t size, const void *data, size_t data_size) +{ + assert(size % 4 == 0); + /* TODO check if the generated code is optimal */ + vn_cs_encoder_write(enc, size, data, data_size); +} + +/* + * typed encode/decode + */ + +/* uint64_t */ + +static inline size_t +vn_sizeof_uint64_t(const uint64_t *val) +{ + assert(sizeof(*val) == 8); + return 8; +} + +static inline void +vn_encode_uint64_t(struct vn_cs_encoder *enc, const uint64_t *val) +{ + vn_encode(enc, 8, val, sizeof(*val)); +} + +static inline void +vn_decode_uint64_t(struct vn_cs_decoder *dec, uint64_t *val) +{ + vn_decode(dec, 8, val, sizeof(*val)); +} + +static inline size_t +vn_sizeof_uint64_t_array(const uint64_t *val, uint32_t count) +{ + assert(sizeof(*val) == 8); + const size_t size = sizeof(*val) * count; + assert(size >= count); + return size; +} + +static inline void +vn_encode_uint64_t_array(struct vn_cs_encoder *enc, const uint64_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_encode(enc, size, val, size); +} + +static inline void +vn_decode_uint64_t_array(struct vn_cs_decoder *dec, uint64_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_decode(dec, size, val, size); +} + +/* int32_t */ + +static inline size_t +vn_sizeof_int32_t(const int32_t *val) +{ + assert(sizeof(*val) == 4); + return 4; +} + +static inline void +vn_encode_int32_t(struct vn_cs_encoder *enc, const int32_t *val) +{ + vn_encode(enc, 4, val, sizeof(*val)); +} + +static inline void +vn_decode_int32_t(struct vn_cs_decoder *dec, int32_t *val) +{ + vn_decode(dec, 4, val, sizeof(*val)); +} + +static inline size_t +vn_sizeof_int32_t_array(const int32_t *val, uint32_t count) +{ + assert(sizeof(*val) == 4); + const size_t size = sizeof(*val) * count; + assert(size >= count); + return size; +} + +static inline void +vn_encode_int32_t_array(struct vn_cs_encoder *enc, const int32_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_encode(enc, size, val, size); +} + +static inline void +vn_decode_int32_t_array(struct vn_cs_decoder *dec, int32_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_decode(dec, size, val, size); +} + +/* array size (uint64_t) */ + +static inline size_t +vn_sizeof_array_size(uint64_t size) +{ + return vn_sizeof_uint64_t(&size); +} + +static inline void +vn_encode_array_size(struct vn_cs_encoder *enc, uint64_t size) +{ + vn_encode_uint64_t(enc, &size); +} + +static inline uint64_t +vn_decode_array_size(struct vn_cs_decoder *dec, uint64_t expected_size) +{ + uint64_t size; + vn_decode_uint64_t(dec, &size); + if (size != expected_size) { + FATAL("ENCODER IS FULL :/"); + //vn_cs_decoder_set_fatal(dec); + size = 0; + } + return size; +} + +static inline uint64_t +vn_decode_array_size_unchecked(struct vn_cs_decoder *dec) +{ + uint64_t size; + vn_decode_uint64_t(dec, &size); + return size; +} + +static inline uint64_t +vn_peek_array_size(struct vn_cs_decoder *dec) +{ + uint64_t size; + vn_cs_decoder_peek(dec, sizeof(size), &size, sizeof(size)); + return size; +} + +/* non-array pointer */ + +static inline size_t 
+vn_sizeof_simple_pointer(const void *val) +{ + return vn_sizeof_array_size(val ? 1 : 0); +} + +static inline bool +vn_encode_simple_pointer(struct vn_cs_encoder *enc, const void *val) +{ + vn_encode_array_size(enc, val ? 1 : 0); + return val; +} + +static inline bool +vn_decode_simple_pointer(struct vn_cs_decoder *dec) +{ + return vn_decode_array_size_unchecked(dec); +} + +/* uint32_t */ + +static inline size_t +vn_sizeof_uint32_t(const uint32_t *val) +{ + assert(sizeof(*val) == 4); + return 4; +} + +static inline void +vn_encode_uint32_t(struct vn_cs_encoder *enc, const uint32_t *val) +{ + vn_encode(enc, 4, val, sizeof(*val)); +} + +static inline void +vn_decode_uint32_t(struct vn_cs_decoder *dec, uint32_t *val) +{ + vn_decode(dec, 4, val, sizeof(*val)); +} + +static inline size_t +vn_sizeof_uint32_t_array(const uint32_t *val, uint32_t count) +{ + assert(sizeof(*val) == 4); + const size_t size = sizeof(*val) * count; + assert(size >= count); + return size; +} + +static inline void +vn_encode_uint32_t_array(struct vn_cs_encoder *enc, const uint32_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_encode(enc, size, val, size); +} + +static inline void +vn_decode_uint32_t_array(struct vn_cs_decoder *dec, uint32_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_decode(dec, size, val, size); +} From b5ac3985205af7fa1dbc6648c3e1826a01019502 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 6 May 2025 17:55:35 +0200 Subject: [PATCH 026/117] ggml-remotingbackend: add skeleton of argument passing --- .../ggml-remotingbackend/backend-internal.h | 30 +++++-- ggml/src/ggml-remotingbackend/backend.cpp | 81 +++++++++++++++---- 2 files changed, 90 insertions(+), 21 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-internal.h b/ggml/src/ggml-remotingbackend/backend-internal.h index 97e9605b0dadb..e6c098ed95175 100644 --- a/ggml/src/ggml-remotingbackend/backend-internal.h +++ b/ggml/src/ggml-remotingbackend/backend-internal.h @@ -1,13 +1,24 @@ #include #include -static inline void LOG(const char* fmt, ...) { - va_list args; - va_start(args, fmt); - vprintf(fmt, args); - va_end(args); +static inline void INFO(const char* fmt, ...) { + printf("INFO: "); + va_list args; + va_start(args, fmt); + vprintf(fmt, args); + va_end(args); + + printf("\n"); +} - printf("\n"); +static inline void ERROR(const char* fmt, ...) { + printf("ERROR: "); + va_list args; + va_start(args, fmt); + vprintf(fmt, args); + va_end(args); + + printf("\n"); } static inline void FATAL(const char* fmt, ...) { @@ -26,5 +37,10 @@ static inline void FATAL(const char* fmt, ...) 
{ } extern "C" { - void ggml_backend_remoting_backend_say_hello(); + uint32_t apir_backend_initialize(); + void apir_backend_deinit(void); + uint32_t apir_backend_dispatcher(uint32_t cmd_type, + char *dec_cur, const char *dec_end, + char *enc_cur, const char *enc_end, + char **enc_cur_after); } diff --git a/ggml/src/ggml-remotingbackend/backend.cpp b/ggml/src/ggml-remotingbackend/backend.cpp index ccc3b3a3aa136..d858b033e3c9d 100644 --- a/ggml/src/ggml-remotingbackend/backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend.cpp @@ -8,6 +8,8 @@ #include "ggml-backend.h" #include "backend-internal.h" +#include "shared/apir_backend.h" +#include "shared/venus_cs.h" #define UNUSED GGML_UNUSED @@ -42,37 +44,88 @@ ggml_backend_reg_t ggml_backend_remoting_backend_reg() { /* .context = */ nullptr, }; - LOG("%s, hello :wave:", __func__); + INFO("%s, hello :wave:", __func__); return ® } typedef ggml_backend_reg_t (*backend_reg_fct_t)(void); -#define METAL_LIBRARY_PATH "/Users/kevinpouget/remoting/llama_cpp/build.remoting-backend/bin/libggml-metal.dylib" -#define ENTRYPOINT_FCT_NAME "ggml_backend_metal_reg" +#define GGML_BACKEND_METAL_LIBRARY_PATH "/Users/kevinpouget/remoting/llama_cpp/build.remoting-backend/bin/libggml-metal.dylib" +#define GGML_BACKEND_METAL_REG_FCT_NAME "ggml_backend_metal_reg" + +static void *backend_library_handle = NULL; extern "C" { - void ggml_backend_remoting_backend_say_hello() { - LOG("%s: hello :wave: \\o/", __func__); + void apir_backend_deinit(void) { + if (backend_library_handle) { + INFO("%s: The GGML backend library was loaded. Unloading it.", __func__); + dlclose(backend_library_handle); + } + + INFO("%s: bye-bye", __func__); + } - void * library_handle = dlopen(METAL_LIBRARY_PATH, RTLD_LAZY); + uint32_t apir_backend_initialize() { + INFO("%s: hello :wave: \\o/", __func__); - if (!library_handle) { - FATAL("Cannot open library: %s\n", dlerror()); - return; + backend_library_handle = dlopen(GGML_BACKEND_METAL_LIBRARY_PATH, RTLD_LAZY); + + if (!backend_library_handle) { + ERROR("Cannot open library: %s\n", dlerror()); + + return APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY; } - backend_reg_fct_t entrypoint_fct = (backend_reg_fct_t) dlsym(library_handle, ENTRYPOINT_FCT_NAME); + backend_reg_fct_t entrypoint_fct = (backend_reg_fct_t) dlsym(backend_library_handle, GGML_BACKEND_METAL_REG_FCT_NAME); const char* dlsym_error = dlerror(); if (dlsym_error) { - FATAL("Cannot load symbol: %s\n", dlsym_error); - return; + ERROR("Cannot load symbol: %s\n", dlsym_error); + + return APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS; } ggml_backend_reg_t reg = entrypoint_fct(); - LOG("%s: --> %s", __func__, reg->iface.get_name(reg)); + INFO("%s: --> %s", __func__, reg->iface.get_name(reg)); - dlclose(library_handle); + return APIR_BACKEND_INITIALIZE_SUCCESSS; + } + + uint32_t apir_backend_dispatcher(uint32_t cmd_type, + char *dec_cur, const char *dec_end, + char *enc_cur, const char *enc_end, + char **enc_cur_after) { + INFO("%s: --> %d | %p | %p ", __func__, cmd_type, dec_cur, enc_cur); + + struct vn_cs_encoder _enc = { + .cur = enc_cur, + .end = enc_end, + }; + struct vn_cs_encoder *enc = &_enc; + + struct vn_cs_decoder _dec = { + .cur = dec_cur, + .end = dec_end, + }; + struct vn_cs_decoder *dec = &_dec; + + int32_t arg1, arg2, arg3; + vn_decode_int32_t(dec, &arg1); + vn_decode_int32_t(dec, &arg2); + vn_decode_int32_t(dec, &arg3); + + INFO("%s: ARGS %d %d %d\n", __func__, arg1, arg2, arg3); + + int32_t resp1 = 1; + int32_t resp2 = 2; + int32_t resp3 = 3; + int32_t resp4 = 4; + 
vn_encode_int32_t(enc, &resp1); + vn_encode_int32_t(enc, &resp2); + vn_encode_int32_t(enc, &resp3); + vn_encode_int32_t(enc, &resp4); + *enc_cur_after = enc->cur; + + return 0; } } From 0cdcdd269906b17b1c881a79813e05897485935a Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 6 May 2025 17:56:02 +0200 Subject: [PATCH 027/117] remotingfrontend: improve the typing --- .../src/ggml-remotingfrontend/virtgpu-types.h | 298 ------------------ ggml/src/ggml-remotingfrontend/virtgpu.cpp | 85 +++-- ggml/src/ggml-remotingfrontend/virtgpu.h | 80 +---- .../src/ggml-remotingfrontend/virtgpu_venus.c | 209 ------------ 4 files changed, 65 insertions(+), 607 deletions(-) delete mode 100644 ggml/src/ggml-remotingfrontend/virtgpu-types.h delete mode 100644 ggml/src/ggml-remotingfrontend/virtgpu_venus.c diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-types.h b/ggml/src/ggml-remotingfrontend/virtgpu-types.h deleted file mode 100644 index b0802ad634bcb..0000000000000 --- a/ggml/src/ggml-remotingfrontend/virtgpu-types.h +++ /dev/null @@ -1,298 +0,0 @@ -#pragma once -#include "virtgpu.h" - -struct vn_cs_encoder { - char* cur; - const char* end; -}; - -struct vn_cs_decoder { - const char* cur; - const char* end; -}; - -/* - * encode peek - */ - -static inline bool -vn_cs_decoder_peek_internal(const struct vn_cs_decoder *dec, - size_t size, - void *val, - size_t val_size) -{ - assert(val_size <= size); - - if (unlikely(size > (size_t) (dec->end - dec->cur))) { - FATAL("DECODER IS FULL :/"); - //vn_cs_decoder_set_fatal(dec); - memset(val, 0, val_size); - return false; - } - - /* we should not rely on the compiler to optimize away memcpy... */ - memcpy(val, dec->cur, val_size); - return true; -} - -static inline void -vn_cs_decoder_peek(const struct vn_cs_decoder *dec, - size_t size, - void *val, - size_t val_size) -{ - vn_cs_decoder_peek_internal(dec, size, val, val_size); -} - -/* - * read/write - */ - -static inline void -vn_cs_decoder_read(struct vn_cs_decoder *dec, - size_t size, - void *val, - size_t val_size) -{ - if (vn_cs_decoder_peek_internal(dec, size, val, val_size)) - dec->cur += size; -} - -static inline void -vn_cs_encoder_write(struct vn_cs_encoder *enc, - size_t size, - const void *val, - size_t val_size) -{ - assert(val_size <= size); - assert(size <= ((size_t) (enc->end - enc->cur))); - - /* we should not rely on the compiler to optimize away memcpy... 
*/ - memcpy(enc->cur, val, val_size); - enc->cur += size; -} - -/* - * encode/decode - */ - -static inline void -vn_decode(struct vn_cs_decoder *dec, size_t size, void *data, size_t data_size) -{ - assert(size % 4 == 0); - vn_cs_decoder_read(dec, size, data, data_size); -} - -static inline void -vn_encode(struct vn_cs_encoder *enc, size_t size, const void *data, size_t data_size) -{ - assert(size % 4 == 0); - /* TODO check if the generated code is optimal */ - vn_cs_encoder_write(enc, size, data, data_size); -} - -/* - * typed encode/decode - */ - -/* uint64_t */ - -static inline size_t -vn_sizeof_uint64_t(const uint64_t *val) -{ - assert(sizeof(*val) == 8); - return 8; -} - -static inline void -vn_encode_uint64_t(struct vn_cs_encoder *enc, const uint64_t *val) -{ - vn_encode(enc, 8, val, sizeof(*val)); -} - -static inline void -vn_decode_uint64_t(struct vn_cs_decoder *dec, uint64_t *val) -{ - vn_decode(dec, 8, val, sizeof(*val)); -} - -static inline size_t -vn_sizeof_uint64_t_array(const uint64_t *val, uint32_t count) -{ - assert(sizeof(*val) == 8); - const size_t size = sizeof(*val) * count; - assert(size >= count); - return size; -} - -static inline void -vn_encode_uint64_t_array(struct vn_cs_encoder *enc, const uint64_t *val, uint32_t count) -{ - const size_t size = sizeof(*val) * count; - assert(size >= count); - vn_encode(enc, size, val, size); -} - -static inline void -vn_decode_uint64_t_array(struct vn_cs_decoder *dec, uint64_t *val, uint32_t count) -{ - const size_t size = sizeof(*val) * count; - assert(size >= count); - vn_decode(dec, size, val, size); -} - -/* int32_t */ - -static inline size_t -vn_sizeof_int32_t(const int32_t *val) -{ - assert(sizeof(*val) == 4); - return 4; -} - -static inline void -vn_encode_int32_t(struct vn_cs_encoder *enc, const int32_t *val) -{ - vn_encode(enc, 4, val, sizeof(*val)); -} - -static inline void -vn_decode_int32_t(struct vn_cs_decoder *dec, int32_t *val) -{ - vn_decode(dec, 4, val, sizeof(*val)); -} - -static inline size_t -vn_sizeof_int32_t_array(const int32_t *val, uint32_t count) -{ - assert(sizeof(*val) == 4); - const size_t size = sizeof(*val) * count; - assert(size >= count); - return size; -} - -static inline void -vn_encode_int32_t_array(struct vn_cs_encoder *enc, const int32_t *val, uint32_t count) -{ - const size_t size = sizeof(*val) * count; - assert(size >= count); - vn_encode(enc, size, val, size); -} - -static inline void -vn_decode_int32_t_array(struct vn_cs_decoder *dec, int32_t *val, uint32_t count) -{ - const size_t size = sizeof(*val) * count; - assert(size >= count); - vn_decode(dec, size, val, size); -} - -/* array size (uint64_t) */ - -static inline size_t -vn_sizeof_array_size(uint64_t size) -{ - return vn_sizeof_uint64_t(&size); -} - -static inline void -vn_encode_array_size(struct vn_cs_encoder *enc, uint64_t size) -{ - vn_encode_uint64_t(enc, &size); -} - -static inline uint64_t -vn_decode_array_size(struct vn_cs_decoder *dec, uint64_t expected_size) -{ - uint64_t size; - vn_decode_uint64_t(dec, &size); - if (size != expected_size) { - FATAL("ENCODER IS FULL :/"); - //vn_cs_decoder_set_fatal(dec); - size = 0; - } - return size; -} - -static inline uint64_t -vn_decode_array_size_unchecked(struct vn_cs_decoder *dec) -{ - uint64_t size; - vn_decode_uint64_t(dec, &size); - return size; -} - -static inline uint64_t -vn_peek_array_size(struct vn_cs_decoder *dec) -{ - uint64_t size; - vn_cs_decoder_peek(dec, sizeof(size), &size, sizeof(size)); - return size; -} - -/* non-array pointer */ - -static inline size_t 
-vn_sizeof_simple_pointer(const void *val) -{ - return vn_sizeof_array_size(val ? 1 : 0); -} - -static inline bool -vn_encode_simple_pointer(struct vn_cs_encoder *enc, const void *val) -{ - vn_encode_array_size(enc, val ? 1 : 0); - return val; -} - -static inline bool -vn_decode_simple_pointer(struct vn_cs_decoder *dec) -{ - return vn_decode_array_size_unchecked(dec); -} - -/* uint32_t */ - -static inline size_t -vn_sizeof_uint32_t(const uint32_t *val) -{ - assert(sizeof(*val) == 4); - return 4; -} - -static inline void -vn_encode_uint32_t(struct vn_cs_encoder *enc, const uint32_t *val) -{ - vn_encode(enc, 4, val, sizeof(*val)); -} - -static inline void -vn_decode_uint32_t(struct vn_cs_decoder *dec, uint32_t *val) -{ - vn_decode(dec, 4, val, sizeof(*val)); -} - -static inline size_t -vn_sizeof_uint32_t_array(const uint32_t *val, uint32_t count) -{ - assert(sizeof(*val) == 4); - const size_t size = sizeof(*val) * count; - assert(size >= count); - return size; -} - -static inline void -vn_encode_uint32_t_array(struct vn_cs_encoder *enc, const uint32_t *val, uint32_t count) -{ - const size_t size = sizeof(*val) * count; - assert(size >= count); - vn_encode(enc, size, val, size); -} - -static inline void -vn_decode_uint32_t_array(struct vn_cs_decoder *dec, uint32_t *val, uint32_t count) -{ - const size_t size = sizeof(*val) * count; - assert(size >= count); - vn_decode(dec, size, val, size); -} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp index bc20c90cb36c2..a88d07c8198fd 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -6,7 +6,6 @@ #include #include "virtgpu.h" -#include "virtgpu-types.h" static inline void virtgpu_init_shmem_blob_mem(struct virtgpu *gpu) @@ -37,7 +36,7 @@ virtgpu_init_shmem_blob_mem(struct virtgpu *gpu) void *something = NULL; void thks_bye () { // break here - INFO("thks bye, stopping early."); + INFO("thks bye, stopping early and happilly :)"); if (!something) { // avoid the [[noreturn]] detection mechanism exit(0); } @@ -50,17 +49,17 @@ create_virtgpu() { util_sparse_array_init(&gpu->shmem_array, sizeof(struct virtgpu_shmem), 1024); - VkResult result = virtgpu_open(gpu); - assert(result == VK_SUCCESS); + virt_gpu_result_t result = virtgpu_open(gpu); + assert(result == APIR_SUCCESS); result = virtgpu_init_params(gpu); - assert(result == VK_SUCCESS); + assert(result == APIR_SUCCESS); result = virtgpu_init_capset(gpu); - assert(result == VK_SUCCESS); + assert(result == APIR_SUCCESS); result = virtgpu_init_context(gpu); - assert(result == VK_SUCCESS); + assert(result == APIR_SUCCESS); virtgpu_init_shmem_blob_mem(gpu); @@ -71,26 +70,33 @@ create_virtgpu() { assert(false); } - remote_call(gpu, VIRGL_VK_COMMAND_TYPE_LoadLibrary, 0); - remote_call(gpu, VIRGL_VK_COMMAND_TYPE_Forward, 12346); - + uint32_t ret = remote_call(gpu, VIRGL_VK_COMMAND_TYPE_LoadLibrary, 0, 0, 0, 0); + if (ret != 0) { + FATAL("%s: failed to load the APIR backend libraries (code=%d):/", __func__, ret); + assert(false); + } + ret = remote_call(gpu, VIRGL_VK_COMMAND_TYPE_Forward, 0, 111, 555, 999); + if (ret != 0) { + FATAL("%s: failed to forard the API call (code=%d):/", __func__, ret); + assert(false); + } thks_bye(); } -static VkResult +static virt_gpu_result_t virtgpu_open(struct virtgpu *gpu) { drmDevicePtr devs[8]; int count = drmGetDevices2(0, devs, ARRAY_SIZE(devs)); if (count < 0) { INFO("failed to enumerate DRM devices"); - return VK_ERROR_INITIALIZATION_FAILED; + return 
APIR_ERROR_INITIALIZATION_FAILED; } - VkResult result = VK_ERROR_INITIALIZATION_FAILED; + virt_gpu_result_t result = APIR_ERROR_INITIALIZATION_FAILED; for (int i = 0; i < count; i++) { result = virtgpu_open_device(gpu, devs[i]); - if (result == VK_SUCCESS) + if (result == APIR_SUCCESS) break; } @@ -99,7 +105,7 @@ virtgpu_open(struct virtgpu *gpu) return result; } -static VkResult +static virt_gpu_result_t virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev) { bool supported_bus = false; @@ -128,7 +134,7 @@ virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev) } vn_log(gpu->instance, "skipping DRM device %s", name); } - return VK_ERROR_INITIALIZATION_FAILED; + return APIR_ERROR_INITIALIZATION_FAILED; } const char *primary_path = dev->nodes[DRM_NODE_PRIMARY]; @@ -138,7 +144,7 @@ virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev) if (fd < 0) { if (VN_DEBUG(INIT)) vn_log(gpu->instance, "failed to open %s", node_path); - return VK_ERROR_INITIALIZATION_FAILED; + return APIR_ERROR_INITIALIZATION_FAILED; } drmVersionPtr version = drmGetVersion(fd); @@ -155,7 +161,7 @@ virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev) if (version) drmFreeVersion(version); close(fd); - return VK_ERROR_INITIALIZATION_FAILED; + return APIR_ERROR_INITIALIZATION_FAILED; } gpu->fd = fd; @@ -183,7 +189,7 @@ virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev) if (VN_DEBUG(INIT)) vn_log(gpu->instance, "using DRM device %s", node_path); - return VK_SUCCESS; + return APIR_SUCCESS; } void @@ -202,9 +208,7 @@ vn_log(struct remoting_dev_instance *instance, const char *format, ...) /* instance may be NULL or partially initialized */ } - - -static VkResult +static virt_gpu_result_t virtgpu_init_context(struct virtgpu *gpu) { assert(!gpu->capset.version); @@ -214,13 +218,13 @@ virtgpu_init_context(struct virtgpu *gpu) vn_log(gpu->instance, "failed to initialize context: %s", strerror(errno)); } - return VK_ERROR_INITIALIZATION_FAILED; + return APIR_ERROR_INITIALIZATION_FAILED; } - return VK_SUCCESS; + return APIR_SUCCESS; } -static VkResult +static virt_gpu_result_t virtgpu_init_capset(struct virtgpu *gpu) { gpu->capset.id = VIRGL_RENDERER_CAPSET_VENUS; @@ -234,13 +238,13 @@ virtgpu_init_capset(struct virtgpu *gpu) vn_log(gpu->instance, "failed to get venus v%d capset: %s", gpu->capset.version, strerror(errno)); } - return VK_ERROR_INITIALIZATION_FAILED; + return APIR_ERROR_INITIALIZATION_FAILED; } - return VK_SUCCESS; + return APIR_SUCCESS; } -static VkResult +static virt_gpu_result_t virtgpu_init_params(struct virtgpu *gpu) { const uint64_t required_params[] = { @@ -255,7 +259,7 @@ virtgpu_init_params(struct virtgpu *gpu) vn_log(gpu->instance, "required kernel param %d is missing", (int)required_params[i]); } - return VK_ERROR_INITIALIZATION_FAILED; + return APIR_ERROR_INITIALIZATION_FAILED; } } @@ -273,7 +277,7 @@ virtgpu_init_params(struct virtgpu *gpu) vn_log(gpu->instance, "one of required kernel params (%d or %d) is missing", (int)VIRTGPU_PARAM_HOST_VISIBLE, (int)VIRTGPU_PARAM_GUEST_VRAM); - return VK_ERROR_INITIALIZATION_FAILED; + return APIR_ERROR_INITIALIZATION_FAILED; } /* Cross-device feature is optional. 
It enables sharing dma-bufs @@ -287,7 +291,7 @@ virtgpu_init_params(struct virtgpu *gpu) /* implied by CONTEXT_INIT uapi */ gpu->max_timeline_count = 64; - return VK_SUCCESS; + return APIR_SUCCESS; } static int @@ -354,7 +358,8 @@ virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param) static int remote_call( struct virtgpu *gpu, int32_t cmd_type, - int32_t cmd_flags + int32_t cmd_flags, + int32_t arg1, int32_t arg2, int32_t arg3 ) { @@ -374,7 +379,6 @@ static int remote_call( * Fill the command encoder buffer */ - /* VkCommandTypeEXT is int32_t */ vn_encode_int32_t(encoder, &cmd_type); vn_encode_int32_t(encoder, &cmd_flags); @@ -389,6 +393,10 @@ static int remote_call( api_remoting_command_name(cmd_type), cmd_flags, reply_res_id); + vn_encode_int32_t(encoder, &arg1); + vn_encode_int32_t(encoder, &arg2); + vn_encode_int32_t(encoder, &arg3); + /* * Reply notification pointer */ @@ -442,9 +450,20 @@ static int remote_call( }; struct vn_cs_decoder *dec = &_dec; + int32_t resp1; + int32_t resp2; + int32_t resp3; + int32_t resp4; + vn_decode_int32_t(dec, &resp1); + vn_decode_int32_t(dec, &resp2); + vn_decode_int32_t(dec, &resp3); + vn_decode_int32_t(dec, &resp4); + int32_t rmt_call_ret; vn_decode_int32_t(dec, &rmt_call_ret); + printf("%s: RESP %d %d %d %d\n", __func__, resp1, resp2, resp3, resp4); + printf("%s: call %s() --> %d\n", __func__, api_remoting_command_name(cmd_type), rmt_call_ret); diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.h b/ggml/src/ggml-remotingfrontend/virtgpu.h index bfd0dc9c82b15..379a2174fc3db 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu.h @@ -9,12 +9,13 @@ #include #include -#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/api_remoting.h" +#include "virtgpu-utils.h" +#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/api_remoting.h" +#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/venus_cs.h" void thks_bye(); #include "virtgpu-shm.h" -#include "virtgpu-utils.h" #define VIRGL_RENDERER_UNSTABLE_APIS 1 #include "drm-uapi/virtgpu_drm.h" @@ -31,65 +32,10 @@ void thks_bye(); #define VN_DEBUG(what) true -typedef enum VkResult { - VK_SUCCESS = 0, - VK_NOT_READY = 1, - VK_TIMEOUT = 2, - VK_EVENT_SET = 3, - VK_EVENT_RESET = 4, - VK_INCOMPLETE = 5, - VK_ERROR_OUT_OF_HOST_MEMORY = -1, - VK_ERROR_OUT_OF_DEVICE_MEMORY = -2, - VK_ERROR_INITIALIZATION_FAILED = -3, - VK_ERROR_DEVICE_LOST = -4, - VK_ERROR_MEMORY_MAP_FAILED = -5, - VK_ERROR_LAYER_NOT_PRESENT = -6, - VK_ERROR_EXTENSION_NOT_PRESENT = -7, - VK_ERROR_FEATURE_NOT_PRESENT = -8, - VK_ERROR_INCOMPATIBLE_DRIVER = -9, - VK_ERROR_TOO_MANY_OBJECTS = -10, - VK_ERROR_FORMAT_NOT_SUPPORTED = -11, - VK_ERROR_FRAGMENTED_POOL = -12, - VK_ERROR_UNKNOWN = -13, - VK_ERROR_OUT_OF_POOL_MEMORY = -1000069000, - VK_ERROR_INVALID_EXTERNAL_HANDLE = -1000072003, - VK_ERROR_FRAGMENTATION = -1000161000, - VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS = -1000257000, - VK_PIPELINE_COMPILE_REQUIRED = 1000297000, - VK_ERROR_SURFACE_LOST_KHR = -1000000000, - VK_ERROR_NATIVE_WINDOW_IN_USE_KHR = -1000000001, - VK_SUBOPTIMAL_KHR = 1000001003, - VK_ERROR_OUT_OF_DATE_KHR = -1000001004, - VK_ERROR_INCOMPATIBLE_DISPLAY_KHR = -1000003001, - VK_ERROR_VALIDATION_FAILED_EXT = -1000011001, - VK_ERROR_INVALID_SHADER_NV = -1000012000, - VK_ERROR_IMAGE_USAGE_NOT_SUPPORTED_KHR = -1000023000, - VK_ERROR_VIDEO_PICTURE_LAYOUT_NOT_SUPPORTED_KHR = -1000023001, - VK_ERROR_VIDEO_PROFILE_OPERATION_NOT_SUPPORTED_KHR 
= -1000023002, - VK_ERROR_VIDEO_PROFILE_FORMAT_NOT_SUPPORTED_KHR = -1000023003, - VK_ERROR_VIDEO_PROFILE_CODEC_NOT_SUPPORTED_KHR = -1000023004, - VK_ERROR_VIDEO_STD_VERSION_NOT_SUPPORTED_KHR = -1000023005, - VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT = -1000158000, - VK_ERROR_NOT_PERMITTED_KHR = -1000174001, - VK_ERROR_FULL_SCREEN_EXCLUSIVE_MODE_LOST_EXT = -1000255000, - VK_THREAD_IDLE_KHR = 1000268000, - VK_THREAD_DONE_KHR = 1000268001, - VK_OPERATION_DEFERRED_KHR = 1000268002, - VK_OPERATION_NOT_DEFERRED_KHR = 1000268003, - VK_ERROR_INVALID_VIDEO_STD_PARAMETERS_KHR = -1000299000, - VK_ERROR_COMPRESSION_EXHAUSTED_EXT = -1000338000, - VK_INCOMPATIBLE_SHADER_BINARY_EXT = 1000482000, - VK_ERROR_OUT_OF_POOL_MEMORY_KHR = VK_ERROR_OUT_OF_POOL_MEMORY, - VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR = VK_ERROR_INVALID_EXTERNAL_HANDLE, - VK_ERROR_FRAGMENTATION_EXT = VK_ERROR_FRAGMENTATION, - VK_ERROR_NOT_PERMITTED_EXT = VK_ERROR_NOT_PERMITTED_KHR, - VK_ERROR_INVALID_DEVICE_ADDRESS_EXT = VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS, - VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS_KHR = VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS, - VK_PIPELINE_COMPILE_REQUIRED_EXT = VK_PIPELINE_COMPILE_REQUIRED, - VK_ERROR_PIPELINE_COMPILE_REQUIRED_EXT = VK_PIPELINE_COMPILE_REQUIRED, - VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT = VK_INCOMPATIBLE_SHADER_BINARY_EXT, - VK_RESULT_MAX_ENUM = 0x7FFFFFFF -} VkResult; +typedef enum virt_gpu_result_t { + APIR_SUCCESS = 0, + APIR_ERROR_INITIALIZATION_FAILED = -1, +} virt_gpu_result_t; struct remoting_dev_instance { @@ -153,13 +99,13 @@ virtgpu_ioctl(struct virtgpu *gpu, unsigned long request, void *args) } void create_virtgpu(); -static VkResult virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev); -static VkResult virtgpu_open(struct virtgpu *gpu); +static virt_gpu_result_t virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev); +static virt_gpu_result_t virtgpu_open(struct virtgpu *gpu); -static VkResult virtgpu_init_params(struct virtgpu *gpu); -static VkResult virtgpu_init_capset(struct virtgpu *gpu); -static VkResult virtgpu_init_context(struct virtgpu *gpu); +static virt_gpu_result_t virtgpu_init_params(struct virtgpu *gpu); +static virt_gpu_result_t virtgpu_init_capset(struct virtgpu *gpu); +static virt_gpu_result_t virtgpu_init_context(struct virtgpu *gpu); static int virtgpu_ioctl_context_init(struct virtgpu *gpu, enum virgl_renderer_capset capset_id); @@ -171,4 +117,4 @@ virtgpu_ioctl_get_caps(struct virtgpu *gpu, size_t capset_size); static uint64_t virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param); static void virtgpu_init_renderer_info(struct virtgpu *gpu); -static int remote_call(struct virtgpu *gpu, int32_t cmd_type, int32_t cmd_flags); +static int remote_call(struct virtgpu *gpu, int32_t cmd_type, int32_t cmd_flags, int32_t arg1, int32_t arg2, int32_t arg3); diff --git a/ggml/src/ggml-remotingfrontend/virtgpu_venus.c b/ggml/src/ggml-remotingfrontend/virtgpu_venus.c deleted file mode 100644 index fc401c13d3003..0000000000000 --- a/ggml/src/ggml-remotingfrontend/virtgpu_venus.c +++ /dev/null @@ -1,209 +0,0 @@ -static inline void vn_encode_vkEnumeratePhysicalDevices(struct vn_cs_encoder *enc, VkCommandFlagsEXT cmd_flags, VkInstance instance, uint32_t* pPhysicalDeviceCount, VkPhysicalDevice* pPhysicalDevices) -{ - const VkCommandTypeEXT cmd_type = VK_COMMAND_TYPE_vkEnumeratePhysicalDevices_EXT; - - vn_encode_VkCommandTypeEXT(enc, &cmd_type); - vn_encode_VkFlags(enc, &cmd_flags); - - vn_encode_VkInstance(enc, &instance); - if 
(vn_encode_simple_pointer(enc, pPhysicalDeviceCount)) - vn_encode_uint32_t(enc, pPhysicalDeviceCount); - if (pPhysicalDevices) { - vn_encode_array_size(enc, (pPhysicalDeviceCount ? *pPhysicalDeviceCount : 0)); - for (uint32_t i = 0; i < (pPhysicalDeviceCount ? *pPhysicalDeviceCount : 0); i++) - vn_encode_VkPhysicalDevice(enc, &pPhysicalDevices[i]); - } else { - vn_encode_array_size(enc, 0); - } -} - -static inline struct vn_cs_encoder * -vn_ring_submit_command_init(struct vn_ring *ring, - struct vn_ring_submit_command *submit, - void *cmd_data, - size_t cmd_size, - size_t reply_size) -{ - submit->buffer = VN_CS_ENCODER_BUFFER_INITIALIZER(cmd_data); - submit->command = VN_CS_ENCODER_INITIALIZER(&submit->buffer, cmd_size); - - submit->reply_size = reply_size; - submit->reply_shmem = NULL; - - submit->ring_seqno_valid = false; - - return &submit->command; -} - -static inline void vn_submit_vkEnumeratePhysicalDevices(struct vn_ring *vn_ring, VkCommandFlagsEXT cmd_flags, VkInstance instance, uint32_t* pPhysicalDeviceCount, VkPhysicalDevice* pPhysicalDevices, struct vn_ring_submit_command *submit) -{ - uint8_t local_cmd_data[VN_SUBMIT_LOCAL_CMD_SIZE]; - void *cmd_data = local_cmd_data; - size_t cmd_size = vn_sizeof_vkEnumeratePhysicalDevices(instance, pPhysicalDeviceCount, pPhysicalDevices); - if (cmd_size > sizeof(local_cmd_data)) { - cmd_data = malloc(cmd_size); - if (!cmd_data) - cmd_size = 0; - } - const size_t reply_size = cmd_flags & VK_COMMAND_GENERATE_REPLY_BIT_EXT ? vn_sizeof_vkEnumeratePhysicalDevices_reply(instance, pPhysicalDeviceCount, pPhysicalDevices) : 0; - - struct vn_cs_encoder *enc = vn_ring_submit_command_init(vn_ring, submit, cmd_data, cmd_size, reply_size); - if (cmd_size) { - vn_encode_vkEnumeratePhysicalDevices(enc, cmd_flags, instance, pPhysicalDeviceCount, pPhysicalDevices); - vn_ring_submit_command(vn_ring, submit); - if (cmd_data != local_cmd_data) - free(cmd_data); - } -} - -VkResult vn_call_vkEnumeratePhysicalDevices(struct vn_ring *vn_ring, VkInstance instance, uint32_t* pPhysicalDeviceCount, VkPhysicalDevice* pPhysicalDevices) -{ - VN_TRACE_FUNC(); - - struct vn_ring_submit_command submit; - vn_submit_vkEnumeratePhysicalDevices(vn_ring, VK_COMMAND_GENERATE_REPLY_BIT_EXT, instance, pPhysicalDeviceCount, pPhysicalDevices, &submit); - struct vn_cs_decoder *dec = vn_ring_get_command_reply(vn_ring, &submit); - if (dec) { - const VkResult ret = vn_decode_vkEnumeratePhysicalDevices_reply(dec, instance, pPhysicalDeviceCount, pPhysicalDevices); - vn_ring_free_command_reply(vn_ring, &submit); - return ret; - } else { - return VK_ERROR_OUT_OF_HOST_MEMORY; - } -} - -VkResult -vn_ring_submit_command_simple(struct vn_ring *ring, - const struct vn_cs_encoder *cs) -{ - mtx_lock(&ring->mutex); - VkResult result = vn_ring_submit_locked(ring, cs, NULL, NULL); - mtx_unlock(&ring->mutex); - - return result; -} - -static VkResult -vn_ring_submit_locked(struct vn_ring *ring, - const struct vn_cs_encoder *cs, - struct vn_renderer_shmem *extra_shmem, - uint32_t *ring_seqno) -{ - const bool direct = vn_ring_submission_can_direct(ring, cs); - if (!direct && cs->storage_type == VN_CS_ENCODER_STORAGE_POINTER) { - cs = vn_ring_cs_upload_locked(ring, cs); - if (!cs) - return VK_ERROR_OUT_OF_HOST_MEMORY; - assert(cs->storage_type != VN_CS_ENCODER_STORAGE_POINTER); - } - - struct vn_ring_submission submit; - VkResult result = - vn_ring_submission_prepare(ring, &submit, cs, extra_shmem, direct); - if (result != VK_SUCCESS) - return result; - - uint32_t seqno; - const bool notify = - 
vn_ring_submit_internal(ring, submit.submit, submit.cs, &seqno); - if (notify) { - uint32_t notify_ring_data[8]; - struct vn_cs_encoder local_enc = VN_CS_ENCODER_INITIALIZER_LOCAL( - notify_ring_data, sizeof(notify_ring_data)); - vn_encode_vkNotifyRingMESA(&local_enc, 0, ring->id, seqno, 0); - vn_renderer_submit_simple(ring->instance->renderer, notify_ring_data, - vn_cs_encoder_get_len(&local_enc)); - } - - vn_ring_submission_cleanup(&submit); - - if (ring_seqno) - *ring_seqno = seqno; - - return VK_SUCCESS; -} - -static VkResult -vn_ring_submission_prepare(struct vn_ring *ring, - struct vn_ring_submission *submit, - const struct vn_cs_encoder *cs, - struct vn_renderer_shmem *extra_shmem, - bool direct) -{ - submit->cs = vn_ring_submission_get_cs(submit, cs, direct); - if (!submit->cs) - return VK_ERROR_OUT_OF_HOST_MEMORY; - - submit->submit = - vn_ring_submission_get_ring_submit(ring, cs, extra_shmem, direct); - if (!submit->submit) { - vn_ring_submission_cleanup(submit); - return VK_ERROR_OUT_OF_HOST_MEMORY; - } - - return VK_SUCCESS; -} - -static bool -vn_ring_submit_internal(struct vn_ring *ring, - struct vn_ring_submit *submit, - const struct vn_cs_encoder *cs, - uint32_t *seqno) -{ - /* write cs to the ring */ - assert(!vn_cs_encoder_is_empty(cs)); - - /* avoid -Wmaybe-unitialized */ - uint32_t cur_seqno = 0; - - for (uint32_t i = 0; i < cs->buffer_count; i++) { - const struct vn_cs_encoder_buffer *buf = &cs->buffers[i]; - cur_seqno = vn_ring_wait_space(ring, buf->committed_size); - vn_ring_write_buffer(ring, buf->base, buf->committed_size); - } - - vn_ring_store_tail(ring); - const VkRingStatusFlagsMESA status = vn_ring_load_status(ring); - if (status & VK_RING_STATUS_FATAL_BIT_MESA) { - vn_log(NULL, "vn_ring_submit abort on fatal"); - abort(); - } - - vn_ring_retire_submits(ring, cur_seqno); - - submit->seqno = ring->cur; - list_addtail(&submit->head, &ring->submits); - - *seqno = submit->seqno; - - /* Notify renderer to wake up idle ring if at least VN_RING_IDLE_TIMEOUT_NS - * has passed since the last sent notification to avoid excessive wake up - * calls (non-trivial since submitted via virtio-gpu kernel). 
- */ - if (status & VK_RING_STATUS_IDLE_BIT_MESA) { - const int64_t now = os_time_get_nano(); - if (os_time_timeout(ring->last_notify, ring->next_notify, now)) { - ring->last_notify = now; - ring->next_notify = now + VN_RING_IDLE_TIMEOUT_NS; - return true; - } - } - return false; -} - -static void -vn_ring_write_buffer(struct vn_ring *ring, const void *data, uint32_t size) -{ - assert(ring->cur + size - vn_ring_load_head(ring) <= ring->buffer_size); - - const uint32_t offset = ring->cur & ring->buffer_mask; - if (offset + size <= ring->buffer_size) { - memcpy(ring->shared.buffer + offset, data, size); - } else { - const uint32_t s = ring->buffer_size - offset; - memcpy(ring->shared.buffer + offset, data, s); - memcpy(ring->shared.buffer, data + s, size - s); - } - - ring->cur += size; -} From f15fedf17ca16e4863508298794e1ce87e83a123 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 6 May 2025 17:56:17 +0200 Subject: [PATCH 028/117] podman_compile: delete the pod before compiling --- podman_compile.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/podman_compile.sh b/podman_compile.sh index 47e4baee07037..4793b4ce20fa2 100755 --- a/podman_compile.sh +++ b/podman_compile.sh @@ -19,9 +19,12 @@ fi cmd="bash ./build.$what.sh" +POD_NAME=mac_ai_compiling +podman machine ssh podman rm $POD_NAME --force + set -x podman run \ ---name mac_ai_compiling \ +--name $POD_NAME \ --user root:root \ --cgroupns host \ --security-opt label=disable \ From 0c264b180241db33a3555babdde02d8cd6f73e3e Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 7 May 2025 10:27:03 +0200 Subject: [PATCH 029/117] virtgpu-utils: add WARNING --- ggml/src/ggml-remotingfrontend/virtgpu-utils.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h index 9d1589c9128ab..7da90be25c380 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h @@ -33,6 +33,17 @@ INFO(const char *format, ...) { va_end(argptr); } +inline void +WARNING(const char *format, ...) { + fprintf(stderr, "WARNING: "); + + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); +} + inline void FATAL(const char *format, ...) { fprintf(stderr, "FATAL: "); @@ -42,7 +53,7 @@ FATAL(const char *format, ...) 
{ vfprintf(stderr, format, argptr); fprintf(stderr, "\n"); va_end(argptr); - exit(1); + assert(false); } static inline bool From 938ba6b7a51ed844b158b48b8a2a13601ae02a8f Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 7 May 2025 10:27:37 +0200 Subject: [PATCH 030/117] virtgpu: split the remote call into prepare/call/finish --- ggml/src/ggml-remotingfrontend/virtgpu.cpp | 149 +++++++++++++-------- ggml/src/ggml-remotingfrontend/virtgpu.h | 8 +- 2 files changed, 102 insertions(+), 55 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp index a88d07c8198fd..db484f04e9d6a 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -67,19 +67,59 @@ create_virtgpu() { if (!gpu->reply_shmem) { FATAL("%s: failed to create the reply shared memory page :/", __func__); - assert(false); } - uint32_t ret = remote_call(gpu, VIRGL_VK_COMMAND_TYPE_LoadLibrary, 0, 0, 0, 0); + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + int32_t ret; + + encoder = remote_call_prepare(gpu, VIRGL_VK_COMMAND_TYPE_LoadLibrary, 0); + if (!encoder) { + FATAL("%s: failed to prepare the remote call encoder :/", __func__); + } + decoder = remote_call(gpu, encoder); + if (!decoder) { + FATAL("%s: failed to kick the remote call :/", __func__); + } + + ret = remote_call_finish(encoder, decoder); if (ret != 0) { FATAL("%s: failed to load the APIR backend libraries (code=%d):/", __func__, ret); - assert(false); } - ret = remote_call(gpu, VIRGL_VK_COMMAND_TYPE_Forward, 0, 111, 555, 999); + + int32_t forward_flag = 0; + encoder = remote_call_prepare(gpu, VIRGL_VK_COMMAND_TYPE_Forward, forward_flag); + if (!encoder) { + FATAL("%s: failed to prepare the remote call encoder :/", __func__); + } + + int32_t arg1 = 11; + int32_t arg2 = 22; + int32_t arg3 = 33; + + vn_encode_int32_t(encoder, &arg1); + vn_encode_int32_t(encoder, &arg2); + vn_encode_int32_t(encoder, &arg3); + decoder = remote_call(gpu, encoder); + if (!decoder) { + FATAL("%s: failed to kick the remote call :/", __func__); + } + + int32_t resp1; + int32_t resp2; + int32_t resp3; + int32_t resp4; + vn_decode_int32_t(decoder, &resp1); + vn_decode_int32_t(decoder, &resp2); + vn_decode_int32_t(decoder, &resp3); + vn_decode_int32_t(decoder, &resp4); + INFO("%s: Forward RESP %d %d %d %d", __func__, resp1, resp2, resp3, resp4); + + ret = remote_call_finish(encoder, decoder); if (ret != 0) { - FATAL("%s: failed to forard the API call (code=%d):/", __func__, ret); - assert(false); + FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); } + thks_bye(); } @@ -355,50 +395,71 @@ virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param) } -static int remote_call( +static struct vn_cs_encoder *remote_call_prepare( struct virtgpu *gpu, int32_t cmd_type, - int32_t cmd_flags, - int32_t arg1, int32_t arg2, int32_t arg3 - ) + int32_t cmd_flags) { + if (!gpu->reply_shmem) { + FATAL("%s: the reply shmem page can't be null", __func__); + } + /* - * Prepare the command encoder buffer + * Prepare the command encoder and its buffer */ - char encoder_buffer[4096]; + static char encoder_buffer[4096]; - struct vn_cs_encoder _encoder = { + static struct vn_cs_encoder enc; + enc = { + encoder_buffer, encoder_buffer, encoder_buffer + sizeof(encoder_buffer), }; - struct vn_cs_encoder *encoder = &_encoder; /* - * Fill the command encoder buffer + * Fill the command encoder with the common args: + * - cmd_type (int32_t) + * - cmd_flags (int32_t) + * - reply res 
id (uint32_t) */ - vn_encode_int32_t(encoder, &cmd_type); - vn_encode_int32_t(encoder, &cmd_flags); - - if (!gpu->reply_shmem) { - FATAL("%s: the reply shmem page can't be null", __func__); - } + vn_encode_int32_t(&enc, &cmd_type); + vn_encode_int32_t(&enc, &cmd_flags); uint32_t reply_res_id = gpu->reply_shmem->res_id; - vn_encode_uint32_t(encoder, &reply_res_id); + vn_encode_uint32_t(&enc, &reply_res_id); - printf("%s: call %s(flags=0x%x, reply_buf=%d)\n", __func__, + printf("%s: prepare %s(flags=0x%x, reply_buf=%d)\n", __func__, api_remoting_command_name(cmd_type), cmd_flags, reply_res_id); - vn_encode_int32_t(encoder, &arg1); - vn_encode_int32_t(encoder, &arg2); - vn_encode_int32_t(encoder, &arg3); + return &enc; +} + +static int32_t remote_call_finish(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + if (!enc) { + WARNING("Invalid (null) encoder :/"); + } + if (!dec) { + FATAL("Invalid (null) decoder :/"); + } + int32_t remote_call_ret; + vn_decode_int32_t(dec, &remote_call_ret); + + // encoder and decoder are statically allocated, nothing to do to release them + + return remote_call_ret; +} +static struct vn_cs_decoder *remote_call( + struct virtgpu *gpu, + struct vn_cs_encoder *encoder + ) +{ /* - * Reply notification pointer + * Prepare the reply notification pointer */ volatile std::atomic_uint *atomic_reply_notif = (volatile std::atomic_uint *) gpu->reply_shmem->mmap_ptr; @@ -410,8 +471,8 @@ static int remote_call( struct drm_virtgpu_execbuffer args = { .flags = VIRTGPU_EXECBUF_RING_IDX, - .size = sizeof(encoder_buffer), - .command = (uintptr_t) encoder_buffer, + .size = (uint32_t) (encoder->cur - encoder->start), + .command = (uintptr_t) encoder->start, .bo_handles = 0, .num_bo_handles = 0, @@ -441,31 +502,11 @@ static int remote_call( } /* - * Read the reply + * Prepare the decoder */ + static struct vn_cs_decoder dec; + dec.cur = (char *) gpu->reply_shmem->mmap_ptr + sizeof(*atomic_reply_notif); + dec.end = (char *) gpu->reply_shmem->mmap_ptr + gpu->reply_shmem->mmap_size; - struct vn_cs_decoder _dec = { - .cur = (char *) gpu->reply_shmem->mmap_ptr + sizeof(*atomic_reply_notif), - .end = (char *) gpu->reply_shmem->mmap_ptr + gpu->reply_shmem->mmap_size, - }; - struct vn_cs_decoder *dec = &_dec; - - int32_t resp1; - int32_t resp2; - int32_t resp3; - int32_t resp4; - vn_decode_int32_t(dec, &resp1); - vn_decode_int32_t(dec, &resp2); - vn_decode_int32_t(dec, &resp3); - vn_decode_int32_t(dec, &resp4); - - int32_t rmt_call_ret; - vn_decode_int32_t(dec, &rmt_call_ret); - - printf("%s: RESP %d %d %d %d\n", __func__, resp1, resp2, resp3, resp4); - - printf("%s: call %s() --> %d\n", __func__, - api_remoting_command_name(cmd_type), rmt_call_ret); - - return rmt_call_ret; + return &dec; } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.h b/ggml/src/ggml-remotingfrontend/virtgpu.h index 379a2174fc3db..faef2a02bc7d8 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu.h @@ -117,4 +117,10 @@ virtgpu_ioctl_get_caps(struct virtgpu *gpu, size_t capset_size); static uint64_t virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param); static void virtgpu_init_renderer_info(struct virtgpu *gpu); -static int remote_call(struct virtgpu *gpu, int32_t cmd_type, int32_t cmd_flags, int32_t arg1, int32_t arg2, int32_t arg3); + +static struct vn_cs_encoder *remote_call_prepare( + struct virtgpu *gpu, + int32_t cmd_type, + int32_t cmd_flags); +static struct vn_cs_decoder *remote_call(struct virtgpu *gpu, struct vn_cs_encoder *enc); +static int32_t 
remote_call_finish(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); From 0582bab359865d99119c3fee0237cac820dc7d60 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 7 May 2025 10:34:24 +0200 Subject: [PATCH 031/117] ggml-backend-reg: reindent --- .../ggml-backend-reg.cpp | 79 ++++++++++--------- .../src/ggml-remotingfrontend/ggml-remoting.h | 35 ++++---- 2 files changed, 58 insertions(+), 56 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp index cb77a31a037c8..93f35f7e2e26e 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp @@ -4,66 +4,67 @@ #include "ggml-remoting.h" static int ggml_backend_remoting_get_device_count() { - return 1; + + return 1; } static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) { - UNUSED(reg); - return ggml_backend_remoting_get_device_count(); + UNUSED(reg); + return ggml_backend_remoting_get_device_count(); } static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) { - static std::vector devices; + static std::vector devices; - static bool initialized = false; + static bool initialized = false; - { - static std::mutex mutex; - std::lock_guard lock(mutex); - if (!initialized) { + { + static std::mutex mutex; + std::lock_guard lock(mutex); + if (!initialized) { - create_virtgpu(); + create_virtgpu(); - for (size_t i = 0; i < ggml_backend_remoting_reg_get_device_count(reg); i++) { - ggml_backend_remoting_device_context * ctx = new ggml_backend_remoting_device_context; - char desc[256] = "API Remoting device"; + for (size_t i = 0; i < ggml_backend_remoting_reg_get_device_count(reg); i++) { + ggml_backend_remoting_device_context * ctx = new ggml_backend_remoting_device_context; + char desc[256] = "API Remoting device"; - ctx->device = i; - ctx->name = GGML_REMOTING_FRONTEND_NAME + std::to_string(i); - ctx->description = desc; - devices.push_back(new ggml_backend_device { - /* .iface = */ ggml_backend_remoting_device_i, - /* .reg = */ reg, - /* .context = */ ctx, - }); - } - initialized = true; - } + ctx->device = i; + ctx->name = GGML_REMOTING_FRONTEND_NAME + std::to_string(i); + ctx->description = desc; + devices.push_back(new ggml_backend_device { + /* .iface = */ ggml_backend_remoting_device_i, + /* .reg = */ reg, + /* .context = */ ctx, + }); + } + initialized = true; } + } - GGML_ASSERT(device < devices.size()); - return devices[device]; + GGML_ASSERT(device < devices.size()); + return devices[device]; } static const char * ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) { - UNUSED(reg); - return GGML_REMOTING_FRONTEND_NAME; + UNUSED(reg); + return GGML_REMOTING_FRONTEND_NAME; } static const struct ggml_backend_reg_i ggml_backend_remoting_reg_i = { - /* .get_name = */ ggml_backend_remoting_reg_get_name, - /* .get_device_count = */ ggml_backend_remoting_reg_get_device_count, - /* .get_device = */ ggml_backend_remoting_reg_get_device, - /* .get_proc_address = */ NULL, + /* .get_name = */ ggml_backend_remoting_reg_get_name, + /* .get_device_count = */ ggml_backend_remoting_reg_get_device_count, + /* .get_device = */ ggml_backend_remoting_reg_get_device, + /* .get_proc_address = */ NULL, }; ggml_backend_reg_t ggml_backend_remoting_frontend_reg() { - static ggml_backend_reg reg = { - /* .api_version = */ GGML_BACKEND_API_VERSION, - /* .iface = */ ggml_backend_remoting_reg_i, - /* .context = */ nullptr, - }; + static 
ggml_backend_reg reg = { + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_remoting_reg_i, + /* .context = */ nullptr, + }; - RMT_LOG_DEBUG("ggml_backend_remoting_frontend_reg() hello :wave:"); - return ® + RMT_LOG_DEBUG("ggml_backend_remoting_frontend_reg() hello :wave:"); + return ® } diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index c6acdf6cfe1c8..7dcc6641f7574 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -15,9 +15,10 @@ #define RMT_LOG_DEBUG(msg) std::cerr << msg << std::endl struct ggml_backend_remoting_device_context { - size_t device; - std::string name; - std::string description; + size_t device; + std::string name; + std::string description; + }; extern const struct ggml_backend_device_i ggml_backend_remoting_device_i; @@ -38,24 +39,24 @@ typedef std::shared_ptr remoting_device; typedef std::weak_ptr remoting_device_ref; struct ggml_backend_remoting_buffer_context { - remoting_device_ref device; - remoting_buffer dev_buffer; - std::string name; - - ggml_backend_remoting_buffer_context(remoting_device_ref device, remoting_buffer&& dev_buffer, std::string& name) : - name(name) { - UNUSED(device); - UNUSED(dev_buffer); - } - - ~ggml_backend_remoting_buffer_context() { - ggml_remoting_destroy_buffer(dev_buffer); - } + remoting_device_ref device; + remoting_buffer dev_buffer; + std::string name; + + ggml_backend_remoting_buffer_context(remoting_device_ref device, remoting_buffer&& dev_buffer, std::string& name) : + name(name) { + UNUSED(device); + UNUSED(dev_buffer); + } + + ~ggml_backend_remoting_buffer_context() { + ggml_remoting_destroy_buffer(dev_buffer); + } }; struct remoting_context_struct { - int i; + int i; }; typedef std::shared_ptr remoting_context; typedef std::weak_ptr remoting_context_ref; From be5f5e0e1042087d0a67a9c998bf9fe250f308d9 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 7 May 2025 10:34:44 +0200 Subject: [PATCH 032/117] move thks_bye() to virtgpu-utils --- ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp | 11 +++++++++++ ggml/src/ggml-remotingfrontend/virtgpu.cpp | 9 --------- ggml/src/ggml-remotingfrontend/virtgpu.h | 2 -- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp index 100f495add1bc..f1af0d3391550 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp @@ -8,6 +8,8 @@ #define NODE_LEVEL_MASK ((uintptr_t)NODE_ALLOC_ALIGN - 1) #define NULL_NODE 0 +void thks_bye(); + #define os_malloc_aligned(_size, _align) _aligned_malloc(_size, _align) #define os_free_aligned(_ptr) free(_ptr) #define p_atomic_cmpxchg(v, old, _new) \ @@ -184,3 +186,12 @@ util_sparse_array_get(struct util_sparse_array *arr, uint64_t idx) uint64_t elem_idx = idx & ((1ull << node_size_log2) - 1); return (void *)((char *)node_data + (elem_idx * arr->elem_size)); } + +void *something = NULL; +void thks_bye () { + // break here + INFO("thks bye, stopping early and happilly :)"); + if (!something) { // avoid the [[noreturn]] detection mechanism + exit(0); + } +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp index db484f04e9d6a..fbffbb361f016 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -33,15 +33,6 @@ virtgpu_init_shmem_blob_mem(struct 
virtgpu *gpu) gpu->shmem_blob_mem = VIRTGPU_BLOB_MEM_HOST3D; } -void *something = NULL; -void thks_bye () { - // break here - INFO("thks bye, stopping early and happilly :)"); - if (!something) { // avoid the [[noreturn]] detection mechanism - exit(0); - } -} - void create_virtgpu() { struct virtgpu *gpu = new struct virtgpu(); diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.h b/ggml/src/ggml-remotingfrontend/virtgpu.h index faef2a02bc7d8..f252e98ffd3af 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu.h @@ -13,8 +13,6 @@ #include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/api_remoting.h" #include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/venus_cs.h" -void thks_bye(); - #include "virtgpu-shm.h" #define VIRGL_RENDERER_UNSTABLE_APIS 1 From 60bac85a3eb36b28421be80fd31581a9456e8341 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 7 May 2025 10:35:06 +0200 Subject: [PATCH 033/117] virtgpu: remove forward call wip code --- ggml/src/ggml-remotingfrontend/virtgpu.cpp | 36 +--------------------- 1 file changed, 1 insertion(+), 35 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp index fbffbb361f016..f7de8a5b66b4e 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -77,43 +77,9 @@ create_virtgpu() { if (ret != 0) { FATAL("%s: failed to load the APIR backend libraries (code=%d):/", __func__, ret); } - - int32_t forward_flag = 0; - encoder = remote_call_prepare(gpu, VIRGL_VK_COMMAND_TYPE_Forward, forward_flag); - if (!encoder) { - FATAL("%s: failed to prepare the remote call encoder :/", __func__); - } - - int32_t arg1 = 11; - int32_t arg2 = 22; - int32_t arg3 = 33; - - vn_encode_int32_t(encoder, &arg1); - vn_encode_int32_t(encoder, &arg2); - vn_encode_int32_t(encoder, &arg3); - decoder = remote_call(gpu, encoder); - if (!decoder) { - FATAL("%s: failed to kick the remote call :/", __func__); - } - - int32_t resp1; - int32_t resp2; - int32_t resp3; - int32_t resp4; - vn_decode_int32_t(decoder, &resp1); - vn_decode_int32_t(decoder, &resp2); - vn_decode_int32_t(decoder, &resp3); - vn_decode_int32_t(decoder, &resp4); - INFO("%s: Forward RESP %d %d %d %d", __func__, resp1, resp2, resp3, resp4); - - ret = remote_call_finish(encoder, decoder); - if (ret != 0) { - FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); - } - - thks_bye(); } + static virt_gpu_result_t virtgpu_open(struct virtgpu *gpu) { From abd176f1acd772d056e9ba9c4640628e772656ea Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 7 May 2025 14:18:42 +0200 Subject: [PATCH 034/117] ggml-remotingfrontend: build the apir framework --- ggml/src/ggml-remotingfrontend/CMakeLists.txt | 1 + .../ggml-backend-device.cpp | 110 +++++++++++------- .../ggml-backend-reg.cpp | 40 ++++++- .../src/ggml-remotingfrontend/ggml-remoting.h | 7 ++ .../ggml-remotingfrontend/virtgpu-forward.cpp | 35 ++++++ .../ggml-remotingfrontend/virtgpu-forward.h | 2 + .../ggml-remotingfrontend/virtgpu-utils.cpp | 2 - .../src/ggml-remotingfrontend/virtgpu-utils.h | 2 + ggml/src/ggml-remotingfrontend/virtgpu.cpp | 34 +++++- ggml/src/ggml-remotingfrontend/virtgpu.h | 27 +---- 10 files changed, 185 insertions(+), 75 deletions(-) create mode 100644 ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp create mode 100644 ggml/src/ggml-remotingfrontend/virtgpu-forward.h diff --git 
a/ggml/src/ggml-remotingfrontend/CMakeLists.txt b/ggml/src/ggml-remotingfrontend/CMakeLists.txt index 678623f972fc1..df45db51f46b3 100644 --- a/ggml/src/ggml-remotingfrontend/CMakeLists.txt +++ b/ggml/src/ggml-remotingfrontend/CMakeLists.txt @@ -13,6 +13,7 @@ ggml_add_backend_library(ggml-remotingfrontend virtgpu.cpp virtgpu-shm.cpp virtgpu-utils.cpp + virtgpu-forward.cpp ../../include/ggml-remoting-frontend.h ) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index b18ce03a37121..70bb6756b315d 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -1,81 +1,105 @@ #include "ggml-remoting.h" static const char * ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) { - UNUSED(dev); - return "API Remoting"; + UNUSED(dev); + + NOT_IMPLEMENTED; + + return "API Remoting"; } static const char * ggml_backend_remoting_device_get_description(ggml_backend_dev_t dev) { - UNUSED(dev); - return "API Remoting device"; + UNUSED(dev); + + NOT_IMPLEMENTED; + + return "API Remoting device"; } static enum ggml_backend_dev_type ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) { - UNUSED(dev); - return GGML_BACKEND_DEVICE_TYPE_GPU; + UNUSED(dev); + + NOT_IMPLEMENTED; + + return GGML_BACKEND_DEVICE_TYPE_GPU; } static void ggml_backend_remoting_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) { - UNUSED(device); - *total = 1024*1024*1024; - *free = *total; + UNUSED(device); + + NOT_IMPLEMENTED; + + *total = 1024*1024*1024; + *free = *total; } static bool ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { UNUSED(dev); UNUSED(op); + //NOT_IMPLEMENTED; // to chatty + return true; } static bool ggml_backend_remoting_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - UNUSED(dev); - UNUSED(buft); - return true; + UNUSED(dev); + UNUSED(buft); + + NOT_IMPLEMENTED; + + return true; } static bool ggml_backend_remoting_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { - const int min_batch_size = 32; + const int min_batch_size = 32; + + NOT_IMPLEMENTED; - return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) || - (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID); + return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) || + (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID); - UNUSED(dev); + UNUSED(dev); } static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_host_buffer_type(ggml_backend_dev_t dev) { - UNUSED(dev); - return ggml_backend_remoting_host_buffer_type(); + UNUSED(dev); + + // NOT_IMPLEMENTED; // too chatty + + return ggml_backend_remoting_host_buffer_type(); } static void ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { - props->name = ggml_backend_remoting_device_get_name(dev); - props->description = ggml_backend_remoting_device_get_description(dev); - props->type = ggml_backend_remoting_device_get_type(dev); - ggml_backend_remoting_device_get_memory(dev, &props->memory_free, &props->memory_total); - props->caps = { - /* .async = */ false, - /* .host_buffer = */ true, - /* .buffer_from_host_ptr = */ false, - /* .events = */ false, - }; + + IMPLEMENTED; + props->name = ggml_backend_remoting_device_get_name(dev); + props->description = ggml_backend_remoting_device_get_description(dev); + props->type = 
ggml_backend_remoting_device_get_type(dev); + ggml_backend_remoting_device_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* .async = */ false, + /* .host_buffer = */ true, + /* .buffer_from_host_ptr = */ false, + /* .events = */ false, + }; } const struct ggml_backend_device_i ggml_backend_remoting_device_i = { - /* .get_name = */ ggml_backend_remoting_device_get_name, - /* .get_description = */ ggml_backend_remoting_device_get_description, - /* .get_memory = */ ggml_backend_remoting_device_get_memory, - /* .get_type = */ ggml_backend_remoting_device_get_type, - /* .get_props = */ ggml_backend_remoting_device_get_props, - /* .init_backend = */ ggml_backend_remoting_device_init, - /* .get_buffer_type = */ ggml_backend_remoting_device_get_buffer_type, - /* .get_host_buffer_type = */ ggml_backend_remoting_device_get_host_buffer_type, - /* .buffer_from_host_ptr = */ NULL, - /* .supports_op = */ ggml_backend_remoting_device_supports_op, - /* .supports_buft = */ ggml_backend_remoting_device_supports_buft, - /* .offload_op = */ ggml_backend_remoting_device_offload_op, - /* .event_new = */ NULL, - /* .event_free = */ NULL, - /* .event_synchronize = */ NULL, + /* .get_name = */ ggml_backend_remoting_device_get_name, + /* .get_description = */ ggml_backend_remoting_device_get_description, + /* .get_memory = */ ggml_backend_remoting_device_get_memory, + /* .get_type = */ ggml_backend_remoting_device_get_type, + /* .get_props = */ ggml_backend_remoting_device_get_props, + /* .init_backend = */ ggml_backend_remoting_device_init, + /* .get_buffer_type = */ ggml_backend_remoting_device_get_buffer_type, + /* .get_host_buffer_type = */ ggml_backend_remoting_device_get_host_buffer_type, + /* .buffer_from_host_ptr = */ NULL, + /* .supports_op = */ ggml_backend_remoting_device_supports_op, + /* .supports_buft = */ ggml_backend_remoting_device_supports_buft, + /* .offload_op = */ ggml_backend_remoting_device_offload_op, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, }; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp index 93f35f7e2e26e..a0d1480508543 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp @@ -3,19 +3,53 @@ #include "ggml-remoting.h" -static int ggml_backend_remoting_get_device_count() { +static struct virtgpu *apir_gpu_instance = NULL; + +static int apir_initialize() { + static bool apir_initialized = false; + + if (apir_initialized) { + if (!apir_gpu_instance) { + return 0; + } + return 1; + } + apir_initialized = true; + + apir_gpu_instance = create_virtgpu(); + if (!apir_gpu_instance) { + FATAL("failed to initialize the virtgpu :/"); + return 0; + } + + apir_initialized = true; return 1; } +static int ggml_backend_remoting_get_device_count() { + if (!apir_initialize()) { + WARNING("apir_initialize failed :/"); + return 0; + } + IMPLEMENTED; + + return apir_get_device_count(apir_gpu_instance); +} + static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) { UNUSED(reg); + + IMPLEMENTED; + return ggml_backend_remoting_get_device_count(); } static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) { static std::vector devices; + IMPLEMENTED; + static bool initialized = false; { @@ -23,8 +57,6 @@ static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_ std::lock_guard lock(mutex); if 
(!initialized) { - create_virtgpu(); - for (size_t i = 0; i < ggml_backend_remoting_reg_get_device_count(reg); i++) { ggml_backend_remoting_device_context * ctx = new ggml_backend_remoting_device_context; char desc[256] = "API Remoting device"; @@ -48,6 +80,8 @@ static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_ static const char * ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) { UNUSED(reg); + printf("reached %s\n", __func__); + //thks_bye(); return GGML_REMOTING_FRONTEND_NAME; } diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index 7dcc6641f7574..5a20e371f6cea 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -12,6 +12,12 @@ #define UNUSED GGML_UNUSED +#define NOT_IMPLEMENTED \ + printf("WARN: ### reached unimplemented function %s\n", __func__) + +#define IMPLEMENTED \ + printf("INFO: ### reached implemented function %s\n", __func__) + #define RMT_LOG_DEBUG(msg) std::cerr << msg << std::endl struct ggml_backend_remoting_device_context { @@ -19,6 +25,7 @@ struct ggml_backend_remoting_device_context { std::string name; std::string description; + struct virtgpu *gpu; }; extern const struct ggml_backend_device_i ggml_backend_remoting_device_i; diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp new file mode 100644 index 0000000000000..a445c64929991 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp @@ -0,0 +1,35 @@ +#include "virtgpu.h" +#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/apir_backend.h" + +#define CACHED \ + printf("INFO: ### found response in the cache %s\n", __func__) + +int +apir_get_device_count(struct virtgpu *gpu) { + static int32_t dev_count = -1; + if (dev_count != -1) { + CACHED; + return dev_count; + } + int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_GET_DEVICE_COUNT; + struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); + if (!encoder) { + FATAL("%s: failed to prepare the remote call encoder :/", __func__); + } + + struct vn_cs_decoder *decoder = remote_call(gpu, encoder); + if (!decoder) { + FATAL("%s: failed to kick the remote call :/", __func__); + } + + vn_decode_int32_t(decoder, &dev_count); + + INFO("%s: Forward DEV COUNT --> %d ", __func__, dev_count); + + int32_t ret = remote_call_finish(encoder, decoder); + if (ret != 0) { + FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); + } + + return dev_count; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h new file mode 100644 index 0000000000000..28d23ededb188 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -0,0 +1,2 @@ +int +apir_get_device_count(struct virtgpu *gpu); diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp index f1af0d3391550..cedd31ddaaf9c 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp @@ -8,8 +8,6 @@ #define NODE_LEVEL_MASK ((uintptr_t)NODE_ALLOC_ALIGN - 1) #define NULL_NODE 0 -void thks_bye(); - #define os_malloc_aligned(_size, _align) _aligned_malloc(_size, _align) #define os_free_aligned(_ptr) free(_ptr) #define p_atomic_cmpxchg(v, old, _new) \ diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h 
b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h index 7da90be25c380..b02c3d106f7fe 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h @@ -24,6 +24,8 @@ #define p_atomic_read(_v) __atomic_load_n((_v), __ATOMIC_ACQUIRE) +void thks_bye(); + inline void INFO(const char *format, ...) { va_list argptr; diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp index f7de8a5b66b4e..679d8fcae6fe6 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -7,6 +7,25 @@ #include "virtgpu.h" +static virt_gpu_result_t virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev); +static virt_gpu_result_t virtgpu_open(struct virtgpu *gpu); + + +static virt_gpu_result_t virtgpu_init_params(struct virtgpu *gpu); +static virt_gpu_result_t virtgpu_init_capset(struct virtgpu *gpu); +static virt_gpu_result_t virtgpu_init_context(struct virtgpu *gpu); + +static int virtgpu_ioctl_context_init(struct virtgpu *gpu, + enum virgl_renderer_capset capset_id); +static int +virtgpu_ioctl_get_caps(struct virtgpu *gpu, + enum virgl_renderer_capset id, + uint32_t version, + void *capset, + size_t capset_size); +static uint64_t virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param); +static void virtgpu_init_renderer_info(struct virtgpu *gpu); + static inline void virtgpu_init_shmem_blob_mem(struct virtgpu *gpu) { @@ -33,7 +52,7 @@ virtgpu_init_shmem_blob_mem(struct virtgpu *gpu) gpu->shmem_blob_mem = VIRTGPU_BLOB_MEM_HOST3D; } -void +struct virtgpu * create_virtgpu() { struct virtgpu *gpu = new struct virtgpu(); @@ -64,7 +83,7 @@ create_virtgpu() { struct vn_cs_decoder *decoder; int32_t ret; - encoder = remote_call_prepare(gpu, VIRGL_VK_COMMAND_TYPE_LoadLibrary, 0); + encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_LoadLibrary, 0); if (!encoder) { FATAL("%s: failed to prepare the remote call encoder :/", __func__); } @@ -77,6 +96,8 @@ create_virtgpu() { if (ret != 0) { FATAL("%s: failed to load the APIR backend libraries (code=%d):/", __func__, ret); } + + return gpu; } @@ -352,7 +373,8 @@ virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param) } -static struct vn_cs_encoder *remote_call_prepare( +struct vn_cs_encoder * +remote_call_prepare( struct virtgpu *gpu, int32_t cmd_type, int32_t cmd_flags) @@ -395,7 +417,8 @@ static struct vn_cs_encoder *remote_call_prepare( return &enc; } -static int32_t remote_call_finish(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { +int32_t +remote_call_finish(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { if (!enc) { WARNING("Invalid (null) encoder :/"); } @@ -410,7 +433,8 @@ static int32_t remote_call_finish(struct vn_cs_encoder *enc, struct vn_cs_decode return remote_call_ret; } -static struct vn_cs_decoder *remote_call( +struct vn_cs_decoder * +remote_call( struct virtgpu *gpu, struct vn_cs_encoder *encoder ) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.h b/ggml/src/ggml-remotingfrontend/virtgpu.h index f252e98ffd3af..5ab934ec7fb78 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu.h @@ -9,6 +9,7 @@ #include #include +#include "virtgpu-forward.h" #include "virtgpu-utils.h" #include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/api_remoting.h" #include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/venus_cs.h" @@ -96,29 +97,11 @@ virtgpu_ioctl(struct virtgpu *gpu, unsigned 
long request, void *args) return drmIoctl(gpu->fd, request, args); } -void create_virtgpu(); -static virt_gpu_result_t virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev); -static virt_gpu_result_t virtgpu_open(struct virtgpu *gpu); +struct virtgpu *create_virtgpu(); - -static virt_gpu_result_t virtgpu_init_params(struct virtgpu *gpu); -static virt_gpu_result_t virtgpu_init_capset(struct virtgpu *gpu); -static virt_gpu_result_t virtgpu_init_context(struct virtgpu *gpu); - -static int virtgpu_ioctl_context_init(struct virtgpu *gpu, - enum virgl_renderer_capset capset_id); -static int -virtgpu_ioctl_get_caps(struct virtgpu *gpu, - enum virgl_renderer_capset id, - uint32_t version, - void *capset, - size_t capset_size); -static uint64_t virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param); -static void virtgpu_init_renderer_info(struct virtgpu *gpu); - -static struct vn_cs_encoder *remote_call_prepare( +struct vn_cs_encoder *remote_call_prepare( struct virtgpu *gpu, int32_t cmd_type, int32_t cmd_flags); -static struct vn_cs_decoder *remote_call(struct virtgpu *gpu, struct vn_cs_encoder *enc); -static int32_t remote_call_finish(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +struct vn_cs_decoder *remote_call(struct virtgpu *gpu, struct vn_cs_encoder *enc); +int32_t remote_call_finish(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); From 747728bea0bbb73ccdebecef74a208bc0980e1f9 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 7 May 2025 14:18:42 +0200 Subject: [PATCH 035/117] ggml-remotingbackend: build the apir framework --- ggml/src/ggml-remotingbackend/CMakeLists.txt | 2 + .../backend-dispatched.cpp | 68 ++++++++++++++ .../ggml-remotingbackend/backend-dispatched.h | 30 +++++++ .../ggml-remotingbackend/backend-internal.h | 36 +------- .../ggml-remotingbackend/backend-utils.cpp | 0 ggml/src/ggml-remotingbackend/backend-utils.h | 53 +++++++++++ ggml/src/ggml-remotingbackend/backend.cpp | 89 +++++-------------- .../shared/api_remoting.h | 8 +- .../shared/apir_backend.h | 8 ++ .../ggml-remotingbackend/shared/venus_cs.h | 5 ++ 10 files changed, 192 insertions(+), 107 deletions(-) create mode 100644 ggml/src/ggml-remotingbackend/backend-dispatched.cpp create mode 100644 ggml/src/ggml-remotingbackend/backend-dispatched.h create mode 100644 ggml/src/ggml-remotingbackend/backend-utils.cpp create mode 100644 ggml/src/ggml-remotingbackend/backend-utils.h diff --git a/ggml/src/ggml-remotingbackend/CMakeLists.txt b/ggml/src/ggml-remotingbackend/CMakeLists.txt index 420e283fc8359..7435c7726beee 100644 --- a/ggml/src/ggml-remotingbackend/CMakeLists.txt +++ b/ggml/src/ggml-remotingbackend/CMakeLists.txt @@ -5,6 +5,8 @@ message(STATUS "Enable API Remoting backend") ggml_add_backend_library(ggml-remotingbackend backend.cpp + backend-dispatched.cpp + backend-utils.cpp shared/api_remoting.h shared/apir_backend.h shared/venus_cs.h diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp new file mode 100644 index 0000000000000..d6ff3421a5f6c --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp @@ -0,0 +1,68 @@ +#include +#include "backend-internal.h" +#include "backend-dispatched.h" + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" +#include "ggml-remoting-backend.h" + +static ggml_backend_reg_t reg = NULL; + +uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p) { + if (reg != NULL) { + FATAL("%s: already initialized :/", __func__); + } + 
ggml_backend_reg_t (* ggml_backend_reg_fct)(void) = (ggml_backend_reg_t (*)()) ggml_backend_reg_fct_p; + + reg = ggml_backend_reg_fct(); + + return APIR_BACKEND_INITIALIZE_SUCCESSS; + +} + +static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) { + UNUSED(reg); + return 0; +} + +static const char *ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) { + UNUSED(reg); + + return GGML_REMOTING_BACKEND_NAME; +} + +static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) { + UNUSED(reg); + UNUSED(device); + + return NULL; +} + +static const struct ggml_backend_reg_i ggml_backend_remoting_reg_i = { + /* .get_name = */ ggml_backend_remoting_reg_get_name, + /* .get_device_count = */ ggml_backend_remoting_reg_get_device_count, + /* .get_device = */ ggml_backend_remoting_reg_get_device, + /* .get_proc_address = */ NULL, +}; + +ggml_backend_reg_t ggml_backend_remoting_backend_reg() { + static ggml_backend_reg reg = { + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_remoting_reg_i, + /* .context = */ nullptr, + }; + + INFO("%s, hello :wave:", __func__); + + return ® +} + +uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + UNUSED(dec); + + int32_t dev_count = reg->iface.get_device_count(reg); + vn_encode_int32_t(enc, &dev_count); + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h new file mode 100644 index 0000000000000..32d9ae2a140c5 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -0,0 +1,30 @@ +#pragma once + +#include +#include + +#include + +#include "backend-utils.h" +#include "shared/venus_cs.h" +#include "shared/apir_backend.h" + +uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p); + +typedef uint32_t (*backend_dispatch_t)(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); + +/* *** */ + +uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); + +static inline const char *backend_dispatch_command_name(ApirBackendCommandType type) +{ + switch (type) { + case APIR_COMMAND_TYPE_GET_DEVICE_COUNT: return "backend_reg__get_device_count"; + default: return "unknown"; + } +} + +static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATCH_TABLE_COUNT] = { + [APIR_COMMAND_TYPE_GET_DEVICE_COUNT] = backend_reg_get_device_count, +}; diff --git a/ggml/src/ggml-remotingbackend/backend-internal.h b/ggml/src/ggml-remotingbackend/backend-internal.h index e6c098ed95175..8828f08aa1052 100644 --- a/ggml/src/ggml-remotingbackend/backend-internal.h +++ b/ggml/src/ggml-remotingbackend/backend-internal.h @@ -1,40 +1,6 @@ #include #include - -static inline void INFO(const char* fmt, ...) { - printf("INFO: "); - va_list args; - va_start(args, fmt); - vprintf(fmt, args); - va_end(args); - - printf("\n"); -} - -static inline void ERROR(const char* fmt, ...) { - printf("ERROR: "); - va_list args; - va_start(args, fmt); - vprintf(fmt, args); - va_end(args); - - printf("\n"); -} - -static inline void FATAL(const char* fmt, ...) 
{ - printf("FATAL: "); - va_list args; - va_start(args, fmt); - vprintf(fmt, args); - va_end(args); - - printf("\n"); - - if (!fmt) - return; // avoid the noreturn attribute - - exit(1); -} +#include extern "C" { uint32_t apir_backend_initialize(); diff --git a/ggml/src/ggml-remotingbackend/backend-utils.cpp b/ggml/src/ggml-remotingbackend/backend-utils.cpp new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/ggml/src/ggml-remotingbackend/backend-utils.h b/ggml/src/ggml-remotingbackend/backend-utils.h new file mode 100644 index 0000000000000..b032061a96947 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-utils.h @@ -0,0 +1,53 @@ +#pragma once + +#include +#include +#include + +#include + +#define UNUSED GGML_UNUSED + +inline void +INFO(const char *format, ...) { + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); +} + +inline void +WARNING(const char *format, ...) { + fprintf(stderr, "WARNING: "); + + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); +} + +inline void +ERROR(const char *format, ...) { + fprintf(stderr, "ERROR: "); + + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); +} + +inline void +FATAL(const char *format, ...) { + fprintf(stderr, "FATAL: "); + + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); + if (format) + assert(false); +} diff --git a/ggml/src/ggml-remotingbackend/backend.cpp b/ggml/src/ggml-remotingbackend/backend.cpp index d858b033e3c9d..7cf24471a752e 100644 --- a/ggml/src/ggml-remotingbackend/backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend.cpp @@ -1,60 +1,21 @@ #include #include -#include "ggml-remoting-backend.h" - -#include "ggml-impl.h" -#include "ggml-backend-impl.h" -#include "ggml-backend.h" +#include +#include "backend-utils.h" #include "backend-internal.h" +#include "backend-dispatched.h" + #include "shared/apir_backend.h" #include "shared/venus_cs.h" -#define UNUSED GGML_UNUSED - -static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) { - UNUSED(reg); - return 0; -} - -static const char * ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) { - UNUSED(reg); - return GGML_REMOTING_BACKEND_NAME; -} - -static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) { - UNUSED(reg); - UNUSED(device); - - return NULL; -} - -static const struct ggml_backend_reg_i ggml_backend_remoting_reg_i = { - /* .get_name = */ ggml_backend_remoting_reg_get_name, - /* .get_device_count = */ ggml_backend_remoting_reg_get_device_count, - /* .get_device = */ ggml_backend_remoting_reg_get_device, - /* .get_proc_address = */ NULL, -}; - -ggml_backend_reg_t ggml_backend_remoting_backend_reg() { - static ggml_backend_reg reg = { - /* .api_version = */ GGML_BACKEND_API_VERSION, - /* .iface = */ ggml_backend_remoting_reg_i, - /* .context = */ nullptr, - }; +#define GGML_BACKEND_LIBRARY_PATH "/Users/kevinpouget/remoting/llama_cpp/build.remoting-backend/bin/libggml-metal.dylib" +#define GGML_BACKEND_REG_FCT_NAME "ggml_backend_metal_reg" - INFO("%s, hello :wave:", __func__); - - return ® -} - -typedef ggml_backend_reg_t (*backend_reg_fct_t)(void); +static void *backend_library_handle = NULL; -#define GGML_BACKEND_METAL_LIBRARY_PATH 
"/Users/kevinpouget/remoting/llama_cpp/build.remoting-backend/bin/libggml-metal.dylib" -#define GGML_BACKEND_METAL_REG_FCT_NAME "ggml_backend_metal_reg" -static void *backend_library_handle = NULL; extern "C" { void apir_backend_deinit(void) { @@ -69,7 +30,7 @@ extern "C" { uint32_t apir_backend_initialize() { INFO("%s: hello :wave: \\o/", __func__); - backend_library_handle = dlopen(GGML_BACKEND_METAL_LIBRARY_PATH, RTLD_LAZY); + backend_library_handle = dlopen(GGML_BACKEND_LIBRARY_PATH, RTLD_LAZY); if (!backend_library_handle) { ERROR("Cannot open library: %s\n", dlerror()); @@ -77,7 +38,7 @@ extern "C" { return APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY; } - backend_reg_fct_t entrypoint_fct = (backend_reg_fct_t) dlsym(backend_library_handle, GGML_BACKEND_METAL_REG_FCT_NAME); + void *ggml_backend_reg_fct = dlsym(backend_library_handle, GGML_BACKEND_REG_FCT_NAME); const char* dlsym_error = dlerror(); if (dlsym_error) { ERROR("Cannot load symbol: %s\n", dlsym_error); @@ -85,10 +46,7 @@ extern "C" { return APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS; } - ggml_backend_reg_t reg = entrypoint_fct(); - INFO("%s: --> %s", __func__, reg->iface.get_name(reg)); - - return APIR_BACKEND_INITIALIZE_SUCCESSS; + return backend_dispatch_initialize(ggml_backend_reg_fct); } uint32_t apir_backend_dispatcher(uint32_t cmd_type, @@ -109,23 +67,18 @@ extern "C" { }; struct vn_cs_decoder *dec = &_dec; - int32_t arg1, arg2, arg3; - vn_decode_int32_t(dec, &arg1); - vn_decode_int32_t(dec, &arg2); - vn_decode_int32_t(dec, &arg3); - - INFO("%s: ARGS %d %d %d\n", __func__, arg1, arg2, arg3); - - int32_t resp1 = 1; - int32_t resp2 = 2; - int32_t resp3 = 3; - int32_t resp4 = 4; - vn_encode_int32_t(enc, &resp1); - vn_encode_int32_t(enc, &resp2); - vn_encode_int32_t(enc, &resp3); - vn_encode_int32_t(enc, &resp4); + + if (cmd_type > APIR_BACKEND_DISPATCH_TABLE_COUNT) { + ERROR("Received an invalid dispatch index (%d > %d)\n", + cmd_type, APIR_BACKEND_DISPATCH_TABLE_COUNT); + return APIR_BACKEND_FORWARD_INDEX_INVALID; + } + + backend_dispatch_t forward_fct = apir_backend_dispatch_table[cmd_type]; + uint32_t ret = forward_fct(enc, dec); + *enc_cur_after = enc->cur; - return 0; + return ret; } } diff --git a/ggml/src/ggml-remotingbackend/shared/api_remoting.h b/ggml/src/ggml-remotingbackend/shared/api_remoting.h index 0cac78cccdfda..1df5498c29c03 100644 --- a/ggml/src/ggml-remotingbackend/shared/api_remoting.h +++ b/ggml/src/ggml-remotingbackend/shared/api_remoting.h @@ -1,13 +1,13 @@ -#define VIRGL_VK_COMMAND_TYPE_LoadLibrary 255 -#define VIRGL_VK_COMMAND_TYPE_Forward 256 +#define VIRGL_APIR_COMMAND_TYPE_LoadLibrary 255 +#define VIRGL_APIR_COMMAND_TYPE_Forward 256 static inline const char *api_remoting_command_name(int32_t type) { switch (type) { - case VIRGL_VK_COMMAND_TYPE_LoadLibrary: return "LoadLibrary"; - case VIRGL_VK_COMMAND_TYPE_Forward: return "Forward"; + case VIRGL_APIR_COMMAND_TYPE_LoadLibrary: return "LoadLibrary"; + case VIRGL_APIR_COMMAND_TYPE_Forward: return "Forward"; default: return "unknown"; } } diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 8506ffa46b759..c5a9dbd05e8dd 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -11,6 +11,8 @@ #define APIR_BACKEND_INITIALIZE_MISSING_BACKEND_SYMBOLS 3 #define APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS 4 +#define APIR_BACKEND_FORWARD_INDEX_INVALID 5 + typedef uint32_t (*apir_backend_initialize_t)(void); 
typedef void (*apir_backend_deinit_t)(void); @@ -19,3 +21,9 @@ typedef uint32_t (*apir_backend_dispatch_t)(uint32_t cmd_type, char *enc_cur, const char *enc_end, char **enc_cur_after ); + +typedef enum ApirBackendCommandType { + APIR_COMMAND_TYPE_GET_DEVICE_COUNT = 0, +} ApirBackendCommandType; + +#define APIR_BACKEND_DISPATCH_TABLE_COUNT 1 // last command_type index + 1 diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h index d9397c6d5d647..5a3ed16ad4100 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -1,5 +1,10 @@ #pragma once +#include +#include + +// needs FATAL to be defined + #define likely(x) __builtin_expect(!!(x), 1) #define unlikely(x) __builtin_expect(!!(x), 0) From 00be43f69276646391100bf66d41fb19a8f8a52d Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 13 May 2025 11:45:47 +0200 Subject: [PATCH 036/117] Add support for device name and description --- .../backend-dispatched.cpp | 30 +++ .../ggml-remotingbackend/backend-dispatched.h | 6 + .../shared/apir_backend.h | 6 +- .../ggml-remotingbackend/shared/venus_cs.h | 241 ++++++++++++------ .../ggml-backend-device.cpp | 16 +- .../ggml-backend-reg.cpp | 52 ++-- .../src/ggml-remotingfrontend/ggml-remoting.h | 6 +- .../ggml-remotingfrontend/virtgpu-forward.cpp | 72 ++++++ .../ggml-remotingfrontend/virtgpu-forward.h | 5 +- .../src/ggml-remotingfrontend/virtgpu-utils.h | 4 + 10 files changed, 319 insertions(+), 119 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp index d6ff3421a5f6c..9cee43e751ca4 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp @@ -8,6 +8,7 @@ #include "ggml-remoting-backend.h" static ggml_backend_reg_t reg = NULL; +static ggml_backend_dev_t dev = NULL; uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p) { if (reg != NULL) { @@ -16,6 +17,9 @@ uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p) { ggml_backend_reg_t (* ggml_backend_reg_fct)(void) = (ggml_backend_reg_t (*)()) ggml_backend_reg_fct_p; reg = ggml_backend_reg_fct(); + if (reg->iface.get_device_count(reg)) { + dev = reg->iface.get_device(reg, 0); + } return APIR_BACKEND_INITIALIZE_SUCCESSS; @@ -66,3 +70,29 @@ uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_de return 0; } + +uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + UNUSED(dec); + + const char *string = dev->iface.get_name(dev); + + const size_t string_size = strlen(string) + 1; + vn_encode_array_size(enc, string_size); + vn_encode_char_array(enc, string, string_size); + + return 0; +} + + +uint32_t +backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + UNUSED(dec); + + const char *string = dev->iface.get_description(dev); + + const size_t string_size = strlen(string) + 1; + vn_encode_array_size(enc, string_size); + vn_encode_char_array(enc, string, string_size); + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h index 32d9ae2a140c5..39a1d2ffa9881 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.h +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -16,15 +16,21 @@ typedef uint32_t (*backend_dispatch_t)(struct vn_cs_encoder *enc, struct vn_cs_d /* *** */ uint32_t 
backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +uint32_t backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); static inline const char *backend_dispatch_command_name(ApirBackendCommandType type) { switch (type) { case APIR_COMMAND_TYPE_GET_DEVICE_COUNT: return "backend_reg__get_device_count"; + case APIR_COMMAND_TYPE_GET_DEVICE_NAME: return "backend_reg__get_device_name"; + case APIR_COMMAND_TYPE_GET_DEVICE_DESCRIPTION: return "backend_reg__get_device_description"; default: return "unknown"; } } static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATCH_TABLE_COUNT] = { [APIR_COMMAND_TYPE_GET_DEVICE_COUNT] = backend_reg_get_device_count, + [APIR_COMMAND_TYPE_GET_DEVICE_NAME] = backend_device_get_name, + [APIR_COMMAND_TYPE_GET_DEVICE_DESCRIPTION] = backend_device_get_description, }; diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index c5a9dbd05e8dd..f8183c8f0f731 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -24,6 +24,8 @@ typedef uint32_t (*apir_backend_dispatch_t)(uint32_t cmd_type, typedef enum ApirBackendCommandType { APIR_COMMAND_TYPE_GET_DEVICE_COUNT = 0, -} ApirBackendCommandType; + APIR_COMMAND_TYPE_GET_DEVICE_NAME = 1, + APIR_COMMAND_TYPE_GET_DEVICE_DESCRIPTION = 2, -#define APIR_BACKEND_DISPATCH_TABLE_COUNT 1 // last command_type index + 1 + APIR_BACKEND_DISPATCH_TABLE_COUNT = 3, // last command_type index + 1 +} ApirBackendCommandType; diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h index 5a3ed16ad4100..ebcab98a449f4 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -3,6 +3,7 @@ #include #include +// needs UNUSED to be defined // needs FATAL to be defined #define likely(x) __builtin_expect(!!(x), 1) @@ -29,18 +30,18 @@ vn_cs_decoder_peek_internal(const struct vn_cs_decoder *dec, void *val, size_t val_size) { - assert(val_size <= size); + assert(val_size <= size); - if (unlikely(size > (size_t) (dec->end - dec->cur))) { - FATAL("DECODER IS FULL :/"); - //vn_cs_decoder_set_fatal(dec); - memset(val, 0, val_size); - return false; - } + if (unlikely(size > (size_t) (dec->end - dec->cur))) { + FATAL("DECODER IS FULL :/"); + //vn_cs_decoder_set_fatal(dec); + memset(val, 0, val_size); + return false; + } - /* we should not rely on the compiler to optimize away memcpy... */ - memcpy(val, dec->cur, val_size); - return true; + /* we should not rely on the compiler to optimize away memcpy... 
*/ + memcpy(val, dec->cur, val_size); + return true; } static inline void @@ -49,7 +50,7 @@ vn_cs_decoder_peek(const struct vn_cs_decoder *dec, void *val, size_t val_size) { - vn_cs_decoder_peek_internal(dec, size, val, val_size); + vn_cs_decoder_peek_internal(dec, size, val, val_size); } /* @@ -62,8 +63,8 @@ vn_cs_decoder_read(struct vn_cs_decoder *dec, void *val, size_t val_size) { - if (vn_cs_decoder_peek_internal(dec, size, val, val_size)) - dec->cur += size; + if (vn_cs_decoder_peek_internal(dec, size, val, val_size)) + dec->cur += size; } static inline void @@ -72,12 +73,12 @@ vn_cs_encoder_write(struct vn_cs_encoder *enc, const void *val, size_t val_size) { - assert(val_size <= size); - assert(size <= ((size_t) (enc->end - enc->cur))); + assert(val_size <= size); + assert(size <= ((size_t) (enc->end - enc->cur))); - /* we should not rely on the compiler to optimize away memcpy... */ - memcpy(enc->cur, val, val_size); - enc->cur += size; + /* we should not rely on the compiler to optimize away memcpy... */ + memcpy(enc->cur, val, val_size); + enc->cur += size; } /* @@ -87,16 +88,16 @@ vn_cs_encoder_write(struct vn_cs_encoder *enc, static inline void vn_decode(struct vn_cs_decoder *dec, size_t size, void *data, size_t data_size) { - assert(size % 4 == 0); - vn_cs_decoder_read(dec, size, data, data_size); + assert(size % 4 == 0); + vn_cs_decoder_read(dec, size, data, data_size); } static inline void vn_encode(struct vn_cs_encoder *enc, size_t size, const void *data, size_t data_size) { - assert(size % 4 == 0); - /* TODO check if the generated code is optimal */ - vn_cs_encoder_write(enc, size, data, data_size); + assert(size % 4 == 0); + /* TODO check if the generated code is optimal */ + vn_cs_encoder_write(enc, size, data, data_size); } /* @@ -108,45 +109,45 @@ vn_encode(struct vn_cs_encoder *enc, size_t size, const void *data, size_t data_ static inline size_t vn_sizeof_uint64_t(const uint64_t *val) { - assert(sizeof(*val) == 8); - return 8; + assert(sizeof(*val) == 8); + return 8; } static inline void vn_encode_uint64_t(struct vn_cs_encoder *enc, const uint64_t *val) { - vn_encode(enc, 8, val, sizeof(*val)); + vn_encode(enc, 8, val, sizeof(*val)); } static inline void vn_decode_uint64_t(struct vn_cs_decoder *dec, uint64_t *val) { - vn_decode(dec, 8, val, sizeof(*val)); + vn_decode(dec, 8, val, sizeof(*val)); } static inline size_t vn_sizeof_uint64_t_array(const uint64_t *val, uint32_t count) { - assert(sizeof(*val) == 8); - const size_t size = sizeof(*val) * count; - assert(size >= count); - return size; + assert(sizeof(*val) == 8); + const size_t size = sizeof(*val) * count; + assert(size >= count); + return size; } static inline void vn_encode_uint64_t_array(struct vn_cs_encoder *enc, const uint64_t *val, uint32_t count) { - const size_t size = sizeof(*val) * count; - assert(size >= count); - vn_encode(enc, size, val, size); + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_encode(enc, size, val, size); } static inline void vn_decode_uint64_t_array(struct vn_cs_decoder *dec, uint64_t *val, uint32_t count) { - const size_t size = sizeof(*val) * count; - assert(size >= count); - vn_decode(dec, size, val, size); + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_decode(dec, size, val, size); } /* int32_t */ @@ -154,45 +155,45 @@ vn_decode_uint64_t_array(struct vn_cs_decoder *dec, uint64_t *val, uint32_t coun static inline size_t vn_sizeof_int32_t(const int32_t *val) { - assert(sizeof(*val) == 4); - return 4; + assert(sizeof(*val) == 4); 
+ return 4; } static inline void vn_encode_int32_t(struct vn_cs_encoder *enc, const int32_t *val) { - vn_encode(enc, 4, val, sizeof(*val)); + vn_encode(enc, 4, val, sizeof(*val)); } static inline void vn_decode_int32_t(struct vn_cs_decoder *dec, int32_t *val) { - vn_decode(dec, 4, val, sizeof(*val)); + vn_decode(dec, 4, val, sizeof(*val)); } static inline size_t vn_sizeof_int32_t_array(const int32_t *val, uint32_t count) { - assert(sizeof(*val) == 4); - const size_t size = sizeof(*val) * count; - assert(size >= count); - return size; + assert(sizeof(*val) == 4); + const size_t size = sizeof(*val) * count; + assert(size >= count); + return size; } static inline void vn_encode_int32_t_array(struct vn_cs_encoder *enc, const int32_t *val, uint32_t count) { - const size_t size = sizeof(*val) * count; - assert(size >= count); - vn_encode(enc, size, val, size); + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_encode(enc, size, val, size); } static inline void vn_decode_int32_t_array(struct vn_cs_decoder *dec, int32_t *val, uint32_t count) { - const size_t size = sizeof(*val) * count; - assert(size >= count); - vn_decode(dec, size, val, size); + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_decode(dec, size, val, size); } /* array size (uint64_t) */ @@ -200,42 +201,42 @@ vn_decode_int32_t_array(struct vn_cs_decoder *dec, int32_t *val, uint32_t count) static inline size_t vn_sizeof_array_size(uint64_t size) { - return vn_sizeof_uint64_t(&size); + return vn_sizeof_uint64_t(&size); } static inline void vn_encode_array_size(struct vn_cs_encoder *enc, uint64_t size) { - vn_encode_uint64_t(enc, &size); + vn_encode_uint64_t(enc, &size); } static inline uint64_t vn_decode_array_size(struct vn_cs_decoder *dec, uint64_t expected_size) { - uint64_t size; - vn_decode_uint64_t(dec, &size); - if (size != expected_size) { - FATAL("ENCODER IS FULL :/"); - //vn_cs_decoder_set_fatal(dec); - size = 0; - } - return size; + uint64_t size; + vn_decode_uint64_t(dec, &size); + if (size != expected_size) { + FATAL("ENCODER IS FULL :/"); + //vn_cs_decoder_set_fatal(dec); + size = 0; + } + return size; } static inline uint64_t vn_decode_array_size_unchecked(struct vn_cs_decoder *dec) { - uint64_t size; - vn_decode_uint64_t(dec, &size); - return size; + uint64_t size; + vn_decode_uint64_t(dec, &size); + return size; } static inline uint64_t vn_peek_array_size(struct vn_cs_decoder *dec) { - uint64_t size; - vn_cs_decoder_peek(dec, sizeof(size), &size, sizeof(size)); - return size; + uint64_t size; + vn_cs_decoder_peek(dec, sizeof(size), &size, sizeof(size)); + return size; } /* non-array pointer */ @@ -243,20 +244,20 @@ vn_peek_array_size(struct vn_cs_decoder *dec) static inline size_t vn_sizeof_simple_pointer(const void *val) { - return vn_sizeof_array_size(val ? 1 : 0); + return vn_sizeof_array_size(val ? 1 : 0); } static inline bool vn_encode_simple_pointer(struct vn_cs_encoder *enc, const void *val) { - vn_encode_array_size(enc, val ? 1 : 0); - return val; + vn_encode_array_size(enc, val ? 
1 : 0); + return val; } static inline bool vn_decode_simple_pointer(struct vn_cs_decoder *dec) { - return vn_decode_array_size_unchecked(dec); + return vn_decode_array_size_unchecked(dec); } /* uint32_t */ @@ -264,43 +265,113 @@ vn_decode_simple_pointer(struct vn_cs_decoder *dec) static inline size_t vn_sizeof_uint32_t(const uint32_t *val) { - assert(sizeof(*val) == 4); - return 4; + assert(sizeof(*val) == 4); + return 4; } static inline void vn_encode_uint32_t(struct vn_cs_encoder *enc, const uint32_t *val) { - vn_encode(enc, 4, val, sizeof(*val)); + vn_encode(enc, 4, val, sizeof(*val)); } static inline void vn_decode_uint32_t(struct vn_cs_decoder *dec, uint32_t *val) { - vn_decode(dec, 4, val, sizeof(*val)); + vn_decode(dec, 4, val, sizeof(*val)); } static inline size_t vn_sizeof_uint32_t_array(const uint32_t *val, uint32_t count) { - assert(sizeof(*val) == 4); - const size_t size = sizeof(*val) * count; - assert(size >= count); - return size; + assert(sizeof(*val) == 4); + const size_t size = sizeof(*val) * count; + assert(size >= count); + return size; } static inline void vn_encode_uint32_t_array(struct vn_cs_encoder *enc, const uint32_t *val, uint32_t count) { - const size_t size = sizeof(*val) * count; - assert(size >= count); - vn_encode(enc, size, val, size); + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_encode(enc, size, val, size); } static inline void vn_decode_uint32_t_array(struct vn_cs_decoder *dec, uint32_t *val, uint32_t count) { - const size_t size = sizeof(*val) * count; - assert(size >= count); - vn_decode(dec, size, val, size); + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_decode(dec, size, val, size); +} + +/* opaque blob */ + +static inline size_t +vn_sizeof_blob_array(const void *val, size_t size) +{ + UNUSED(val); + return (size + 3) & ~3; +} + +static inline void +vn_encode_blob_array(struct vn_cs_encoder *enc, const void *val, size_t size) +{ + vn_encode(enc, (size + 3) & ~3, val, size); +} + +static inline void +vn_decode_blob_array(struct vn_cs_decoder *dec, void *val, size_t size) +{ + vn_decode(dec, (size + 3) & ~3, val, size); +} + +/* string */ + +static inline size_t +vn_sizeof_char_array(const char *val, size_t size) +{ + return vn_sizeof_blob_array(val, size); +} + +static inline void +vn_encode_char_array(struct vn_cs_encoder *enc, const char *val, size_t size) +{ + assert(size && strlen(val) < size); + vn_encode_blob_array(enc, val, size); +} + +static inline void +vn_decode_char_array(struct vn_cs_decoder *dec, char *val, size_t size) +{ + vn_decode_blob_array(dec, val, size); + if (size) + val[size - 1] = '\0'; + else { + //vn_cs_decoder_set_fatal(dec); + FATAL("Couldn't decode the blog array"); + } +} + +/* (temp) buffer allocation */ + +static inline void * +vkr_cs_decoder_alloc_array(struct vkr_cs_decoder *dec, size_t size, size_t count) +{ + UNUSED(dec); + size_t alloc_size; + if (unlikely(__builtin_mul_overflow(size, count, &alloc_size))) { + FATAL("overflow in array allocation of %zu * %zu bytes", size, count); + return NULL; + } + + return malloc(alloc_size); +} + +static inline void * +vn_cs_decoder_alloc_array(struct vn_cs_decoder *dec, size_t size, size_t count) +{ + struct vkr_cs_decoder *d = (struct vkr_cs_decoder *)dec; + return vkr_cs_decoder_alloc_array(d, size, count); } diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index 70bb6756b315d..f84e6bd1d2f03 100644 --- 
a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -1,19 +1,19 @@ #include "ggml-remoting.h" -static const char * ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) { - UNUSED(dev); +static const char *ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) { + IMPLEMENTED; - NOT_IMPLEMENTED; + struct virtgpu *gpu = ((struct ggml_backend_remoting_device_context *) dev->context)->gpu; - return "API Remoting"; + return apir_get_device_name(gpu); } -static const char * ggml_backend_remoting_device_get_description(ggml_backend_dev_t dev) { - UNUSED(dev); +static const char *ggml_backend_remoting_device_get_description(ggml_backend_dev_t dev) { + IMPLEMENTED; - NOT_IMPLEMENTED; + struct virtgpu *gpu = ((struct ggml_backend_remoting_device_context *) dev->context)->gpu; - return "API Remoting device"; + return apir_get_device_description(gpu); } static enum ggml_backend_dev_type ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) { diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp index a0d1480508543..216c69ced375b 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp @@ -3,38 +3,36 @@ #include "ggml-remoting.h" -static struct virtgpu *apir_gpu_instance = NULL; - -static int apir_initialize() { +static struct virtgpu *apir_initialize() { + static struct virtgpu *apir_gpu_instance = NULL; static bool apir_initialized = false; if (apir_initialized) { - if (!apir_gpu_instance) { - return 0; - } - return 1; + return apir_gpu_instance; } apir_initialized = true; apir_gpu_instance = create_virtgpu(); if (!apir_gpu_instance) { FATAL("failed to initialize the virtgpu :/"); - return 0; + return NULL; } apir_initialized = true; - return 1; + return apir_gpu_instance; } static int ggml_backend_remoting_get_device_count() { - if (!apir_initialize()) { + IMPLEMENTED; + + struct virtgpu *gpu = apir_initialize(); + if (!gpu) { WARNING("apir_initialize failed :/"); return 0; } - IMPLEMENTED; - return apir_get_device_count(apir_gpu_instance); + return apir_get_device_count(gpu); } static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) { @@ -42,7 +40,13 @@ static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) IMPLEMENTED; - return ggml_backend_remoting_get_device_count(); + struct virtgpu *gpu = apir_initialize(); + if (!gpu) { + WARNING("apir_initialize failed :/"); + return 0; + } + + return apir_get_device_count(gpu); } static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) { @@ -50,6 +54,12 @@ static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_ IMPLEMENTED; + struct virtgpu *gpu = apir_initialize(); + if (!gpu) { + WARNING("apir_initialize failed :/"); + return 0; + } + static bool initialized = false; { @@ -58,12 +68,14 @@ static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_ if (!initialized) { for (size_t i = 0; i < ggml_backend_remoting_reg_get_device_count(reg); i++) { - ggml_backend_remoting_device_context * ctx = new ggml_backend_remoting_device_context; + ggml_backend_remoting_device_context *ctx = new ggml_backend_remoting_device_context; char desc[256] = "API Remoting device"; ctx->device = i; ctx->name = GGML_REMOTING_FRONTEND_NAME + std::to_string(i); ctx->description = desc; + ctx->gpu = gpu; + 
devices.push_back(new ggml_backend_device { /* .iface = */ ggml_backend_remoting_device_i, /* .reg = */ reg, @@ -78,10 +90,9 @@ static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_ return devices[device]; } -static const char * ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) { +static const char *ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) { UNUSED(reg); - printf("reached %s\n", __func__); - //thks_bye(); + return GGML_REMOTING_FRONTEND_NAME; } @@ -93,10 +104,15 @@ static const struct ggml_backend_reg_i ggml_backend_remoting_reg_i = { }; ggml_backend_reg_t ggml_backend_remoting_frontend_reg() { + struct virtgpu *gpu = apir_initialize(); + if (!gpu) { + FATAL("apir_initialize failed :/"); + return NULL; + } static ggml_backend_reg reg = { /* .api_version = */ GGML_BACKEND_API_VERSION, /* .iface = */ ggml_backend_remoting_reg_i, - /* .context = */ nullptr, + /* .context = */ gpu, }; RMT_LOG_DEBUG("ggml_backend_remoting_frontend_reg() hello :wave:"); diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index 5a20e371f6cea..c314623d809ab 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -10,13 +10,11 @@ #include "ggml-backend.h" #include "virtgpu.h" -#define UNUSED GGML_UNUSED - #define NOT_IMPLEMENTED \ printf("WARN: ### reached unimplemented function %s\n", __func__) -#define IMPLEMENTED \ - printf("INFO: ### reached implemented function %s\n", __func__) +#define IMPLEMENTED +// printf("INFO: ### reached implemented function %s\n", __func__) #define RMT_LOG_DEBUG(msg) std::cerr << msg << std::endl diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp index a445c64929991..04167a676e9a7 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp @@ -33,3 +33,75 @@ apir_get_device_count(struct virtgpu *gpu) { return dev_count; } + + +const char * +apir_get_device_name(struct virtgpu *gpu) { + static int32_t dev_count = -1; + if (dev_count != -1) { + CACHED; + return "Nothing"; + } + + int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_GET_DEVICE_NAME; + struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); + if (!encoder) { + FATAL("%s: failed to prepare the remote call encoder :/", __func__); + } + + struct vn_cs_decoder *decoder = remote_call(gpu, encoder); + if (!decoder) { + FATAL("%s: failed to kick the remote call :/", __func__); + } + + const size_t string_size = vn_decode_array_size_unchecked(decoder); + char *string = (char *) vn_cs_decoder_alloc_array(decoder, sizeof(char), string_size); + if (!string) { + FATAL("%s: Could not allocate the device name buffer", __func__); + } + vn_decode_char_array(decoder, string, string_size); + + INFO("%s: Forward DEV NAME --> %s", __func__, string); + + int32_t ret = remote_call_finish(encoder, decoder); + if (ret != 0) { + FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); + } + + return string; +} + +const char * +apir_get_device_description(struct virtgpu *gpu) { + static int32_t dev_count = -1; + if (dev_count != -1) { + CACHED; + return "Nothing"; + } + int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_GET_DEVICE_DESCRIPTION; + struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); + if (!encoder) { + FATAL("%s: failed to 
prepare the remote call encoder :/", __func__); + } + + struct vn_cs_decoder *decoder = remote_call(gpu, encoder); + if (!decoder) { + FATAL("%s: failed to kick the remote call :/", __func__); + } + + const size_t string_size = vn_decode_array_size_unchecked(decoder); + char *string = (char *) vn_cs_decoder_alloc_array(decoder, sizeof(char), string_size); + if (!string) { + FATAL("%s: Could not allocate the device description buffer", __func__); + } + vn_decode_char_array(decoder, string, string_size); + + INFO("%s: Forward DEV DESCR --> %s", __func__, string); + + int32_t ret = remote_call_finish(encoder, decoder); + if (ret != 0) { + FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); + } + + return string; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index 28d23ededb188..383fd2ea5a642 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -1,2 +1,3 @@ -int -apir_get_device_count(struct virtgpu *gpu); +int apir_get_device_count(struct virtgpu *gpu); +const char *apir_get_device_name(struct virtgpu *gpu); +const char *apir_get_device_description(struct virtgpu *gpu); diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h index b02c3d106f7fe..a6bd5df92ea6f 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h @@ -13,6 +13,10 @@ #define unlikely(x) __builtin_expect(!!(x), 0) #define likely(x) __builtin_expect(!!(x), 1) +#ifndef UNUSED +#define UNUSED(x) (void)(x) +#endif + /** Checks is a value is a power of two. Does not handle zero. */ #define IS_POT(v) (((v) & ((v) - 1)) == 0) From 3dd26d10c6c29c0df5c52b0226f00a4302443bb1 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 13 May 2025 13:28:35 +0200 Subject: [PATCH 037/117] ggml: src: ggml-metal/ggml-metal: make less verbose --- ggml/src/ggml-metal/ggml-metal.m | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index f226826020a5a..97f426cbd3e13 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -8,6 +8,9 @@ #import +#undef GGML_LOG_DEBUG +#define GGML_LOG_DEBUG(...) + #undef MIN #undef MAX #define MIN(a, b) ((a) < (b) ? 
(a) : (b)) @@ -776,8 +779,6 @@ @implementation GGMLMetalClass GGML_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ return NULL; \ } \ - } else { \ - GGML_LOG_WARN("%s: skipping %-40s (not supported)\n", __func__, "kernel_"#name); \ } const bool has_simdgroup_mm = ctx_dev->has_simdgroup_mm; From 11f65c5f42ae3cf94707b260d18cdd08a6fd8f96 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 13 May 2025 13:30:20 +0200 Subject: [PATCH 038/117] ggml-remotingbackend: include the ggml backend initialization --- .../backend-dispatched.cpp | 18 ++++++++++++++++-- .../ggml-remotingbackend/backend-dispatched.h | 2 +- ggml/src/ggml-remotingbackend/backend.cpp | 15 +++++++++++++-- .../ggml-remotingbackend/shared/apir_backend.h | 3 ++- 4 files changed, 32 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp index 9cee43e751ca4..f6849ccf58c3b 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp @@ -7,22 +7,36 @@ #include "ggml-backend.h" #include "ggml-remoting-backend.h" +#include "ggml-metal.h" + static ggml_backend_reg_t reg = NULL; static ggml_backend_dev_t dev = NULL; +static ggml_backend_t bck = NULL; -uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p) { +uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_backend_init_fct_p) { if (reg != NULL) { FATAL("%s: already initialized :/", __func__); } ggml_backend_reg_t (* ggml_backend_reg_fct)(void) = (ggml_backend_reg_t (*)()) ggml_backend_reg_fct_p; reg = ggml_backend_reg_fct(); + if (reg == NULL) { + FATAL("%s: backend registration failed :/", __func__); + } + if (reg->iface.get_device_count(reg)) { dev = reg->iface.get_device(reg, 0); } - return APIR_BACKEND_INITIALIZE_SUCCESSS; + ggml_backend_t (* ggml_backend_fct)(void) = (ggml_backend_t (*)()) ggml_backend_init_fct_p; + bck = ggml_backend_fct(); + if (!bck) { + ERROR("%s: backend initialization failed :/", __func__); + return APIR_BACKEND_INITIALIZE_BACKEND_FAILED; + } + + return APIR_BACKEND_INITIALIZE_SUCCESSS; } static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) { diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h index 39a1d2ffa9881..86c8c7618861b 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.h +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -9,7 +9,7 @@ #include "shared/venus_cs.h" #include "shared/apir_backend.h" -uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p); +uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_backend_init_fct_p); typedef uint32_t (*backend_dispatch_t)(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); diff --git a/ggml/src/ggml-remotingbackend/backend.cpp b/ggml/src/ggml-remotingbackend/backend.cpp index 7cf24471a752e..4bafac5c28e9a 100644 --- a/ggml/src/ggml-remotingbackend/backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend.cpp @@ -12,6 +12,7 @@ #define GGML_BACKEND_LIBRARY_PATH "/Users/kevinpouget/remoting/llama_cpp/build.remoting-backend/bin/libggml-metal.dylib" #define GGML_BACKEND_REG_FCT_NAME "ggml_backend_metal_reg" +#define GGML_BACKEND_INIT_FCT_NAME "ggml_backend_metal_init" static void *backend_library_handle = NULL; @@ -28,6 +29,8 @@ extern "C" { } uint32_t apir_backend_initialize() { + const char* dlsym_error; + INFO("%s: hello 
:wave: \\o/", __func__); backend_library_handle = dlopen(GGML_BACKEND_LIBRARY_PATH, RTLD_LAZY); @@ -39,14 +42,22 @@ extern "C" { } void *ggml_backend_reg_fct = dlsym(backend_library_handle, GGML_BACKEND_REG_FCT_NAME); - const char* dlsym_error = dlerror(); + dlsym_error = dlerror(); + if (dlsym_error) { + ERROR("Cannot load symbol: %s\n", dlsym_error); + + return APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS; + } + + void *ggml_backend_init_fct = dlsym(backend_library_handle, GGML_BACKEND_INIT_FCT_NAME); + dlsym_error = dlerror(); if (dlsym_error) { ERROR("Cannot load symbol: %s\n", dlsym_error); return APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS; } - return backend_dispatch_initialize(ggml_backend_reg_fct); + return backend_dispatch_initialize(ggml_backend_reg_fct, ggml_backend_init_fct); } uint32_t apir_backend_dispatcher(uint32_t cmd_type, diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index f8183c8f0f731..08050cfc18c92 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -10,8 +10,9 @@ #define APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY 2 #define APIR_BACKEND_INITIALIZE_MISSING_BACKEND_SYMBOLS 3 #define APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS 4 +#define APIR_BACKEND_INITIALIZE_BACKEND_FAILED 5 -#define APIR_BACKEND_FORWARD_INDEX_INVALID 5 +#define APIR_BACKEND_FORWARD_INDEX_INVALID 6 typedef uint32_t (*apir_backend_initialize_t)(void); typedef void (*apir_backend_deinit_t)(void); From f9a01ef01efe2cb423cf5300baf0c647104f83ae Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 13 May 2025 13:31:16 +0200 Subject: [PATCH 039/117] remoting: include device_get_type and device_get_memory --- .../backend-dispatched.cpp | 24 +++++- .../ggml-remotingbackend/backend-dispatched.h | 6 ++ .../shared/apir_backend.h | 4 +- .../ggml-backend-device.cpp | 15 ++-- .../ggml-remotingfrontend/virtgpu-forward.cpp | 74 +++++++++++++++++++ .../ggml-remotingfrontend/virtgpu-forward.h | 2 + 6 files changed, 115 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp index f6849ccf58c3b..d00a015c99d61 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp @@ -97,7 +97,6 @@ uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder return 0; } - uint32_t backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { UNUSED(dec); @@ -110,3 +109,26 @@ backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder * return 0; } + +uint32_t +backend_device_get_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + UNUSED(dec); + + uint32_t type = dev->iface.get_type(dev); + vn_encode_uint32_t(enc, &type); + + return 0; +} + +uint32_t +backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + UNUSED(dec); + + size_t free, total; + dev->iface.get_memory(dev, &free, &total); + + vn_encode_size_t(enc, &free); + vn_encode_size_t(enc, &total); + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h index 86c8c7618861b..beeec4ee566fe 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.h +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -18,6 +18,8 @@ typedef uint32_t (*backend_dispatch_t)(struct vn_cs_encoder *enc, 
struct vn_cs_d uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); uint32_t backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +uint32_t backend_device_get_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +uint32_t backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); static inline const char *backend_dispatch_command_name(ApirBackendCommandType type) { @@ -25,6 +27,8 @@ static inline const char *backend_dispatch_command_name(ApirBackendCommandType t case APIR_COMMAND_TYPE_GET_DEVICE_COUNT: return "backend_reg__get_device_count"; case APIR_COMMAND_TYPE_GET_DEVICE_NAME: return "backend_reg__get_device_name"; case APIR_COMMAND_TYPE_GET_DEVICE_DESCRIPTION: return "backend_reg__get_device_description"; + case APIR_COMMAND_TYPE_GET_DEVICE_TYPE: return "backend_reg__get_device_type"; + case APIR_COMMAND_TYPE_GET_DEVICE_MEMORY: return "backend_reg__get_device_memory"; default: return "unknown"; } } @@ -33,4 +37,6 @@ static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATC [APIR_COMMAND_TYPE_GET_DEVICE_COUNT] = backend_reg_get_device_count, [APIR_COMMAND_TYPE_GET_DEVICE_NAME] = backend_device_get_name, [APIR_COMMAND_TYPE_GET_DEVICE_DESCRIPTION] = backend_device_get_description, + [APIR_COMMAND_TYPE_GET_DEVICE_TYPE] = backend_device_get_type, + [APIR_COMMAND_TYPE_GET_DEVICE_MEMORY] = backend_device_get_memory, }; diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 08050cfc18c92..8733b53611502 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -27,6 +27,8 @@ typedef enum ApirBackendCommandType { APIR_COMMAND_TYPE_GET_DEVICE_COUNT = 0, APIR_COMMAND_TYPE_GET_DEVICE_NAME = 1, APIR_COMMAND_TYPE_GET_DEVICE_DESCRIPTION = 2, + APIR_COMMAND_TYPE_GET_DEVICE_TYPE = 3, + APIR_COMMAND_TYPE_GET_DEVICE_MEMORY = 4, - APIR_BACKEND_DISPATCH_TABLE_COUNT = 3, // last command_type index + 1 + APIR_BACKEND_DISPATCH_TABLE_COUNT = 5, // last command_type index + 1 } ApirBackendCommandType; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index f84e6bd1d2f03..55093ae246506 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -17,20 +17,19 @@ static const char *ggml_backend_remoting_device_get_description(ggml_backend_dev } static enum ggml_backend_dev_type ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) { - UNUSED(dev); + IMPLEMENTED; - NOT_IMPLEMENTED; + struct virtgpu *gpu = ((struct ggml_backend_remoting_device_context *) dev->context)->gpu; - return GGML_BACKEND_DEVICE_TYPE_GPU; + return (enum ggml_backend_dev_type) apir_get_device_type(gpu); } -static void ggml_backend_remoting_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) { - UNUSED(device); +static void ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + IMPLEMENTED; - NOT_IMPLEMENTED; + struct virtgpu *gpu = ((struct ggml_backend_remoting_device_context *) dev->context)->gpu; - *total = 1024*1024*1024; - *free = *total; + return apir_get_device_memory(gpu, free, total); } static bool ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const 
ggml_tensor * op) { diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp index 04167a676e9a7..617299541f148 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp @@ -105,3 +105,77 @@ apir_get_device_description(struct virtgpu *gpu) { return string; } + +uint32_t +apir_get_device_type(struct virtgpu *gpu) { + static uint32_t dev_type = 255; + if (dev_type != 255) { + CACHED; + return dev_type; + } + int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_GET_DEVICE_TYPE; + + struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); + if (!encoder) { + FATAL("%s: failed to prepare the remote call encoder :/", __func__); + } + + struct vn_cs_decoder *decoder = remote_call(gpu, encoder); + if (!decoder) { + FATAL("%s: failed to kick the remote call :/", __func__); + } + + vn_decode_uint32_t(decoder, &dev_type); + + INFO("%s: Forward DEV TYPE --> %d ", __func__, dev_type); + + int32_t ret = remote_call_finish(encoder, decoder); + if (ret != 0) { + FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); + } + + return dev_type; +} + +void +apir_get_device_memory(struct virtgpu *gpu, size_t *free, size_t *total) { + static size_t dev_free = 0; + static size_t dev_total = 0; + /* + if (dev_total != 0) { + WARNING("Not sure if llama.cpp expects fresh information for the free memory ..."); + *free = dev_free; + *total = dev_total; + + CACHED; + return; + } + */ + int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_GET_DEVICE_MEMORY; + + struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); + if (!encoder) { + FATAL("%s: failed to prepare the remote call encoder :/", __func__); + } + + struct vn_cs_decoder *decoder = remote_call(gpu, encoder); + if (!decoder) { + FATAL("%s: failed to kick the remote call :/", __func__); + } + + vn_decode_size_t(decoder, &dev_free); + vn_decode_size_t(decoder, &dev_total); + + *free = dev_free; + *total = dev_total; + + INFO("%s: Forward DEV FREE mem --> %zu MB", __func__, dev_free / 1024 / 1024); + INFO("%s: Forward DEV TOTAL mem --> %zu MB", __func__, dev_total / 1024 / 1024); + + int32_t ret = remote_call_finish(encoder, decoder); + if (ret != 0) { + FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); + } + + return; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index 383fd2ea5a642..13b523b2d3fbf 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -1,3 +1,5 @@ int apir_get_device_count(struct virtgpu *gpu); const char *apir_get_device_name(struct virtgpu *gpu); const char *apir_get_device_description(struct virtgpu *gpu); +uint32_t apir_get_device_type(struct virtgpu *gpu); +void apir_get_device_memory(struct virtgpu *gpu, size_t *free, size_t *total); From 2461dc9194545e8a140a3f0b6397c37518e35ec3 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 13 May 2025 13:31:45 +0200 Subject: [PATCH 040/117] ggml: src: ggml-remotingbackend/backend: make less verbose --- ggml/src/ggml-remotingbackend/backend.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend.cpp b/ggml/src/ggml-remotingbackend/backend.cpp index 4bafac5c28e9a..9a97b97a71f7c 100644 --- a/ggml/src/ggml-remotingbackend/backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend.cpp 
@@ -64,8 +64,6 @@ extern "C" { char *dec_cur, const char *dec_end, char *enc_cur, const char *enc_end, char **enc_cur_after) { - INFO("%s: --> %d | %p | %p ", __func__, cmd_type, dec_cur, enc_cur); - struct vn_cs_encoder _enc = { .cur = enc_cur, .end = enc_end, From 9ba6e061860fc37377bb9decd5553332c61f74a0 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 13 May 2025 13:32:06 +0200 Subject: [PATCH 041/117] shared: venus_cs: add more CS functions --- .../ggml-remotingbackend/shared/venus_cs.h | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h index ebcab98a449f4..bb9cc99b7262c 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -306,6 +306,57 @@ vn_decode_uint32_t_array(struct vn_cs_decoder *dec, uint32_t *val, uint32_t coun vn_decode(dec, size, val, size); } +/* size_t */ + +static inline size_t +vn_sizeof_size_t(const size_t *val) +{ + return sizeof(*val); +} + +static inline void +vn_encode_size_t(struct vn_cs_encoder *enc, const size_t *val) +{ + const uint64_t tmp = *val; + vn_encode_uint64_t(enc, &tmp); +} + +static inline void +vn_decode_size_t(struct vn_cs_decoder *dec, size_t *val) +{ + uint64_t tmp; + vn_decode_uint64_t(dec, &tmp); + *val = tmp; +} + +static inline size_t +vn_sizeof_size_t_array(const size_t *val, uint32_t count) +{ + return vn_sizeof_size_t(val) * count; +} + +static inline void +vn_encode_size_t_array(struct vn_cs_encoder *enc, const size_t *val, uint32_t count) +{ + if (sizeof(size_t) == sizeof(uint64_t)) { + vn_encode_uint64_t_array(enc, (const uint64_t *)val, count); + } else { + for (uint32_t i = 0; i < count; i++) + vn_encode_size_t(enc, &val[i]); + } +} + +static inline void +vn_decode_size_t_array(struct vn_cs_decoder *dec, size_t *val, uint32_t count) +{ + if (sizeof(size_t) == sizeof(uint64_t)) { + vn_decode_uint64_t_array(dec, (uint64_t *)val, count); + } else { + for (uint32_t i = 0; i < count; i++) + vn_decode_size_t(dec, &val[i]); + } +} + /* opaque blob */ static inline size_t From 9d523959b4e7c8598f5dfeca4c13190593bf9c08 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 13 May 2025 13:32:29 +0200 Subject: [PATCH 042/117] ggml: src: ggml-remotingfrontend/ggml-remoting: make the NOT_IMPLEMENTED warning more visible --- ggml/src/ggml-remotingfrontend/ggml-remoting.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index c314623d809ab..986caef3f407a 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -10,8 +10,14 @@ #include "ggml-backend.h" #include "virtgpu.h" -#define NOT_IMPLEMENTED \ - printf("WARN: ### reached unimplemented function %s\n", __func__) +#define NOT_IMPLEMENTED \ + do { \ + static bool first = true; \ + if (first) { \ + printf("\nWARN: ###\nWARN: ### reached unimplemented function %s\nWARN: ###\n\n", __func__); \ + first = false; \ + } \ + } while(0) #define IMPLEMENTED // printf("INFO: ### reached implemented function %s\n", __func__) From 95ccc1a0276b485e3b03a7b7e9bc6110e5e03e77 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 13 May 2025 13:32:44 +0200 Subject: [PATCH 043/117] ggml: src: ggml-remotingfrontend/virtgpu-forward: make less verbose --- ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp index 617299541f148..59739cb0ff30f 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp @@ -1,8 +1,8 @@ #include "virtgpu.h" #include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/apir_backend.h" -#define CACHED \ - printf("INFO: ### found response in the cache %s\n", __func__) +#define CACHED +// printf("INFO: ### found response in the cache %s\n", __func__) int apir_get_device_count(struct virtgpu *gpu) { From ad578113ce1164bc880c2d8c7a47646de97abbf5 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 13 May 2025 14:12:26 +0200 Subject: [PATCH 044/117] remoting: correct the device_get_* name order --- .../ggml-remotingbackend/backend-dispatched.h | 20 +++++++++---------- .../shared/apir_backend.h | 10 +++++----- .../ggml-backend-device.cpp | 10 ++++------ .../ggml-backend-reg.cpp | 4 ++-- .../ggml-remotingfrontend/virtgpu-forward.cpp | 20 +++++++++---------- .../ggml-remotingfrontend/virtgpu-forward.h | 10 +++++----- 6 files changed, 36 insertions(+), 38 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h index beeec4ee566fe..6026b9537a1e6 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.h +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -24,19 +24,19 @@ uint32_t backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decod static inline const char *backend_dispatch_command_name(ApirBackendCommandType type) { switch (type) { - case APIR_COMMAND_TYPE_GET_DEVICE_COUNT: return "backend_reg__get_device_count"; - case APIR_COMMAND_TYPE_GET_DEVICE_NAME: return "backend_reg__get_device_name"; - case APIR_COMMAND_TYPE_GET_DEVICE_DESCRIPTION: return "backend_reg__get_device_description"; - case APIR_COMMAND_TYPE_GET_DEVICE_TYPE: return "backend_reg__get_device_type"; - case APIR_COMMAND_TYPE_GET_DEVICE_MEMORY: return "backend_reg__get_device_memory"; + case APIR_COMMAND_TYPE_DEVICE_GET_COUNT: return "backend_get_device_count"; + case APIR_COMMAND_TYPE_DEVICE_GET_NAME: return "backend_get_device_name"; + case APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION: return "backend_get_device_description"; + case APIR_COMMAND_TYPE_DEVICE_GET_TYPE: return "backend_device_get_type"; + case APIR_COMMAND_TYPE_DEVICE_GET_MEMORY: return "backend_get_device_memory"; default: return "unknown"; } } static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATCH_TABLE_COUNT] = { - [APIR_COMMAND_TYPE_GET_DEVICE_COUNT] = backend_reg_get_device_count, - [APIR_COMMAND_TYPE_GET_DEVICE_NAME] = backend_device_get_name, - [APIR_COMMAND_TYPE_GET_DEVICE_DESCRIPTION] = backend_device_get_description, - [APIR_COMMAND_TYPE_GET_DEVICE_TYPE] = backend_device_get_type, - [APIR_COMMAND_TYPE_GET_DEVICE_MEMORY] = backend_device_get_memory, + [APIR_COMMAND_TYPE_DEVICE_GET_COUNT] = backend_reg_get_device_count, + [APIR_COMMAND_TYPE_DEVICE_GET_NAME] = backend_device_get_name, + [APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION] = backend_device_get_description, + [APIR_COMMAND_TYPE_DEVICE_GET_TYPE] = backend_device_get_type, + [APIR_COMMAND_TYPE_DEVICE_GET_MEMORY] = backend_device_get_memory, }; diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 8733b53611502..4eb7816ce8ed0 100644 --- 
a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -24,11 +24,11 @@ typedef uint32_t (*apir_backend_dispatch_t)(uint32_t cmd_type, ); typedef enum ApirBackendCommandType { - APIR_COMMAND_TYPE_GET_DEVICE_COUNT = 0, - APIR_COMMAND_TYPE_GET_DEVICE_NAME = 1, - APIR_COMMAND_TYPE_GET_DEVICE_DESCRIPTION = 2, - APIR_COMMAND_TYPE_GET_DEVICE_TYPE = 3, - APIR_COMMAND_TYPE_GET_DEVICE_MEMORY = 4, + APIR_COMMAND_TYPE_DEVICE_GET_COUNT = 0, + APIR_COMMAND_TYPE_DEVICE_GET_NAME = 1, + APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION = 2, + APIR_COMMAND_TYPE_DEVICE_GET_TYPE = 3, + APIR_COMMAND_TYPE_DEVICE_GET_MEMORY = 4, APIR_BACKEND_DISPATCH_TABLE_COUNT = 5, // last command_type index + 1 } ApirBackendCommandType; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index 55093ae246506..47227e63d97e4 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -5,7 +5,7 @@ static const char *ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) struct virtgpu *gpu = ((struct ggml_backend_remoting_device_context *) dev->context)->gpu; - return apir_get_device_name(gpu); + return apir_device_get_name(gpu); } static const char *ggml_backend_remoting_device_get_description(ggml_backend_dev_t dev) { @@ -13,7 +13,7 @@ static const char *ggml_backend_remoting_device_get_description(ggml_backend_dev struct virtgpu *gpu = ((struct ggml_backend_remoting_device_context *) dev->context)->gpu; - return apir_get_device_description(gpu); + return apir_device_get_description(gpu); } static enum ggml_backend_dev_type ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) { @@ -21,7 +21,7 @@ static enum ggml_backend_dev_type ggml_backend_remoting_device_get_type(ggml_bac struct virtgpu *gpu = ((struct ggml_backend_remoting_device_context *) dev->context)->gpu; - return (enum ggml_backend_dev_type) apir_get_device_type(gpu); + return (enum ggml_backend_dev_type) apir_device_get_type(gpu); } static void ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { @@ -29,15 +29,13 @@ static void ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size struct virtgpu *gpu = ((struct ggml_backend_remoting_device_context *) dev->context)->gpu; - return apir_get_device_memory(gpu, free, total); + return apir_device_get_memory(gpu, free, total); } static bool ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { UNUSED(dev); UNUSED(op); - //NOT_IMPLEMENTED; // to chatty - return true; } diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp index 216c69ced375b..06bcb0310cbc6 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp @@ -32,7 +32,7 @@ static int ggml_backend_remoting_get_device_count() { return 0; } - return apir_get_device_count(gpu); + return apir_device_get_count(gpu); } static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) { @@ -46,7 +46,7 @@ static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) return 0; } - return apir_get_device_count(gpu); + return apir_device_get_count(gpu); } static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) { diff --git 
a/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp index 59739cb0ff30f..134ca8f58ad1a 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp @@ -5,13 +5,13 @@ // printf("INFO: ### found response in the cache %s\n", __func__) int -apir_get_device_count(struct virtgpu *gpu) { +apir_device_get_count(struct virtgpu *gpu) { static int32_t dev_count = -1; if (dev_count != -1) { CACHED; return dev_count; } - int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_GET_DEVICE_COUNT; + int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_DEVICE_GET_COUNT; struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); if (!encoder) { FATAL("%s: failed to prepare the remote call encoder :/", __func__); @@ -36,14 +36,14 @@ apir_get_device_count(struct virtgpu *gpu) { const char * -apir_get_device_name(struct virtgpu *gpu) { +apir_device_get_name(struct virtgpu *gpu) { static int32_t dev_count = -1; if (dev_count != -1) { CACHED; return "Nothing"; } - int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_GET_DEVICE_NAME; + int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_DEVICE_GET_NAME; struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); if (!encoder) { FATAL("%s: failed to prepare the remote call encoder :/", __func__); @@ -72,13 +72,13 @@ apir_get_device_name(struct virtgpu *gpu) { } const char * -apir_get_device_description(struct virtgpu *gpu) { +apir_device_get_description(struct virtgpu *gpu) { static int32_t dev_count = -1; if (dev_count != -1) { CACHED; return "Nothing"; } - int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_GET_DEVICE_DESCRIPTION; + int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION; struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); if (!encoder) { FATAL("%s: failed to prepare the remote call encoder :/", __func__); @@ -107,13 +107,13 @@ apir_get_device_description(struct virtgpu *gpu) { } uint32_t -apir_get_device_type(struct virtgpu *gpu) { +apir_device_get_type(struct virtgpu *gpu) { static uint32_t dev_type = 255; if (dev_type != 255) { CACHED; return dev_type; } - int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_GET_DEVICE_TYPE; + int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_DEVICE_GET_TYPE; struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); if (!encoder) { @@ -138,7 +138,7 @@ apir_get_device_type(struct virtgpu *gpu) { } void -apir_get_device_memory(struct virtgpu *gpu, size_t *free, size_t *total) { +apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total) { static size_t dev_free = 0; static size_t dev_total = 0; /* @@ -151,7 +151,7 @@ apir_get_device_memory(struct virtgpu *gpu, size_t *free, size_t *total) { return; } */ - int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_GET_DEVICE_MEMORY; + int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_DEVICE_GET_MEMORY; struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); if (!encoder) { diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index 13b523b2d3fbf..2edade8f289f1 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -1,5 +1,5 @@ -int apir_get_device_count(struct virtgpu *gpu); -const 
char *apir_get_device_name(struct virtgpu *gpu);
-const char *apir_get_device_description(struct virtgpu *gpu);
-uint32_t apir_get_device_type(struct virtgpu *gpu);
-void apir_get_device_memory(struct virtgpu *gpu, size_t *free, size_t *total);
+int apir_device_get_count(struct virtgpu *gpu);
+const char *apir_device_get_name(struct virtgpu *gpu);
+const char *apir_device_get_description(struct virtgpu *gpu);
+uint32_t apir_device_get_type(struct virtgpu *gpu);
+void apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total);

From 1d9d44d9534d42d1adae6703469475d0f0aaf58e Mon Sep 17 00:00:00 2001
From: Kevin Pouget
Date: Tue, 13 May 2025 15:58:04 +0200
Subject: [PATCH 045/117] remoting: add support for device_supports_op

---
 .../backend-dispatched.cpp | 11 +++++++
 .../ggml-remotingbackend/backend-dispatched.h | 4 +++
 .../shared/apir_backend.h | 3 +-
 .../ggml-remotingbackend/shared/venus_cs.h | 26 ++++++++++++++++
 .../ggml-backend-device.cpp | 7 +++--
 .../ggml-remotingfrontend/virtgpu-forward.cpp | 31 +++++++++++++++++++
 .../ggml-remotingfrontend/virtgpu-forward.h | 3 ++
 7 files changed, 81 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp
index d00a015c99d61..91d8ac4bd6fc2 100644
--- a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp
+++ b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp
@@ -132,3 +132,14 @@ backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec)
 
   return 0;
 }
+
+uint32_t
+backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) {
+  const ggml_tensor *op = vn_decode_ggml_tensor_inplace(dec);
+
+  bool supports_op = dev->iface.supports_op(dev, op);
+
+  vn_encode_bool_t(enc, &supports_op);
+
+  return 0;
+}
diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h
index 6026b9537a1e6..4974d5222ddb0 100644
--- a/ggml/src/ggml-remotingbackend/backend-dispatched.h
+++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h
@@ -7,6 +7,7 @@
 #include "backend-utils.h"
 
 #include "shared/venus_cs.h"
+#include "shared/venus_cs_ggml.h"
 #include "shared/apir_backend.h"
 
 uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_backend_init_fct_p);
@@ -20,6 +21,7 @@
 uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec);
 uint32_t backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec);
 uint32_t backend_device_get_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec);
 uint32_t backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec);
+uint32_t backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec);
 
 static inline const char *backend_dispatch_command_name(ApirBackendCommandType type)
 {
@@ -29,6 +31,7 @@ static inline const char *backend_dispatch_command_name(ApirBackendCommandType t
   case APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION: return "backend_get_device_description";
   case APIR_COMMAND_TYPE_DEVICE_GET_TYPE: return "backend_device_get_type";
   case APIR_COMMAND_TYPE_DEVICE_GET_MEMORY: return "backend_get_device_memory";
+  case APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP: return "backend_device_supports_op";
   default: return "unknown";
   }
 }
@@ -39,4 +42,5 @@ static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATC
   [APIR_COMMAND_TYPE_DEVICE_GET_COUNT] = backend_reg_get_device_count,
   [APIR_COMMAND_TYPE_DEVICE_GET_NAME] = backend_device_get_name,
   [APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION] = backend_device_get_description,
   [APIR_COMMAND_TYPE_DEVICE_GET_TYPE] = backend_device_get_type,
   [APIR_COMMAND_TYPE_DEVICE_GET_MEMORY] = backend_device_get_memory,
+  [APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP] = backend_device_supports_op,
 };
diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h
index 4eb7816ce8ed0..6949aa5429ca3 100644
--- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h
+++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h
@@ -29,6 +29,7 @@ typedef enum ApirBackendCommandType {
   APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION = 2,
   APIR_COMMAND_TYPE_DEVICE_GET_TYPE = 3,
   APIR_COMMAND_TYPE_DEVICE_GET_MEMORY = 4,
+  APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP = 5,
 
-  APIR_BACKEND_DISPATCH_TABLE_COUNT = 5, // last command_type index + 1
+  APIR_BACKEND_DISPATCH_TABLE_COUNT = 6, // last command_type index + 1
 } ApirBackendCommandType;
diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h
index bb9cc99b7262c..c41326eb93ef7 100644
--- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h
+++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h
@@ -53,6 +53,18 @@ vn_cs_decoder_peek(const struct vn_cs_decoder *dec,
   vn_cs_decoder_peek_internal(dec, size, val, val_size);
 }
 
+static inline const void *
+vn_cs_decoder_use_inplace(struct vn_cs_decoder *dec,
+                          size_t size)
+{
+  if (unlikely(size > (size_t) (dec->end - dec->cur))) {
+    FATAL("READING TOO MUCH FROM THE DECODER :/");
+  }
+  const void *addr = dec->cur;
+  dec->cur += size;
+
+  return addr;
+}
 /*
  * read/write
  */
diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp
index 47227e63d97e4..bd3b5daee46d3 100644
--- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp
+++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp
@@ -33,10 +33,11 @@ static void ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size
 }
 
 static bool ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-  UNUSED(dev);
-  UNUSED(op);
+  IMPLEMENTED;
 
-  return true;
+  struct virtgpu *gpu = ((struct ggml_backend_remoting_device_context *) dev->context)->gpu;
+
+  return apir_device_supports_op(gpu, op);
 }
 
 static bool ggml_backend_remoting_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp
index 134ca8f58ad1a..dbb42ee75a008 100644
--- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp
+++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp
@@ -1,5 +1,7 @@
+#include "ggml.h"
 #include "virtgpu.h"
 #include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/apir_backend.h"
+#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h"
 
 #define CACHED
 // printf("INFO: ### found response in the cache %s\n", __func__)
@@ -179,3 +181,32 @@ apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total) {
 
   return;
 }
+
+bool
+apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op) {
+  int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP;
+
+  struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag);
+  if (!encoder) {
+    FATAL("%s: failed to prepare the remote call encoder :/", __func__);
+  }
+
+  vn_encode_ggml_tensor(encoder, op);
+
+  struct vn_cs_decoder *decoder = remote_call(gpu, encoder);
+  if (!decoder) {
+    FATAL("%s: failed to kick the remote call :/", __func__);
+  }
+
+  bool supports_op;
+  vn_decode_bool_t(decoder, &supports_op);
+
+  /* *** */
+
+  int32_t ret = remote_call_finish(encoder, decoder);
+  if (ret != 0) {
+    FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret);
+  }
+
+  return supports_op;
+}
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h
index 2edade8f289f1..be1f783dd6c94 100644
--- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h
+++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h
@@ -1,5 +1,8 @@
+struct ggml_tensor;
+
 int apir_device_get_count(struct virtgpu *gpu);
 const char *apir_device_get_name(struct virtgpu *gpu);
 const char *apir_device_get_description(struct virtgpu *gpu);
 uint32_t apir_device_get_type(struct virtgpu *gpu);
 void apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total);
+bool apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op);

From 4a687508691815b9c3924d41e0c682487353c220 Mon Sep 17 00:00:00 2001
From: Kevin Pouget
Date: Tue, 13 May 2025 15:58:25 +0200
Subject: [PATCH 046/117] ggml/src/ggml-remotingbackend/shared/venus_cs.h: clearer message when can't read from the decoder

---
 .../ggml-remotingbackend/shared/venus_cs.h | 2 +-
 .../shared/venus_cs_ggml.h | 34 +++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)
 create mode 100644 ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h

diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h
index c41326eb93ef7..bf0439e6eee86 100644
--- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h
+++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h
@@ -33,7 +33,7 @@ vn_cs_decoder_peek_internal(const struct vn_cs_decoder *dec,
   assert(val_size <= size);
 
   if (unlikely(size > (size_t) (dec->end - dec->cur))) {
-    FATAL("DECODER IS FULL :/");
+    FATAL("READING TOO MUCH FROM THE DECODER :/");
     //vn_cs_decoder_set_fatal(dec);
     memset(val, 0, val_size);
     return false;
diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h
new file mode 100644
index 0000000000000..96f3bb2aa3346
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h
@@ -0,0 +1,34 @@
+// needs the ggml.h definition
+// needs venus_cs.h definition
+
+static inline void
+vn_encode_ggml_tensor(struct vn_cs_encoder *enc, const ggml_tensor *op) {
+  size_t tensor_size = sizeof(*op);
+
+  if (op->buffer || op->data || op->view_src || op->extra) {
+    FATAL("Cannot pass tensors with data");
+  }
+
+  vn_cs_encoder_write(enc, tensor_size, op, tensor_size);
+
+  for (int i = 0; op->src[i]; i++) {
+    const ggml_tensor *src_op = op->src[i];
+    vn_cs_encoder_write(enc, tensor_size, src_op, tensor_size);
+  }
+}
+
+static inline const ggml_tensor *
+vn_decode_ggml_tensor_inplace(struct vn_cs_decoder *dec) {
+
+  // it is safe to remove the `const` qualifier here, we *do* want to
+  // modify the shared memory data to fix the `src` pointers.
+ ggml_tensor *op = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); + + + for (int i = 0; op->src[i]; i++) { + ggml_tensor *src_op = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); + op->src[i] = src_op; // overwrite op->src[i] pointer with the actual location of the src tensor + } + + return op; +} From 0b77fdeaa3afdb1981098ed832cf75b33ee692fd Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 13 May 2025 15:58:57 +0200 Subject: [PATCH 047/117] ggml/src/ggml-remotingfrontend/virtgpu.cpp: make less verbose --- ggml/src/ggml-remotingfrontend/virtgpu.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp index 679d8fcae6fe6..58d70ddda28ff 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -410,10 +410,6 @@ remote_call_prepare( uint32_t reply_res_id = gpu->reply_shmem->res_id; vn_encode_uint32_t(&enc, &reply_res_id); - printf("%s: prepare %s(flags=0x%x, reply_buf=%d)\n", __func__, - api_remoting_command_name(cmd_type), - cmd_flags, reply_res_id); - return &enc; } From 8c81f0f91b1b313c438c91e8f61189bbc2c321df Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 13 May 2025 16:18:07 +0200 Subject: [PATCH 048/117] remoting: reindent and mark functions as NOT_IMPLEMENTED --- .../ggml-backend-buffer.cpp | 36 ++-- .../ggml-backend-device.cpp | 2 +- .../ggml-remotingfrontend/ggml-backend.cpp | 60 +++---- .../ggml-buffer-type.cpp | 158 +++++++++++------- .../ggml-host-buffer-type.cpp | 64 +++---- 5 files changed, 188 insertions(+), 132 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index 638203252a86d..d4cd4e013f66c 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -3,24 +3,32 @@ #include "ggml-remoting.h" void ggml_remoting_destroy_buffer(remoting_buffer& buf) { - UNUSED(buf); + NOT_IMPLEMENTED; + + UNUSED(buf); } static void ggml_remoting_buffer_write(remoting_buffer& dst, size_t offset, const void * src, size_t size) { - UNUSED(dst); - UNUSED(offset); - UNUSED(src); - UNUSED(size); + NOT_IMPLEMENTED; + + UNUSED(dst); + UNUSED(offset); + UNUSED(src); + UNUSED(size); } static void ggml_remoting_buffer_read(remoting_buffer& src, size_t offset, void * dst, size_t size) { - UNUSED(src); - UNUSED(offset); - UNUSED(dst); - UNUSED(size); + NOT_IMPLEMENTED; + + UNUSED(src); + UNUSED(offset); + UNUSED(dst); + UNUSED(size); } static void ggml_remoting_buffer_copy_async(remoting_context& ctx, remoting_buffer& dst, size_t dst_offset, remoting_buffer& src, size_t src_offset, size_t size) { + NOT_IMPLEMENTED; + UNUSED(ctx); UNUSED(dst); UNUSED(dst_offset); @@ -32,8 +40,10 @@ static void ggml_remoting_buffer_copy_async(remoting_context& ctx, remoting_buff static void * const remoting_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT static uint64_t remoting_tensor_offset(const ggml_tensor * tensor) { - if (tensor->view_src) { - return (uint8_t *) tensor->view_src->data - (uint8_t *) remoting_ptr_base; - } - return (uint8_t *) tensor->data - (uint8_t *) remoting_ptr_base; + NOT_IMPLEMENTED; + + if (tensor->view_src) { + return (uint8_t *) tensor->view_src->data - (uint8_t *) remoting_ptr_base; + } + return (uint8_t *) tensor->data - (uint8_t *) remoting_ptr_base; } diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp 
b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index bd3b5daee46d3..283070079a5c9 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -63,7 +63,7 @@ static bool ggml_backend_remoting_device_offload_op(ggml_backend_dev_t dev, cons static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_host_buffer_type(ggml_backend_dev_t dev) { UNUSED(dev); - // NOT_IMPLEMENTED; // too chatty + IMPLEMENTED; return ggml_backend_remoting_host_buffer_type(); } diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp index aac17a762ff9b..61161caa663bd 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp @@ -1,54 +1,54 @@ #include "ggml-remoting.h" static const char * ggml_backend_remoting_get_name(ggml_backend_t backend) { - UNUSED(backend); + UNUSED(backend); - return "API Remoting backend"; + return "API Remoting backend"; } static void ggml_backend_remoting_free(ggml_backend_t backend) { - UNUSED(backend); + UNUSED(backend); } static ggml_status ggml_backend_remoting_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { - UNUSED(backend); - UNUSED(cgraph); + UNUSED(backend); + UNUSED(cgraph); - return GGML_STATUS_SUCCESS; + return GGML_STATUS_SUCCESS; } static ggml_backend_i ggml_backend_remoting_interface = { - /* .get_name = */ ggml_backend_remoting_get_name, - /* .free = */ ggml_backend_remoting_free, - /* .set_tensor_async = */ NULL, // ggml_backend_remoting_set_tensor_async, - /* .get_tensor_async = */ NULL, // ggml_backend_remoting_get_tensor_async, - /* .cpy_tensor_async = */ NULL, // ggml_backend_remoting_cpy_tensor_async, - /* .synchronize = */ NULL, // ggml_backend_remoting_synchronize, - /* .graph_plan_create = */ NULL, - /* .graph_plan_free = */ NULL, - /* .graph_plan_update = */ NULL, - /* .graph_plan_compute = */ NULL, - /* .graph_compute = */ ggml_backend_remoting_graph_compute, - /* .event_record = */ NULL, - /* .event_wait = */ NULL, + /* .get_name = */ ggml_backend_remoting_get_name, + /* .free = */ ggml_backend_remoting_free, + /* .set_tensor_async = */ NULL, // ggml_backend_remoting_set_tensor_async, + /* .get_tensor_async = */ NULL, // ggml_backend_remoting_get_tensor_async, + /* .cpy_tensor_async = */ NULL, // ggml_backend_remoting_cpy_tensor_async, + /* .synchronize = */ NULL, // ggml_backend_remoting_synchronize, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_remoting_graph_compute, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, }; static ggml_guid_t ggml_backend_remoting_guid() { - static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b }; - return &guid; + static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b }; + return &guid; } ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params) { - UNUSED(params); - ggml_backend_remoting_device_context * ctx = (ggml_backend_remoting_device_context *)dev->context; + UNUSED(params); + ggml_backend_remoting_device_context * ctx = (ggml_backend_remoting_device_context *)dev->context; - ggml_backend_t remoting_backend = new ggml_backend { - /* .guid = */ ggml_backend_remoting_guid(), - /* .interface = */ 
ggml_backend_remoting_interface, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_remoting_frontend_reg(), ctx->device), - /* .context = */ ctx, - }; + ggml_backend_t remoting_backend = new ggml_backend { + /* .guid = */ ggml_backend_remoting_guid(), + /* .interface = */ ggml_backend_remoting_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_remoting_frontend_reg(), ctx->device), + /* .context = */ ctx, + }; - return remoting_backend; + return remoting_backend; } diff --git a/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp index 3d882110b9962..ea0f72fd4dba5 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp @@ -3,79 +3,104 @@ extern ggml_backend_buffer_i ggml_backend_remoting_buffer_interface; struct ggml_backend_remoting_buffer_type_context { - std::string name; + std::string name; }; static const char * ggml_backend_remoting_buffer_type_name(ggml_backend_buffer_type_t buft) { - UNUSED(buft); + UNUSED(buft); - return "Remoting buffer"; + NOT_IMPLEMENTED; + + return "Remoting buffer"; } static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - ggml_backend_remoting_buffer_type_context * ctx = (ggml_backend_remoting_buffer_type_context *) buft->context; + ggml_backend_remoting_buffer_type_context * ctx = (ggml_backend_remoting_buffer_type_context *) buft->context; + NEXT; + NOT_IMPLEMENTED; - return ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, ctx, size); + return ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, ctx, size); } static size_t ggml_backend_remoting_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - UNUSED(buft); - return 4096; + UNUSED(buft); + + NEXT; + NOT_IMPLEMENTED; + + return 4096; } static size_t ggml_backend_remoting_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { - UNUSED(buft); - return 40960; + UNUSED(buft); + + NEXT; + NOT_IMPLEMENTED; + + return 40960; } static size_t ggml_backend_remoting_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { - UNUSED(buft); - UNUSED(tensor); - return ggml_nbytes(tensor); + UNUSED(buft); + UNUSED(tensor); + + NEXT; + NOT_IMPLEMENTED; + + return ggml_nbytes(tensor); } static ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = { - /* .get_name = */ ggml_backend_remoting_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_remoting_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_remoting_buffer_type_get_alignment, - /* .get_max_size = */ ggml_backend_remoting_buffer_type_get_max_size, - /* .get_alloc_size = */ ggml_backend_remoting_buffer_type_get_alloc_size, - /* .is_host = */ NULL, + /* .get_name = */ ggml_backend_remoting_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_remoting_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_remoting_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_remoting_buffer_type_get_max_size, + /* .get_alloc_size = */ ggml_backend_remoting_buffer_type_get_alloc_size, + /* .is_host = */ NULL, }; ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { - static struct ggml_backend_buffer_type buft { - /* .iface = */ ggml_backend_remoting_buffer_type_interface, - /* .device = */ dev, - /* .context = */ new ggml_backend_remoting_buffer_type_context{ "device_name"}, - }; + 
static struct ggml_backend_buffer_type buft { + /* .iface = */ ggml_backend_remoting_buffer_type_interface, + /* .device = */ dev, + /* .context = */ new ggml_backend_remoting_buffer_type_context{ "device_name"}, + }; - return & buft; + return & buft; } static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_backend_remoting_buffer_context * ctx = (ggml_backend_remoting_buffer_context *)buffer->context; - ggml_remoting_destroy_buffer(ctx->dev_buffer); - delete ctx; + ggml_backend_remoting_buffer_context * ctx = (ggml_backend_remoting_buffer_context *)buffer->context; + NOT_IMPLEMENTED; + + ggml_remoting_destroy_buffer(ctx->dev_buffer); + delete ctx; } static enum ggml_status ggml_backend_remoting_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - if (tensor->view_src != nullptr) { - GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); - } - return GGML_STATUS_SUCCESS; + NEXT; + NOT_IMPLEMENTED; + if (tensor->view_src != nullptr) { + GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); + } + return GGML_STATUS_SUCCESS; } static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) { - return (void *) 4096; + UNUSED(buffer); + + NEXT; + NOT_IMPLEMENTED; - UNUSED(buffer); + return (void *) 4096; } static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { + NOT_IMPLEMENTED; + UNUSED(buffer); UNUSED(tensor); UNUSED(value); @@ -85,38 +110,45 @@ static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buf static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + + NOT_IMPLEMENTED; + #if 0 - ggml_backend_remoting_buffer_context * buf_ctx = (ggml_backend_remoting_buffer_context *)buffer->context; - remoting_buffer buf = buf_ctx->dev_buffer; + ggml_backend_remoting_buffer_context * buf_ctx = (ggml_backend_remoting_buffer_context *)buffer->context; + remoting_buffer buf = buf_ctx->dev_buffer; - ggml_remoting_buffer_write(buf, remoting_tensor_offset(tensor) + tensor->view_offs + offset, data, size); + ggml_remoting_buffer_write(buf, remoting_tensor_offset(tensor) + tensor->view_offs + offset, data, size); #else - UNUSED(buffer); - UNUSED(tensor); - UNUSED(data); - UNUSED(offset); - UNUSED(size); + UNUSED(buffer); + UNUSED(tensor); + UNUSED(data); + UNUSED(offset); + UNUSED(size); #endif } static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + NOT_IMPLEMENTED; + #if 0 - ggml_backend_remoting_buffer_context * buf_ctx = (ggml_backend_remoting_buffer_context *)buffer->context; + ggml_backend_remoting_buffer_context * buf_ctx = (ggml_backend_remoting_buffer_context *)buffer->context; - remoting_buffer buf = buf_ctx->dev_buffer; + remoting_buffer buf = buf_ctx->dev_buffer; - ggml_remoting_buffer_read(buf, remoting_tensor_offset(tensor) + tensor->view_offs + offset, data, size); + ggml_remoting_buffer_read(buf, remoting_tensor_offset(tensor) + tensor->view_offs + offset, data, size); #else - UNUSED(buffer); - UNUSED(tensor); - UNUSED(data); - UNUSED(offset); - UNUSED(size); + UNUSED(buffer); + UNUSED(tensor); + UNUSED(data); + UNUSED(offset); + UNUSED(size); #endif } static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { + 
NOT_IMPLEMENTED; + return true; UNUSED(buffer); @@ -125,6 +157,8 @@ static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer } static void ggml_remoting_buffer_memset(remoting_buffer& dst, size_t offset, uint32_t c, size_t size) { + NOT_IMPLEMENTED; + UNUSED(dst); UNUSED(c); UNUSED(size); @@ -132,6 +166,8 @@ static void ggml_remoting_buffer_memset(remoting_buffer& dst, size_t offset, uin } static void ggml_remoting_buffer_memset_async(remoting_context& ctx, remoting_buffer& dst, size_t offset, uint32_t c, size_t size) { + NOT_IMPLEMENTED; + UNUSED(ctx); UNUSED(dst); UNUSED(c); @@ -140,19 +176,21 @@ static void ggml_remoting_buffer_memset_async(remoting_context& ctx, remoting_bu } static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - ggml_backend_remoting_buffer_context * ctx = (ggml_backend_remoting_buffer_context *)buffer->context; + NOT_IMPLEMENTED; + + ggml_backend_remoting_buffer_context * ctx = (ggml_backend_remoting_buffer_context *)buffer->context; - ggml_remoting_buffer_memset(ctx->dev_buffer, 0, value, buffer->size); + ggml_remoting_buffer_memset(ctx->dev_buffer, 0, value, buffer->size); } ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { - /* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer, - /* .get_base = */ ggml_backend_remoting_buffer_get_base, - /* .init_tensor = */ ggml_backend_remoting_buffer_init_tensor, - /* .memset_tensor = */ ggml_backend_remoting_buffer_memset_tensor, - /* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor, - /* .cpy_tensor = */ ggml_backend_remoting_buffer_cpy_tensor, - /* .clear = */ ggml_backend_remoting_buffer_clear, - /* .reset = */ NULL, + /* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer, + /* .get_base = */ ggml_backend_remoting_buffer_get_base, + /* .init_tensor = */ ggml_backend_remoting_buffer_init_tensor, + /* .memset_tensor = */ ggml_backend_remoting_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_remoting_buffer_cpy_tensor, + /* .clear = */ ggml_backend_remoting_buffer_clear, + /* .reset = */ NULL, }; diff --git a/ggml/src/ggml-remotingfrontend/ggml-host-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-host-buffer-type.cpp index fbf5569788c40..bcbd3fa57f156 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-host-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-host-buffer-type.cpp @@ -3,53 +3,61 @@ // host buffer type static const char * ggml_backend_remoting_host_buffer_type_name(ggml_backend_buffer_type_t buft) { - return GGML_REMOTING_FRONTEND_NAME "_Host"; + UNUSED(buft); + + NOT_IMPLEMENTED; - UNUSED(buft); + return GGML_REMOTING_FRONTEND_NAME "_Host"; } static void ggml_backend_remoting_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { + UNUSED(buffer); + NOT_IMPLEMENTED; + # if 0 - ggml_remoting_host_free(remoting_instance.devices[0], buffer->context); + ggml_remoting_host_free(remoting_instance.devices[0], buffer->context); #endif - UNUSED(buffer); } static ggml_backend_buffer_t ggml_backend_remoting_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + UNUSED(buft); + + NOT_IMPLEMENTED; - void *ptr = nullptr; - ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); - buffer->buft = buft; - buffer->iface.free_buffer = ggml_backend_remoting_host_buffer_free_buffer; 
+ void *ptr = nullptr; + ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); + buffer->buft = buft; + buffer->iface.free_buffer = ggml_backend_remoting_host_buffer_free_buffer; - return buffer; - UNUSED(buft); + return buffer; } static size_t ggml_backend_remoting_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { UNUSED(buft); + + NOT_IMPLEMENTED; return 4096; } // Should be changed to return device-specific host buffer type // but that probably requires changes in llama.cpp ggml_backend_buffer_type_t ggml_backend_remoting_host_buffer_type() { - static struct ggml_backend_buffer_type ggml_backend_remoting_buffer_type_host = { - /* .iface = */ { - /* .get_name = */ ggml_backend_remoting_host_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_remoting_host_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_remoting_host_buffer_type_get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX - /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, - /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, - }, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_remoting_frontend_reg(), 0), - /* .context = */ nullptr, - }; - - // Make sure device 0 is initialized - //ggml_remoting_instance_init(); - //ggml_remoting_get_device(0); - - return &ggml_backend_remoting_buffer_type_host; + static struct ggml_backend_buffer_type ggml_backend_remoting_buffer_type_host = { + /* .iface = */ { + /* .get_name = */ ggml_backend_remoting_host_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_remoting_host_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_remoting_host_buffer_type_get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, + /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_remoting_frontend_reg(), 0), + /* .context = */ nullptr, + }; + + // Make sure device 0 is initialized + //ggml_remoting_instance_init(); + //ggml_remoting_get_device(0); + + return &ggml_backend_remoting_buffer_type_host; } From 319af57a0cacf9bd581753f1172ad1b6acb6a207 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 14 May 2025 14:53:27 +0200 Subject: [PATCH 049/117] Add buffer-type support --- ggml/src/ggml-remotingbackend/CMakeLists.txt | 2 + .../backend-dispatched-buffer-type.cpp | 57 ++++++++ .../backend-dispatched-device.cpp | 90 ++++++++++++ .../backend-dispatched.cpp | 81 ++--------- .../ggml-remotingbackend/backend-dispatched.h | 59 +++++--- .../ggml-remotingbackend/backend-internal.h | 16 +++ .../shared/apir_backend.h | 33 +++-- .../ggml-remotingbackend/shared/venus_cs.h | 14 ++ .../shared/venus_cs_ggml.h | 19 ++- ggml/src/ggml-remotingfrontend/CMakeLists.txt | 3 +- .../ggml-backend-device.cpp | 57 ++++++-- .../ggml-remotingfrontend/ggml-backend.cpp | 9 ++ .../ggml-buffer-type.cpp | 85 +++++------ .../src/ggml-remotingfrontend/ggml-remoting.h | 8 ++ .../virtgpu-forward-buffer-type.cpp | 135 ++++++++++++++++++ ...forward.cpp => virtgpu-forward-device.cpp} | 30 +++- .../ggml-remotingfrontend/virtgpu-forward.h | 15 +- 17 files changed, 548 insertions(+), 165 deletions(-) create mode 100644 ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp create mode 100644 ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp create mode 100644 ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp rename 
ggml/src/ggml-remotingfrontend/{virtgpu-forward.cpp => virtgpu-forward-device.cpp} (88%) diff --git a/ggml/src/ggml-remotingbackend/CMakeLists.txt b/ggml/src/ggml-remotingbackend/CMakeLists.txt index 7435c7726beee..fb2504870e6d2 100644 --- a/ggml/src/ggml-remotingbackend/CMakeLists.txt +++ b/ggml/src/ggml-remotingbackend/CMakeLists.txt @@ -6,6 +6,8 @@ message(STATUS "Enable API Remoting backend") ggml_add_backend_library(ggml-remotingbackend backend.cpp backend-dispatched.cpp + backend-dispatched-device.cpp + backend-dispatched-buffer-type.cpp backend-utils.cpp shared/api_remoting.h shared/apir_backend.h diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp new file mode 100644 index 0000000000000..979448bd218ab --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp @@ -0,0 +1,57 @@ +#include +#include "backend-internal.h" +#include "backend-dispatched.h" + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" +#include "ggml-remoting-backend.h" + +#include "ggml-metal.h" + +uint32_t +backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + ggml_backend_buffer_type_t buft; + buft = vn_decode_ggml_buft(dec); + + const char *string = buft->iface.get_name(buft); + + const size_t string_size = strlen(string) + 1; + vn_encode_array_size(enc, string_size); + vn_encode_char_array(enc, string, string_size); + + return 0; +} + +uint32_t +backend_buffer_type_get_alignment(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + ggml_backend_buffer_type_t buft; + buft = vn_decode_ggml_buft(dec); + + size_t value = buft->iface.get_alignment(buft); + vn_encode_size_t(enc, &value); + + return 0; +} + +uint32_t +backend_buffer_type_get_max_size(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + ggml_backend_buffer_type_t buft; + buft = vn_decode_ggml_buft(dec); + + size_t value = buft->iface.get_max_size(buft); + vn_encode_size_t(enc, &value); + + return 0; +} + +uint32_t +backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + ggml_backend_buffer_type_t buft; + buft = vn_decode_ggml_buft(dec); + + bool is_host = buft->iface.is_host(buft); + vn_encode_bool_t(enc, &is_host); + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp new file mode 100644 index 0000000000000..627aa4685c773 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp @@ -0,0 +1,90 @@ +#include +#include "backend-internal.h" +#include "backend-dispatched.h" + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" +#include "ggml-remoting-backend.h" + +#include "ggml-metal.h" + +uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + UNUSED(dec); + + int32_t dev_count = reg->iface.get_device_count(reg); + vn_encode_int32_t(enc, &dev_count); + + return 0; +} + +uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + UNUSED(dec); + + const char *string = dev->iface.get_name(dev); + + const size_t string_size = strlen(string) + 1; + vn_encode_array_size(enc, string_size); + vn_encode_char_array(enc, string, string_size); + + return 0; +} + +uint32_t +backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + UNUSED(dec); + + const char *string = 
dev->iface.get_description(dev); + + const size_t string_size = strlen(string) + 1; + vn_encode_array_size(enc, string_size); + vn_encode_char_array(enc, string, string_size); + + return 0; +} + +uint32_t +backend_device_get_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + UNUSED(dec); + + uint32_t type = dev->iface.get_type(dev); + vn_encode_uint32_t(enc, &type); + + return 0; +} + +uint32_t +backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + UNUSED(dec); + + size_t free, total; + dev->iface.get_memory(dev, &free, &total); + + vn_encode_size_t(enc, &free); + vn_encode_size_t(enc, &total); + + return 0; +} + +uint32_t +backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + const ggml_tensor *op = vn_decode_ggml_tensor_inplace(dec); + + bool supports_op = dev->iface.supports_op(dev, op); + + vn_encode_bool_t(enc, &supports_op); + + return 0; +} + +uint32_t +backend_device_get_buffer_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + UNUSED(dec); + + ggml_backend_buffer_type_t bufft = dev->iface.get_buffer_type(dev); + + apir_buffer_type_context_t bufft_ctx = (apir_buffer_type_context_t) bufft; + vn_encode_apir_buffer_type_context_t(enc, &bufft_ctx); + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp index 91d8ac4bd6fc2..bea07682256ac 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp @@ -9,9 +9,9 @@ #include "ggml-metal.h" -static ggml_backend_reg_t reg = NULL; -static ggml_backend_dev_t dev = NULL; -static ggml_backend_t bck = NULL; +ggml_backend_reg_t reg = NULL; +ggml_backend_dev_t dev = NULL; +ggml_backend_t bck = NULL; uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_backend_init_fct_p) { if (reg != NULL) { @@ -41,12 +41,17 @@ uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_ba static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) { UNUSED(reg); + + NOT_IMPLEMENTED; + return 0; } static const char *ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) { UNUSED(reg); + NOT_IMPLEMENTED; + return GGML_REMOTING_BACKEND_NAME; } @@ -54,6 +59,8 @@ static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_ UNUSED(reg); UNUSED(device); + NOT_IMPLEMENTED; + return NULL; } @@ -75,71 +82,3 @@ ggml_backend_reg_t ggml_backend_remoting_backend_reg() { return ® } - -uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { - UNUSED(dec); - - int32_t dev_count = reg->iface.get_device_count(reg); - vn_encode_int32_t(enc, &dev_count); - - return 0; -} - -uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { - UNUSED(dec); - - const char *string = dev->iface.get_name(dev); - - const size_t string_size = strlen(string) + 1; - vn_encode_array_size(enc, string_size); - vn_encode_char_array(enc, string, string_size); - - return 0; -} - -uint32_t -backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { - UNUSED(dec); - - const char *string = dev->iface.get_description(dev); - - const size_t string_size = strlen(string) + 1; - vn_encode_array_size(enc, string_size); - vn_encode_char_array(enc, string, string_size); - - return 0; -} - -uint32_t -backend_device_get_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { - UNUSED(dec); - - uint32_t type 
= dev->iface.get_type(dev); - vn_encode_uint32_t(enc, &type); - - return 0; -} - -uint32_t -backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { - UNUSED(dec); - - size_t free, total; - dev->iface.get_memory(dev, &free, &total); - - vn_encode_size_t(enc, &free); - vn_encode_size_t(enc, &total); - - return 0; -} - -uint32_t -backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { - const ggml_tensor *op = vn_decode_ggml_tensor_inplace(dec); - - bool supports_op = dev->iface.supports_op(dev, op); - - vn_encode_bool_t(enc, &supports_op); - - return 0; -} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h index 4974d5222ddb0..30e3dded013de 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.h +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -6,9 +6,10 @@ #include #include "backend-utils.h" +#include "shared/apir_backend.h" #include "shared/venus_cs.h" #include "shared/venus_cs_ggml.h" -#include "shared/apir_backend.h" + uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_backend_init_fct_p); @@ -17,30 +18,56 @@ typedef uint32_t (*backend_dispatch_t)(struct vn_cs_encoder *enc, struct vn_cs_d /* *** */ uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); + +/* device */ uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); uint32_t backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); uint32_t backend_device_get_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); uint32_t backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); uint32_t backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +uint32_t backend_device_get_buffer_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); + +/* buffer-type */ +uint32_t backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +uint32_t backend_buffer_type_get_alignment(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +uint32_t backend_buffer_type_get_max_size(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +uint32_t backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); static inline const char *backend_dispatch_command_name(ApirBackendCommandType type) { - switch (type) { - case APIR_COMMAND_TYPE_DEVICE_GET_COUNT: return "backend_get_device_count"; - case APIR_COMMAND_TYPE_DEVICE_GET_NAME: return "backend_get_device_name"; - case APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION: return "backend_get_device_description"; - case APIR_COMMAND_TYPE_DEVICE_GET_TYPE: return "backend_device_get_type"; - case APIR_COMMAND_TYPE_DEVICE_GET_MEMORY: return "backend_get_device_memory"; - case APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP: return "backend_device_supports_op"; - default: return "unknown"; - } + switch (type) { + /* device */ + case APIR_COMMAND_TYPE_DEVICE_GET_COUNT: return "backend_get_device_count"; + case APIR_COMMAND_TYPE_DEVICE_GET_NAME: return "backend_get_device_name"; + case APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION: return "backend_get_device_description"; + case APIR_COMMAND_TYPE_DEVICE_GET_TYPE: return "backend_device_get_type"; + case APIR_COMMAND_TYPE_DEVICE_GET_MEMORY: return "backend_get_device_memory"; + case APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP: return "backend_device_supports_op"; + case APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE: return 
"backend_get_buffer_type"; + + /* buffer-type */ + case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME: return "backend_buffer_type_get_name"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT: return "backend_buffer_type_get_alignment"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE: return "backend_buffer_type_get_max_size"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST: return "backend_buffer_type_is_host"; + + default: return "unknown"; + } } static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATCH_TABLE_COUNT] = { - [APIR_COMMAND_TYPE_DEVICE_GET_COUNT] = backend_reg_get_device_count, - [APIR_COMMAND_TYPE_DEVICE_GET_NAME] = backend_device_get_name, - [APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION] = backend_device_get_description, - [APIR_COMMAND_TYPE_DEVICE_GET_TYPE] = backend_device_get_type, - [APIR_COMMAND_TYPE_DEVICE_GET_MEMORY] = backend_device_get_memory, - [APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP] = backend_device_supports_op, + /* device */ + [APIR_COMMAND_TYPE_DEVICE_GET_COUNT] = backend_reg_get_device_count, + [APIR_COMMAND_TYPE_DEVICE_GET_NAME] = backend_device_get_name, + [APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION] = backend_device_get_description, + [APIR_COMMAND_TYPE_DEVICE_GET_TYPE] = backend_device_get_type, + [APIR_COMMAND_TYPE_DEVICE_GET_MEMORY] = backend_device_get_memory, + [APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP] = backend_device_supports_op, + [APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE] = backend_device_get_buffer_type, + + /* buffer-type */ + [APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME] = backend_buffer_type_get_name, + [APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT] = backend_buffer_type_get_alignment, + [APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE] = backend_buffer_type_get_max_size, + [APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST] = backend_buffer_type_is_host, }; diff --git a/ggml/src/ggml-remotingbackend/backend-internal.h b/ggml/src/ggml-remotingbackend/backend-internal.h index 8828f08aa1052..7fd803c2aa5dd 100644 --- a/ggml/src/ggml-remotingbackend/backend-internal.h +++ b/ggml/src/ggml-remotingbackend/backend-internal.h @@ -2,6 +2,22 @@ #include #include +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" + +extern ggml_backend_reg_t reg; +extern ggml_backend_dev_t dev; + +#define NOT_IMPLEMENTED \ + do { \ + static bool first = true; \ + if (first) { \ + printf("\nWARN: ###\nWARN: ### reached unimplemented function %s\nWARN: ###\n\n", __func__); \ + first = false; \ + } \ + } while(0) + extern "C" { uint32_t apir_backend_initialize(); void apir_backend_deinit(void); diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 6949aa5429ca3..0917da7d0e4af 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -14,22 +14,33 @@ #define APIR_BACKEND_FORWARD_INDEX_INVALID 6 +typedef void * apir_buffer_type_context_t; + typedef uint32_t (*apir_backend_initialize_t)(void); typedef void (*apir_backend_deinit_t)(void); typedef uint32_t (*apir_backend_dispatch_t)(uint32_t cmd_type, - char *dec_cur, const char *dec_end, - char *enc_cur, const char *enc_end, - char **enc_cur_after + char *dec_cur, const char *dec_end, + char *enc_cur, const char *enc_end, + char **enc_cur_after ); typedef enum ApirBackendCommandType { - APIR_COMMAND_TYPE_DEVICE_GET_COUNT = 0, - APIR_COMMAND_TYPE_DEVICE_GET_NAME = 1, - APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION = 2, - APIR_COMMAND_TYPE_DEVICE_GET_TYPE = 3, - 
APIR_COMMAND_TYPE_DEVICE_GET_MEMORY = 4, - APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP = 5, - - APIR_BACKEND_DISPATCH_TABLE_COUNT = 6, // last command_type index + 1 + /* device */ + APIR_COMMAND_TYPE_DEVICE_GET_COUNT = 0, + APIR_COMMAND_TYPE_DEVICE_GET_NAME = 1, + APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION = 2, + APIR_COMMAND_TYPE_DEVICE_GET_TYPE = 3, + APIR_COMMAND_TYPE_DEVICE_GET_MEMORY = 4, + APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP = 5, + APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE = 6, + + /* buffer-type */ + APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME = 7, + APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT = 8, + APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE = 9, + APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST = 10, + + // last command_type index + 1 + APIR_BACKEND_DISPATCH_TABLE_COUNT = 11, } ApirBackendCommandType; diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h index bf0439e6eee86..c796cd3f8e893 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -452,3 +452,17 @@ vn_decode_bool_t(struct vn_cs_decoder *dec, bool *val) { vn_decode(dec, sizeof(int), val, sizeof(int)); } + +/* apir_buffer_type_context_t */ + +static inline void +vn_encode_apir_buffer_type_context_t(struct vn_cs_encoder *enc, const apir_buffer_type_context_t *val) +{ + vn_encode(enc, sizeof(apir_buffer_type_context_t), val, sizeof(apir_buffer_type_context_t)); +} + +static inline void +vn_decode_apir_buffer_type_context_t(struct vn_cs_decoder *dec, apir_buffer_type_context_t *val) +{ + vn_decode(dec, sizeof(apir_buffer_type_context_t), val, sizeof(apir_buffer_type_context_t)); +} diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h index 96f3bb2aa3346..4302424aadce0 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h @@ -1,4 +1,4 @@ -// needs the ggml.h definition +// needs the ggml-backend-impl.h definition // needs venus_cs.h definition static inline void @@ -32,3 +32,20 @@ vn_decode_ggml_tensor_inplace(struct vn_cs_decoder *dec) { return op; } + +static inline void +vn_encode_ggml_buft(struct vn_cs_encoder *enc, ggml_backend_buffer_type_t buft) { + size_t buft_ctx_size = sizeof(buft->context); + + vn_cs_encoder_write(enc, buft_ctx_size, &buft->context, buft_ctx_size); +} + +static inline ggml_backend_buffer_type_t +vn_decode_ggml_buft(struct vn_cs_decoder *dec) { + ggml_backend_buffer_type_t buft; + size_t buft_size = sizeof(buft); + + vn_cs_decoder_read(dec, buft_size, &buft, buft_size); + + return buft; +} diff --git a/ggml/src/ggml-remotingfrontend/CMakeLists.txt b/ggml/src/ggml-remotingfrontend/CMakeLists.txt index df45db51f46b3..accdbc473ecc7 100644 --- a/ggml/src/ggml-remotingfrontend/CMakeLists.txt +++ b/ggml/src/ggml-remotingfrontend/CMakeLists.txt @@ -13,7 +13,8 @@ ggml_add_backend_library(ggml-remotingfrontend virtgpu.cpp virtgpu-shm.cpp virtgpu-utils.cpp - virtgpu-forward.cpp + virtgpu-forward-device.cpp + virtgpu-forward-buffer-type.cpp ../../include/ggml-remoting-frontend.h ) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index 283070079a5c9..c0c98c8b8a511 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -1,46 +1,55 @@ #include "ggml-remoting.h" -static const char 
*ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) { +#define DEV_TO_GPU(name) \ + ((struct ggml_backend_remoting_device_context *) (name)->context)->gpu + +static const char * +ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) { IMPLEMENTED; - struct virtgpu *gpu = ((struct ggml_backend_remoting_device_context *) dev->context)->gpu; + struct virtgpu *gpu = DEV_TO_GPU(dev); return apir_device_get_name(gpu); } -static const char *ggml_backend_remoting_device_get_description(ggml_backend_dev_t dev) { +static const char * +ggml_backend_remoting_device_get_description(ggml_backend_dev_t dev) { IMPLEMENTED; - struct virtgpu *gpu = ((struct ggml_backend_remoting_device_context *) dev->context)->gpu; + struct virtgpu *gpu = DEV_TO_GPU(dev); return apir_device_get_description(gpu); } -static enum ggml_backend_dev_type ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) { +static enum ggml_backend_dev_type +ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) { IMPLEMENTED; - struct virtgpu *gpu = ((struct ggml_backend_remoting_device_context *) dev->context)->gpu; + struct virtgpu *gpu = DEV_TO_GPU(dev); return (enum ggml_backend_dev_type) apir_device_get_type(gpu); } -static void ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { +static void +ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { IMPLEMENTED; - struct virtgpu *gpu = ((struct ggml_backend_remoting_device_context *) dev->context)->gpu; + struct virtgpu *gpu = DEV_TO_GPU(dev); return apir_device_get_memory(gpu, free, total); } -static bool ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { +static bool +ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { IMPLEMENTED; - struct virtgpu *gpu = ((struct ggml_backend_remoting_device_context *) dev->context)->gpu; + struct virtgpu *gpu = DEV_TO_GPU(dev); return apir_device_supports_op(gpu, op); } -static bool ggml_backend_remoting_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { +static bool +ggml_backend_remoting_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { UNUSED(dev); UNUSED(buft); @@ -49,7 +58,8 @@ static bool ggml_backend_remoting_device_supports_buft(ggml_backend_dev_t dev, g return true; } -static bool ggml_backend_remoting_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { +static bool +ggml_backend_remoting_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { const int min_batch_size = 32; NOT_IMPLEMENTED; @@ -60,7 +70,8 @@ static bool ggml_backend_remoting_device_offload_op(ggml_backend_dev_t dev, cons UNUSED(dev); } -static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_host_buffer_type(ggml_backend_dev_t dev) { +static ggml_backend_buffer_type_t +ggml_backend_remoting_device_get_host_buffer_type(ggml_backend_dev_t dev) { UNUSED(dev); IMPLEMENTED; @@ -69,9 +80,10 @@ static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_host_buffer_t } -static void ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { - +static void +ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { IMPLEMENTED; + props->name = ggml_backend_remoting_device_get_name(dev); props->description = ggml_backend_remoting_device_get_description(dev); props->type = 
ggml_backend_remoting_device_get_type(dev); @@ -84,6 +96,21 @@ static void ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, struc }; } +ggml_backend_buffer_type_t +ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { + struct virtgpu *gpu = DEV_TO_GPU(dev); + + apir_buffer_type_context_t ctx = apir_device_get_buffer_type(gpu); + + static struct ggml_backend_buffer_type buft { + /* .iface = */ ggml_backend_remoting_buffer_type_interface, + /* .device = */ dev, + /* .context = */ ctx, + }; + + return &buft; +} + const struct ggml_backend_device_i ggml_backend_remoting_device_i = { /* .get_name = */ ggml_backend_remoting_device_get_name, /* .get_description = */ ggml_backend_remoting_device_get_description, diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp index 61161caa663bd..6c2f2b947e10b 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp @@ -3,11 +3,15 @@ static const char * ggml_backend_remoting_get_name(ggml_backend_t backend) { UNUSED(backend); + NOT_IMPLEMENTED; + return "API Remoting backend"; } static void ggml_backend_remoting_free(ggml_backend_t backend) { UNUSED(backend); + + NOT_IMPLEMENTED; } static ggml_status ggml_backend_remoting_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { @@ -35,12 +39,17 @@ static ggml_backend_i ggml_backend_remoting_interface = { static ggml_guid_t ggml_backend_remoting_guid() { static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b }; + + NOT_IMPLEMENTED; + return &guid; } ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params) { UNUSED(params); + IMPLEMENTED; + ggml_backend_remoting_device_context * ctx = (ggml_backend_remoting_device_context *)dev->context; ggml_backend_t remoting_backend = new ggml_backend { diff --git a/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp index ea0f72fd4dba5..d34904abb1ef0 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp @@ -1,79 +1,70 @@ #include "ggml-remoting.h" -extern ggml_backend_buffer_i ggml_backend_remoting_buffer_interface; - -struct ggml_backend_remoting_buffer_type_context { - std::string name; -}; +#define BUFT_TO_GPU(name) \ + ((struct ggml_backend_remoting_device_context *) (name)->device->context)->gpu +extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface; -static const char * ggml_backend_remoting_buffer_type_name(ggml_backend_buffer_type_t buft) { - UNUSED(buft); +static ggml_backend_buffer_t +ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + BEING_IMPLEMENTED; + struct virtgpu *gpu = BUFT_TO_GPU(buft); + UNUSED(gpu); + /* ... 
*/ - NOT_IMPLEMENTED; + void *ctx = NULL; - return "Remoting buffer"; + return ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, ctx, size); } -static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - ggml_backend_remoting_buffer_type_context * ctx = (ggml_backend_remoting_buffer_type_context *) buft->context; +static const char * +ggml_backend_remoting_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + BEING_IMPLEMENTED; - NEXT; - NOT_IMPLEMENTED; + struct virtgpu *gpu = BUFT_TO_GPU(buft); - return ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, ctx, size); + return apir_buffer_type_get_name(gpu, buft); } -static size_t ggml_backend_remoting_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - UNUSED(buft); +static size_t +ggml_backend_remoting_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + IMPLEMENTED; - NEXT; - NOT_IMPLEMENTED; + struct virtgpu *gpu = BUFT_TO_GPU(buft); - return 4096; + return apir_buffer_type_get_alignment(gpu, buft); } -static size_t ggml_backend_remoting_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { - UNUSED(buft); - - NEXT; - NOT_IMPLEMENTED; +static size_t +ggml_backend_remoting_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + IMPLEMENTED; + struct virtgpu *gpu = BUFT_TO_GPU(buft); - return 40960; + return apir_buffer_type_get_max_size(gpu, buft); } -static size_t ggml_backend_remoting_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { - UNUSED(buft); - UNUSED(tensor); - - NEXT; - NOT_IMPLEMENTED; +static bool +ggml_backend_remoting_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + IMPLEMENTED; + struct virtgpu *gpu = BUFT_TO_GPU(buft); - return ggml_nbytes(tensor); + return apir_buffer_type_is_host(gpu, buft); } -static ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = { - /* .get_name = */ ggml_backend_remoting_buffer_type_name, +const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = { + /* .get_name = */ ggml_backend_remoting_buffer_type_get_name, /* .alloc_buffer = */ ggml_backend_remoting_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_remoting_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_remoting_buffer_type_get_max_size, - /* .get_alloc_size = */ ggml_backend_remoting_buffer_type_get_alloc_size, - /* .is_host = */ NULL, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .is_host = */ ggml_backend_remoting_buffer_type_is_host, }; -ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { - - static struct ggml_backend_buffer_type buft { - /* .iface = */ ggml_backend_remoting_buffer_type_interface, - /* .device = */ dev, - /* .context = */ new ggml_backend_remoting_buffer_type_context{ "device_name"}, - }; - - return & buft; -} +/****************************************************************************************/ static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_remoting_buffer_context * ctx = (ggml_backend_remoting_buffer_context *)buffer->context; + NEXT; NOT_IMPLEMENTED; ggml_remoting_destroy_buffer(ctx->dev_buffer); @@ -183,7 +174,7 @@ static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uin ggml_remoting_buffer_memset(ctx->dev_buffer, 0, value, buffer->size); } -ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { +const 
ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { /* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer, /* .get_base = */ ggml_backend_remoting_buffer_get_base, /* .init_tensor = */ ggml_backend_remoting_buffer_init_tensor, diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index 986caef3f407a..8ba40c0b7f7ad 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -19,6 +19,13 @@ } \ } while(0) +#define BEING_IMPLEMENTED \ + do { \ + printf("\nINFO: ###\nINFO: ### function being implemented: %s\nINFO: ###\n\n", __func__); \ + } while(0) + +#define NEXT + #define IMPLEMENTED // printf("INFO: ### reached implemented function %s\n", __func__) @@ -32,6 +39,7 @@ struct ggml_backend_remoting_device_context { struct virtgpu *gpu; }; +extern const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface; extern const struct ggml_backend_device_i ggml_backend_remoting_device_i; ggml_backend_buffer_type_t ggml_backend_remoting_host_buffer_type(); diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp new file mode 100644 index 0000000000000..b8a42f7f621b9 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp @@ -0,0 +1,135 @@ +#include "ggml-backend-impl.h" +#include "virtgpu.h" +#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/apir_backend.h" +#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h" + +#define CACHED +// printf("INFO: ### found response in the cache %s\n", __func__) + + + +// buffer_type_alloc_buffer +const char * +apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { + int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME; + + struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); + if (!encoder) { + FATAL("%s: failed to prepare the remote call encoder :/", __func__); + } + + vn_encode_ggml_buft(encoder, buft); + + struct vn_cs_decoder *decoder = remote_call(gpu, encoder); + if (!decoder) { + FATAL("%s: failed to kick the remote call :/", __func__); + } + + const size_t string_size = vn_decode_array_size_unchecked(decoder); + char *string = (char *) vn_cs_decoder_alloc_array(decoder, sizeof(char), string_size); + if (!string) { + FATAL("%s: Could not allocate the device name buffer", __func__); + } + vn_decode_char_array(decoder, string, string_size); + + INFO("%s: Forward BUFT NAME --> %s", __func__, string); + + /* *** */ + + int32_t ret = remote_call_finish(encoder, decoder); + if (ret != 0) { + FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); + } + + return string; +} + +size_t +apir_buffer_type_get_alignment(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { + static int32_t dev_count = -1; + if (dev_count != -1) { + CACHED; + return dev_count; + } + int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT; + struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); + if (!encoder) { + FATAL("%s: failed to prepare the remote call encoder :/", __func__); + } + + vn_encode_ggml_buft(encoder, buft); + + struct vn_cs_decoder *decoder = remote_call(gpu, encoder); + if (!decoder) { + FATAL("%s: failed to kick the remote call :/", __func__); + 
} + + size_t alignment; + vn_decode_size_t(decoder, &alignment); + + INFO("%s: Forward BUFT ALIGNMENT --> %zu ", __func__, alignment); + + int32_t ret = remote_call_finish(encoder, decoder); + if (ret != 0) { + FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); + } + + return alignment; +} + +size_t +apir_buffer_type_get_max_size(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { + int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE; + struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); + if (!encoder) { + FATAL("%s: failed to prepare the remote call encoder :/", __func__); + } + + vn_encode_ggml_buft(encoder, buft); + + struct vn_cs_decoder *decoder = remote_call(gpu, encoder); + if (!decoder) { + FATAL("%s: failed to kick the remote call :/", __func__); + } + + size_t max_size; + vn_decode_size_t(decoder, &max_size); + + INFO("%s: Forward BUFT MAX SIZE --> %zu ", __func__, max_size); + + int32_t ret = remote_call_finish(encoder, decoder); + if (ret != 0) { + FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); + } + + return max_size; +} + +bool +apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { + int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST; + + struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); + if (!encoder) { + FATAL("%s: failed to prepare the remote call encoder :/", __func__); + } + + vn_encode_ggml_buft(encoder, buft); + + struct vn_cs_decoder *decoder = remote_call(gpu, encoder); + if (!decoder) { + FATAL("%s: failed to kick the remote call :/", __func__); + } + + bool is_host; + vn_decode_bool_t(decoder, &is_host); + + /* *** */ + + int32_t ret = remote_call_finish(encoder, decoder); + if (ret != 0) { + FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); + } + + return is_host; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp similarity index 88% rename from ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp rename to ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp index dbb42ee75a008..1dd303e8c96bf 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp @@ -1,4 +1,4 @@ -#include "ggml.h" +#include "ggml-backend-impl.h" #include "virtgpu.h" #include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/apir_backend.h" #include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h" @@ -36,7 +36,6 @@ apir_device_get_count(struct virtgpu *gpu) { return dev_count; } - const char * apir_device_get_name(struct virtgpu *gpu) { static int32_t dev_count = -1; @@ -210,3 +209,30 @@ apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op) { return supports_op; } + +apir_buffer_type_context_t +apir_device_get_buffer_type(struct virtgpu *gpu) { + int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE; + + struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); + if (!encoder) { + FATAL("%s: failed to prepare the remote call encoder :/", __func__); + } + + struct vn_cs_decoder *decoder = remote_call(gpu, encoder); + if (!decoder) { + FATAL("%s: failed to kick the remote call :/", __func__); + } + + apir_buffer_type_context_t buffer_type_ctx; + 
vn_decode_apir_buffer_type_context_t(decoder, &buffer_type_ctx); + + /* *** */ + + int32_t ret = remote_call_finish(encoder, decoder); + if (ret != 0) { + FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); + } + + return buffer_type_ctx; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index be1f783dd6c94..c484d7eeab8c1 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -1,8 +1,21 @@ -struct ggml_tensor; +#include "ggml.h" +#include "ggml-impl.h" +#include "ggml-alloc.h" +#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/apir_backend.h" + +/* device */ int apir_device_get_count(struct virtgpu *gpu); const char *apir_device_get_name(struct virtgpu *gpu); const char *apir_device_get_description(struct virtgpu *gpu); uint32_t apir_device_get_type(struct virtgpu *gpu); void apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total); bool apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op); +apir_buffer_type_context_t apir_device_get_buffer_type(struct virtgpu *gpu); + +/* buffer-type */ +// buffer_type_alloc_buffer +const char *apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); +size_t apir_buffer_type_get_alignment(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); +size_t apir_buffer_type_get_max_size(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); +bool apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); From 73ed5073b722e2fe49ad10532b9530ee8ba3cd03 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 14 May 2025 16:18:27 +0200 Subject: [PATCH 050/117] Keep working --- ggml/CMakeLists.txt | 3 +- ggml/include/ggml-remoting-backend.h | 16 -- ggml/src/ggml-backend-reg.cpp | 8 +- ggml/src/ggml-remotingbackend/CMakeLists.txt | 1 - .../backend-dispatched-buffer-type.cpp | 1 - .../backend-dispatched-device.cpp | 16 +- .../backend-dispatched.cpp | 45 ----- .../ggml-remotingbackend/backend-dispatched.h | 3 + ggml/src/ggml-remotingbackend/backend.cpp | 2 - .../shared/apir_backend.h | 11 +- ggml/src/ggml-remotingfrontend/CMakeLists.txt | 2 +- .../ggml-backend-device.cpp | 47 +++-- .../ggml-buffer-type.cpp | 10 +- .../ggml-host-buffer-type.cpp | 63 ------- .../src/ggml-remotingfrontend/ggml-remoting.h | 3 + .../virtgpu-forward-buffer-type.cpp | 91 +++------ .../virtgpu-forward-device.cpp | 178 +++++++----------- .../virtgpu-forward-impl.h | 33 ++++ .../ggml-remotingfrontend/virtgpu-forward.h | 5 + 19 files changed, 194 insertions(+), 344 deletions(-) delete mode 100644 ggml/include/ggml-remoting-backend.h delete mode 100644 ggml/src/ggml-remotingfrontend/ggml-host-buffer-type.cpp create mode 100644 ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 9d7576c911635..cfbd1aca0536f 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -271,8 +271,7 @@ set(GGML_PUBLIC_HEADERS include/ggml-rpc.h include/ggml-sycl.h include/ggml-vulkan.h - ggml/include/ggml-remoting-frontend.h - ggml/include/ggml-remoting-backend.h + include/ggml-remoting-frontend.h include/gguf.h) set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") diff --git a/ggml/include/ggml-remoting-backend.h b/ggml/include/ggml-remoting-backend.h deleted file mode 100644 index 25a9dc269c957..0000000000000 --- a/ggml/include/ggml-remoting-backend.h +++ /dev/null @@ 
-1,16 +0,0 @@ -#pragma once - -#include "ggml.h" -#include "ggml-backend.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#define GGML_REMOTING_BACKEND_NAME "RemotingBackend" - -GGML_BACKEND_API ggml_backend_reg_t ggml_backend_remoting_backend_reg(); - -#ifdef __cplusplus -} -#endif diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 7e6d4f8c36f67..4f003f0e743e4 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -49,10 +49,6 @@ #include "ggml-remoting-frontend.h" #endif -#ifdef GGML_USE_REMOTINGBACKEND -#include "ggml-remoting-backend.h" -#endif - #ifdef GGML_USE_OPENCL #include "ggml-opencl.h" #endif @@ -183,9 +179,7 @@ struct ggml_backend_registry { #ifdef GGML_USE_REMOTINGFRONTEND register_backend(ggml_backend_remoting_frontend_reg()); #endif -#ifdef GGML_USE_REMOTINGBACKEND - register_backend(ggml_backend_remoting_backend_reg()); -#endif + #ifdef GGML_USE_OPENCL register_backend(ggml_backend_opencl_reg()); #endif diff --git a/ggml/src/ggml-remotingbackend/CMakeLists.txt b/ggml/src/ggml-remotingbackend/CMakeLists.txt index fb2504870e6d2..17ca5e1f53a54 100644 --- a/ggml/src/ggml-remotingbackend/CMakeLists.txt +++ b/ggml/src/ggml-remotingbackend/CMakeLists.txt @@ -12,7 +12,6 @@ ggml_add_backend_library(ggml-remotingbackend shared/api_remoting.h shared/apir_backend.h shared/venus_cs.h - ../../include/ggml-remoting-backend.h ) target_compile_options(ggml-remotingbackend PRIVATE -std=c++20) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp index 979448bd218ab..1d17a69f27056 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp @@ -5,7 +5,6 @@ #include "ggml-impl.h" #include "ggml-backend-impl.h" #include "ggml-backend.h" -#include "ggml-remoting-backend.h" #include "ggml-metal.h" diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp index 627aa4685c773..7062b061defbb 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp @@ -5,7 +5,6 @@ #include "ggml-impl.h" #include "ggml-backend-impl.h" #include "ggml-backend.h" -#include "ggml-remoting-backend.h" #include "ggml-metal.h" @@ -88,3 +87,18 @@ backend_device_get_buffer_type(struct vn_cs_encoder *enc, struct vn_cs_decoder * return 0; } + +uint32_t +backend_device_get_props(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + UNUSED(dec); + + struct ggml_backend_dev_props props; + dev->iface.get_props(dev, &props); + + vn_encode_bool_t(enc, &props.caps.async); + vn_encode_bool_t(enc, &props.caps.host_buffer); + vn_encode_bool_t(enc, &props.caps.buffer_from_host_ptr); + vn_encode_bool_t(enc, &props.caps.events); + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp index bea07682256ac..73be488e6c0f7 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp @@ -5,7 +5,6 @@ #include "ggml-impl.h" #include "ggml-backend-impl.h" #include "ggml-backend.h" -#include "ggml-remoting-backend.h" #include "ggml-metal.h" @@ -38,47 +37,3 @@ uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_ba return APIR_BACKEND_INITIALIZE_SUCCESSS; } - -static size_t 
ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) { - UNUSED(reg); - - NOT_IMPLEMENTED; - - return 0; -} - -static const char *ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) { - UNUSED(reg); - - NOT_IMPLEMENTED; - - return GGML_REMOTING_BACKEND_NAME; -} - -static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) { - UNUSED(reg); - UNUSED(device); - - NOT_IMPLEMENTED; - - return NULL; -} - -static const struct ggml_backend_reg_i ggml_backend_remoting_reg_i = { - /* .get_name = */ ggml_backend_remoting_reg_get_name, - /* .get_device_count = */ ggml_backend_remoting_reg_get_device_count, - /* .get_device = */ ggml_backend_remoting_reg_get_device, - /* .get_proc_address = */ NULL, -}; - -ggml_backend_reg_t ggml_backend_remoting_backend_reg() { - static ggml_backend_reg reg = { - /* .api_version = */ GGML_BACKEND_API_VERSION, - /* .iface = */ ggml_backend_remoting_reg_i, - /* .context = */ nullptr, - }; - - INFO("%s, hello :wave:", __func__); - - return ® -} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h index 30e3dded013de..356742d3ba174 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.h +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -26,6 +26,7 @@ uint32_t backend_device_get_type(struct vn_cs_encoder *enc, struct vn_cs_decoder uint32_t backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); uint32_t backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); uint32_t backend_device_get_buffer_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +uint32_t backend_device_get_props(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); /* buffer-type */ uint32_t backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); @@ -44,6 +45,7 @@ static inline const char *backend_dispatch_command_name(ApirBackendCommandType t case APIR_COMMAND_TYPE_DEVICE_GET_MEMORY: return "backend_get_device_memory"; case APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP: return "backend_device_supports_op"; case APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE: return "backend_get_buffer_type"; + case APIR_COMMAND_TYPE_DEVICE_GET_PROPS: return "backend_get_props"; /* buffer-type */ case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME: return "backend_buffer_type_get_name"; @@ -64,6 +66,7 @@ static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATC [APIR_COMMAND_TYPE_DEVICE_GET_MEMORY] = backend_device_get_memory, [APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP] = backend_device_supports_op, [APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE] = backend_device_get_buffer_type, + [APIR_COMMAND_TYPE_DEVICE_GET_PROPS] = backend_device_get_props, /* buffer-type */ [APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME] = backend_buffer_type_get_name, diff --git a/ggml/src/ggml-remotingbackend/backend.cpp b/ggml/src/ggml-remotingbackend/backend.cpp index 9a97b97a71f7c..c32353586a10b 100644 --- a/ggml/src/ggml-remotingbackend/backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend.cpp @@ -16,8 +16,6 @@ static void *backend_library_handle = NULL; - - extern "C" { void apir_backend_deinit(void) { if (backend_library_handle) { diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 0917da7d0e4af..abc20a981ca6b 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -34,13 +34,14 @@ 
typedef enum ApirBackendCommandType { APIR_COMMAND_TYPE_DEVICE_GET_MEMORY = 4, APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP = 5, APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE = 6, + APIR_COMMAND_TYPE_DEVICE_GET_PROPS = 7, /* buffer-type */ - APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME = 7, - APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT = 8, - APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE = 9, - APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST = 10, + APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME = 8, + APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT = 9, + APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE = 10, + APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST = 11, // last command_type index + 1 - APIR_BACKEND_DISPATCH_TABLE_COUNT = 11, + APIR_BACKEND_DISPATCH_TABLE_COUNT = 12, } ApirBackendCommandType; diff --git a/ggml/src/ggml-remotingfrontend/CMakeLists.txt b/ggml/src/ggml-remotingfrontend/CMakeLists.txt index accdbc473ecc7..5410b80c86f43 100644 --- a/ggml/src/ggml-remotingfrontend/CMakeLists.txt +++ b/ggml/src/ggml-remotingfrontend/CMakeLists.txt @@ -9,12 +9,12 @@ ggml_add_backend_library(ggml-remotingfrontend ggml-backend-device.cpp ggml-backend-reg.cpp ggml-buffer-type.cpp - ggml-host-buffer-type.cpp virtgpu.cpp virtgpu-shm.cpp virtgpu-utils.cpp virtgpu-forward-device.cpp virtgpu-forward-buffer-type.cpp + virtgpu-forward-impl.h ../../include/ggml-remoting-frontend.h ) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index c0c98c8b8a511..0d955014e0fcf 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -70,34 +70,33 @@ ggml_backend_remoting_device_offload_op(ggml_backend_dev_t dev, const ggml_tenso UNUSED(dev); } -static ggml_backend_buffer_type_t -ggml_backend_remoting_device_get_host_buffer_type(ggml_backend_dev_t dev) { - UNUSED(dev); - - IMPLEMENTED; - - return ggml_backend_remoting_host_buffer_type(); -} - - static void ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { IMPLEMENTED; + struct virtgpu *gpu = DEV_TO_GPU(dev); + props->name = ggml_backend_remoting_device_get_name(dev); props->description = ggml_backend_remoting_device_get_description(dev); props->type = ggml_backend_remoting_device_get_type(dev); ggml_backend_remoting_device_get_memory(dev, &props->memory_free, &props->memory_total); - props->caps = { - /* .async = */ false, - /* .host_buffer = */ true, - /* .buffer_from_host_ptr = */ false, - /* .events = */ false, - }; + + apir_device_get_props(gpu, + &props->caps.async, + &props->caps.host_buffer, + &props->caps.buffer_from_host_ptr, + &props->caps.events + ); + + INFO("%s: async=%d, host_buffer=%d, buffer_from_host_ptr=%d, events=%d", + __func__, props->caps.async, props->caps.host_buffer, + props->caps.buffer_from_host_ptr, props->caps.events); } ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { + IMPLEMENTED; + struct virtgpu *gpu = DEV_TO_GPU(dev); apir_buffer_type_context_t ctx = apir_device_get_buffer_type(gpu); @@ -111,6 +110,18 @@ ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { return &buft; } +static ggml_backend_buffer_t ggml_backend_remoting_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { + UNUSED(dev); + UNUSED(ptr); + UNUSED(size); + UNUSED(max_tensor_size); + + NOT_IMPLEMENTED; + STOP_HERE; + + return nullptr; +} + const struct ggml_backend_device_i ggml_backend_remoting_device_i = { 
/* .get_name = */ ggml_backend_remoting_device_get_name, /* .get_description = */ ggml_backend_remoting_device_get_description, @@ -119,8 +130,8 @@ const struct ggml_backend_device_i ggml_backend_remoting_device_i = { /* .get_props = */ ggml_backend_remoting_device_get_props, /* .init_backend = */ ggml_backend_remoting_device_init, /* .get_buffer_type = */ ggml_backend_remoting_device_get_buffer_type, - /* .get_host_buffer_type = */ ggml_backend_remoting_device_get_host_buffer_type, - /* .buffer_from_host_ptr = */ NULL, + /* .get_host_buffer_type = */ NULL, + /* .buffer_from_host_ptr = */ ggml_backend_remoting_device_buffer_from_ptr, /* .supports_op = */ ggml_backend_remoting_device_supports_op, /* .supports_buft = */ ggml_backend_remoting_device_supports_buft, /* .offload_op = */ ggml_backend_remoting_device_offload_op, diff --git a/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp index d34904abb1ef0..4882904759566 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp @@ -10,7 +10,6 @@ ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, BEING_IMPLEMENTED; struct virtgpu *gpu = BUFT_TO_GPU(buft); UNUSED(gpu); - /* ... */ void *ctx = NULL; @@ -19,7 +18,7 @@ ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, static const char * ggml_backend_remoting_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - BEING_IMPLEMENTED; + IMPLEMENTED; struct virtgpu *gpu = BUFT_TO_GPU(buft); @@ -72,11 +71,12 @@ static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffe } static enum ggml_status ggml_backend_remoting_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + UNUSED(buffer); + UNUSED(tensor); + NEXT; NOT_IMPLEMENTED; - if (tensor->view_src != nullptr) { - GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); - } + return GGML_STATUS_SUCCESS; } diff --git a/ggml/src/ggml-remotingfrontend/ggml-host-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-host-buffer-type.cpp deleted file mode 100644 index bcbd3fa57f156..0000000000000 --- a/ggml/src/ggml-remotingfrontend/ggml-host-buffer-type.cpp +++ /dev/null @@ -1,63 +0,0 @@ -#include "ggml-remoting.h" - -// host buffer type - -static const char * ggml_backend_remoting_host_buffer_type_name(ggml_backend_buffer_type_t buft) { - UNUSED(buft); - - NOT_IMPLEMENTED; - - return GGML_REMOTING_FRONTEND_NAME "_Host"; -} - -static void ggml_backend_remoting_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { - UNUSED(buffer); - NOT_IMPLEMENTED; - -# if 0 - ggml_remoting_host_free(remoting_instance.devices[0], buffer->context); -#endif -} - -static ggml_backend_buffer_t ggml_backend_remoting_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - UNUSED(buft); - - NOT_IMPLEMENTED; - - void *ptr = nullptr; - ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); - buffer->buft = buft; - buffer->iface.free_buffer = ggml_backend_remoting_host_buffer_free_buffer; - - return buffer; -} - -static size_t ggml_backend_remoting_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - UNUSED(buft); - - NOT_IMPLEMENTED; - return 4096; -} - -// Should be changed to return device-specific host buffer type -// but that probably requires changes in llama.cpp -ggml_backend_buffer_type_t ggml_backend_remoting_host_buffer_type() { - static struct ggml_backend_buffer_type 
ggml_backend_remoting_buffer_type_host = { - /* .iface = */ { - /* .get_name = */ ggml_backend_remoting_host_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_remoting_host_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_remoting_host_buffer_type_get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX - /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, - /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, - }, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_remoting_frontend_reg(), 0), - /* .context = */ nullptr, - }; - - // Make sure device 0 is initialized - //ggml_remoting_instance_init(); - //ggml_remoting_get_device(0); - - return &ggml_backend_remoting_buffer_type_host; -} diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index 8ba40c0b7f7ad..2230622abf35b 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -26,6 +26,9 @@ #define NEXT +#define STOP_HERE \ + thks_bye() + #define IMPLEMENTED // printf("INFO: ### reached implemented function %s\n", __func__) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp index b8a42f7f621b9..4c2a7b6c4de75 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp @@ -1,29 +1,16 @@ -#include "ggml-backend-impl.h" -#include "virtgpu.h" -#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/apir_backend.h" -#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h" - -#define CACHED -// printf("INFO: ### found response in the cache %s\n", __func__) - - +#include "virtgpu-forward-impl.h" // buffer_type_alloc_buffer const char * apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { - int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME; + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; - struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); - if (!encoder) { - FATAL("%s: failed to prepare the remote call encoder :/", __func__); - } + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME); vn_encode_ggml_buft(encoder, buft); - struct vn_cs_decoder *decoder = remote_call(gpu, encoder); - if (!decoder) { - FATAL("%s: failed to kick the remote call :/", __func__); - } + REMOTE_CALL(gpu, encoder, decoder); const size_t string_size = vn_decode_array_size_unchecked(decoder); char *string = (char *) vn_cs_decoder_alloc_array(decoder, sizeof(char), string_size); @@ -36,100 +23,68 @@ apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) /* *** */ - int32_t ret = remote_call_finish(encoder, decoder); - if (ret != 0) { - FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); - } + REMOTE_CALL_FINISH(gpu, encoder, decoder); return string; } size_t apir_buffer_type_get_alignment(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { - static int32_t dev_count = -1; - if (dev_count != -1) { - CACHED; - return dev_count; - } - int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT; - struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); - if (!encoder) { - FATAL("%s: 
failed to prepare the remote call encoder :/", __func__); - } + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT); vn_encode_ggml_buft(encoder, buft); - struct vn_cs_decoder *decoder = remote_call(gpu, encoder); - if (!decoder) { - FATAL("%s: failed to kick the remote call :/", __func__); - } + REMOTE_CALL(gpu, encoder, decoder); size_t alignment; vn_decode_size_t(decoder, &alignment); INFO("%s: Forward BUFT ALIGNMENT --> %zu ", __func__, alignment); - int32_t ret = remote_call_finish(encoder, decoder); - if (ret != 0) { - FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); - } + REMOTE_CALL_FINISH(gpu, encoder, decoder); return alignment; } size_t apir_buffer_type_get_max_size(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { - int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE; - struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); - if (!encoder) { - FATAL("%s: failed to prepare the remote call encoder :/", __func__); - } + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE); vn_encode_ggml_buft(encoder, buft); - struct vn_cs_decoder *decoder = remote_call(gpu, encoder); - if (!decoder) { - FATAL("%s: failed to kick the remote call :/", __func__); - } + REMOTE_CALL(gpu, encoder, decoder); size_t max_size; vn_decode_size_t(decoder, &max_size); INFO("%s: Forward BUFT MAX SIZE --> %zu ", __func__, max_size); - int32_t ret = remote_call_finish(encoder, decoder); - if (ret != 0) { - FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); - } + REMOTE_CALL_FINISH(gpu, encoder, decoder); return max_size; } bool apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { - int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST; + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; - struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); - if (!encoder) { - FATAL("%s: failed to prepare the remote call encoder :/", __func__); - } + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST); vn_encode_ggml_buft(encoder, buft); - struct vn_cs_decoder *decoder = remote_call(gpu, encoder); - if (!decoder) { - FATAL("%s: failed to kick the remote call :/", __func__); - } + REMOTE_CALL(gpu, encoder, decoder); bool is_host; vn_decode_bool_t(decoder, &is_host); - /* *** */ - - int32_t ret = remote_call_finish(encoder, decoder); - if (ret != 0) { - FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); - } + REMOTE_CALL_FINISH(gpu, encoder, decoder); return is_host; } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp index 1dd303e8c96bf..d25081f0d1634 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp @@ -1,10 +1,4 @@ -#include "ggml-backend-impl.h" -#include "virtgpu.h" -#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/apir_backend.h" -#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h" - -#define CACHED -// printf("INFO: ### found response in the cache %s\n", __func__) +#include "virtgpu-forward-impl.h" int 
apir_device_get_count(struct virtgpu *gpu) { @@ -13,50 +7,37 @@ apir_device_get_count(struct virtgpu *gpu) { CACHED; return dev_count; } - int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_DEVICE_GET_COUNT; - struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); - if (!encoder) { - FATAL("%s: failed to prepare the remote call encoder :/", __func__); - } - struct vn_cs_decoder *decoder = remote_call(gpu, encoder); - if (!decoder) { - FATAL("%s: failed to kick the remote call :/", __func__); - } + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_COUNT); + REMOTE_CALL(gpu, encoder, decoder); vn_decode_int32_t(decoder, &dev_count); INFO("%s: Forward DEV COUNT --> %d ", __func__, dev_count); - int32_t ret = remote_call_finish(encoder, decoder); - if (ret != 0) { - FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); - } + REMOTE_CALL_FINISH(gpu, encoder, decoder); return dev_count; } const char * apir_device_get_name(struct virtgpu *gpu) { - static int32_t dev_count = -1; - if (dev_count != -1) { + static char *string = nullptr; + if (string) { CACHED; - return "Nothing"; - } - - int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_DEVICE_GET_NAME; - struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); - if (!encoder) { - FATAL("%s: failed to prepare the remote call encoder :/", __func__); + return string; } + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; - struct vn_cs_decoder *decoder = remote_call(gpu, encoder); - if (!decoder) { - FATAL("%s: failed to kick the remote call :/", __func__); - } + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_NAME); + REMOTE_CALL(gpu, encoder, decoder); const size_t string_size = vn_decode_array_size_unchecked(decoder); - char *string = (char *) vn_cs_decoder_alloc_array(decoder, sizeof(char), string_size); + string = (char *) vn_cs_decoder_alloc_array(decoder, sizeof(char), string_size); if (!string) { FATAL("%s: Could not allocate the device name buffer", __func__); } @@ -64,31 +45,19 @@ apir_device_get_name(struct virtgpu *gpu) { INFO("%s: Forward DEV NAME --> %s", __func__, string); - int32_t ret = remote_call_finish(encoder, decoder); - if (ret != 0) { - FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); - } + REMOTE_CALL_FINISH(gpu, encoder, decoder); return string; } const char * apir_device_get_description(struct virtgpu *gpu) { - static int32_t dev_count = -1; - if (dev_count != -1) { - CACHED; - return "Nothing"; - } - int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION; - struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); - if (!encoder) { - FATAL("%s: failed to prepare the remote call encoder :/", __func__); - } + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; - struct vn_cs_decoder *decoder = remote_call(gpu, encoder); - if (!decoder) { - FATAL("%s: failed to kick the remote call :/", __func__); - } + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION); + + REMOTE_CALL(gpu, encoder, decoder); const size_t string_size = vn_decode_array_size_unchecked(decoder); char *string = (char *) vn_cs_decoder_alloc_array(decoder, sizeof(char), string_size); @@ -99,10 +68,7 @@ apir_device_get_description(struct virtgpu *gpu) { INFO("%s: Forward DEV DESCR --> %s", __func__, string); - int32_t 
ret = remote_call_finish(encoder, decoder); - if (ret != 0) { - FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); - } + REMOTE_CALL_FINISH(gpu, encoder, decoder); return string; } @@ -114,26 +80,19 @@ apir_device_get_type(struct virtgpu *gpu) { CACHED; return dev_type; } - int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_DEVICE_GET_TYPE; - struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); - if (!encoder) { - FATAL("%s: failed to prepare the remote call encoder :/", __func__); - } + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; - struct vn_cs_decoder *decoder = remote_call(gpu, encoder); - if (!decoder) { - FATAL("%s: failed to kick the remote call :/", __func__); - } + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_TYPE); + + REMOTE_CALL(gpu, encoder, decoder); vn_decode_uint32_t(decoder, &dev_type); INFO("%s: Forward DEV TYPE --> %d ", __func__, dev_type); - int32_t ret = remote_call_finish(encoder, decoder); - if (ret != 0) { - FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); - } + REMOTE_CALL_FINISH(gpu, encoder, decoder); return dev_type; } @@ -152,17 +111,12 @@ apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total) { return; } */ - int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_DEVICE_GET_MEMORY; + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; - struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); - if (!encoder) { - FATAL("%s: failed to prepare the remote call encoder :/", __func__); - } + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_MEMORY); - struct vn_cs_decoder *decoder = remote_call(gpu, encoder); - if (!decoder) { - FATAL("%s: failed to kick the remote call :/", __func__); - } + REMOTE_CALL(gpu, encoder, decoder); vn_decode_size_t(decoder, &dev_free); vn_decode_size_t(decoder, &dev_total); @@ -173,66 +127,72 @@ apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total) { INFO("%s: Forward DEV FREE mem --> %zu MB", __func__, dev_free / 1024 / 1024); INFO("%s: Forward DEV TOTAL mem --> %zu MB", __func__, dev_total / 1024 / 1024); - int32_t ret = remote_call_finish(encoder, decoder); - if (ret != 0) { - FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); - } + + REMOTE_CALL_FINISH(gpu, encoder, decoder); return; } bool apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op) { - int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP; + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; - struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); - if (!encoder) { - FATAL("%s: failed to prepare the remote call encoder :/", __func__); - } + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP); vn_encode_ggml_tensor(encoder, op); - struct vn_cs_decoder *decoder = remote_call(gpu, encoder); - if (!decoder) { - FATAL("%s: failed to kick the remote call :/", __func__); - } + + REMOTE_CALL(gpu, encoder, decoder); bool supports_op; vn_decode_bool_t(decoder, &supports_op); /* *** */ - int32_t ret = remote_call_finish(encoder, decoder); - if (ret != 0) { - FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); - } + REMOTE_CALL_FINISH(gpu, encoder, decoder); return supports_op; } apir_buffer_type_context_t apir_device_get_buffer_type(struct virtgpu *gpu) { - int32_t forward_flag = (int32_t) 
APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE; + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; - struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); - if (!encoder) { - FATAL("%s: failed to prepare the remote call encoder :/", __func__); - } + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE); - struct vn_cs_decoder *decoder = remote_call(gpu, encoder); - if (!decoder) { - FATAL("%s: failed to kick the remote call :/", __func__); - } + REMOTE_CALL(gpu, encoder, decoder); apir_buffer_type_context_t buffer_type_ctx; vn_decode_apir_buffer_type_context_t(decoder, &buffer_type_ctx); /* *** */ - - int32_t ret = remote_call_finish(encoder, decoder); - if (ret != 0) { - FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); - } + REMOTE_CALL_FINISH(gpu, encoder, decoder); return buffer_type_ctx; } + +void +apir_device_get_props(struct virtgpu *gpu, + bool *async, + bool *host_buffer, + bool *buffer_from_host_ptr, + bool *events) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_PROPS); + + REMOTE_CALL(gpu, encoder, decoder); + + vn_decode_bool_t(decoder, async); + vn_decode_bool_t(decoder, host_buffer); + vn_decode_bool_t(decoder, buffer_from_host_ptr); + vn_decode_bool_t(decoder, events); + + /* *** */ + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h new file mode 100644 index 0000000000000..4f9af992d70c9 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h @@ -0,0 +1,33 @@ +#include "ggml-backend-impl.h" +#include "virtgpu.h" +#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/apir_backend.h" +#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h" + +#define CACHED +// printf("INFO: ### found response in the cache %s\n", __func__)o + + +#define REMOTE_CALL_PREPARE(gpu_dev_name, encoder_name, apir_command_type__) \ + do { \ + int32_t forward_flag = (int32_t) apir_command_type__; \ + encoder_name = remote_call_prepare(gpu_dev_name, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); \ + if (!encoder) { \ + FATAL("%s: failed to prepare the remote call encoder :/", __func__); \ + } \ + } while(0) + +#define REMOTE_CALL(gpu_dev_name, encoder_name, decoder_name) \ + do { \ + decoder_name = remote_call(gpu_dev_name, encoder_name); \ + if (!decoder) { \ + FATAL("%s: failed to kick the remote call :/", __func__); \ + } \ + } while(0) + +#define REMOTE_CALL_FINISH(gpu_dev_name, encoder_name, decoder_name) \ + do { \ + int32_t ret = remote_call_finish(encoder_name, decoder_name); \ + if (ret != 0) { \ + FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); \ + } \ + } while(0) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index c484d7eeab8c1..5a9b3c15c82ba 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -12,6 +12,11 @@ uint32_t apir_device_get_type(struct virtgpu *gpu); void apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total); bool apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op); apir_buffer_type_context_t apir_device_get_buffer_type(struct virtgpu *gpu); +void 
apir_device_get_props(struct virtgpu *gpu, + bool *async, + bool *host_buffer, + bool *buffer_from_host_ptr, + bool *events); /* buffer-type */ // buffer_type_alloc_buffer From 88e8ec3d9562c717ae2d4546f227ca0b6a73d88f Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 15 May 2025 10:10:11 +0200 Subject: [PATCH 051/117] Keep working on buffer types and buffers --- .../shared/apir_backend.h | 7 ++- .../ggml-remotingbackend/shared/venus_cs.h | 10 ++-- ggml/src/ggml-remotingfrontend/CMakeLists.txt | 3 +- ...-type.cpp => ggml-backend-buffer-type.cpp} | 18 ++++-- .../ggml-backend-device.cpp | 32 +++++++++-- .../ggml-backend-host-buffer-type.cpp | 56 +++++++++++++++++++ .../ggml-backend-reg.cpp | 2 +- .../src/ggml-remotingfrontend/ggml-remoting.h | 3 +- .../virtgpu-forward-buffer-type.cpp | 15 +++++ .../virtgpu-forward-device.cpp | 9 ++- .../ggml-remotingfrontend/virtgpu-forward.h | 5 +- 11 files changed, 131 insertions(+), 29 deletions(-) rename ggml/src/ggml-remotingfrontend/{ggml-buffer-type.cpp => ggml-backend-buffer-type.cpp} (92%) create mode 100644 ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index abc20a981ca6b..644fae7938379 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -14,7 +14,8 @@ #define APIR_BACKEND_FORWARD_INDEX_INVALID 6 -typedef void * apir_buffer_type_context_t; +typedef uintptr_t apir_buffer_type_handle_t; +typedef uintptr_t apir_buffer_handle_t; typedef uint32_t (*apir_backend_initialize_t)(void); typedef void (*apir_backend_deinit_t)(void); @@ -41,7 +42,9 @@ typedef enum ApirBackendCommandType { APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT = 9, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE = 10, APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST = 11, + APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER = 12, + APIR_COMMAND_TYPE_BUFFER_GET_BASE = 13, // last command_type index + 1 - APIR_BACKEND_DISPATCH_TABLE_COUNT = 12, + APIR_BACKEND_DISPATCH_TABLE_COUNT = 14, } ApirBackendCommandType; diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h index c796cd3f8e893..bc9048f44e315 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -453,16 +453,16 @@ vn_decode_bool_t(struct vn_cs_decoder *dec, bool *val) vn_decode(dec, sizeof(int), val, sizeof(int)); } -/* apir_buffer_type_context_t */ +/* apir_buffer_type_handle_t */ static inline void -vn_encode_apir_buffer_type_context_t(struct vn_cs_encoder *enc, const apir_buffer_type_context_t *val) +vn_encode_apir_buffer_type_handle_t(struct vn_cs_encoder *enc, const apir_buffer_type_handle_t *val) { - vn_encode(enc, sizeof(apir_buffer_type_context_t), val, sizeof(apir_buffer_type_context_t)); + vn_encode(enc, sizeof(apir_buffer_type_handle_t), val, sizeof(apir_buffer_type_handle_t)); } static inline void -vn_decode_apir_buffer_type_context_t(struct vn_cs_decoder *dec, apir_buffer_type_context_t *val) +vn_decode_apir_buffer_type_handle_t(struct vn_cs_decoder *dec, apir_buffer_type_handle_t *val) { - vn_decode(dec, sizeof(apir_buffer_type_context_t), val, sizeof(apir_buffer_type_context_t)); + vn_decode(dec, sizeof(apir_buffer_type_handle_t), val, sizeof(apir_buffer_type_handle_t)); } diff --git a/ggml/src/ggml-remotingfrontend/CMakeLists.txt b/ggml/src/ggml-remotingfrontend/CMakeLists.txt index 5410b80c86f43..a2b3277584b38 
100644 --- a/ggml/src/ggml-remotingfrontend/CMakeLists.txt +++ b/ggml/src/ggml-remotingfrontend/CMakeLists.txt @@ -8,7 +8,8 @@ ggml_add_backend_library(ggml-remotingfrontend ggml-backend.cpp ggml-backend-device.cpp ggml-backend-reg.cpp - ggml-buffer-type.cpp + ggml-backend-buffer-type.cpp + ggml-backend-host-buffer-type.cpp virtgpu.cpp virtgpu-shm.cpp virtgpu-utils.cpp diff --git a/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp similarity index 92% rename from ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp rename to ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp index 4882904759566..22f962ec27579 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp @@ -3,6 +3,9 @@ #define BUFT_TO_GPU(name) \ ((struct ggml_backend_remoting_device_context *) (name)->device->context)->gpu +#define BUFFER_TO_GPU(name) \ + ((struct ggml_backend_remoting_device_context *) (name)->dev->context)->gpu + extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface; static ggml_backend_buffer_t @@ -11,9 +14,9 @@ ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, struct virtgpu *gpu = BUFT_TO_GPU(buft); UNUSED(gpu); - void *ctx = NULL; + apir_buffer_handle_t handle = apir_buffer_type_alloc_buffer(gpu, size); - return ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, ctx, size); + return ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) handle, size); } static const char * @@ -76,17 +79,20 @@ static enum ggml_status ggml_backend_remoting_buffer_init_tensor(ggml_backend_bu NEXT; NOT_IMPLEMENTED; - + STOP_HERE; return GGML_STATUS_SUCCESS; } static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) { UNUSED(buffer); + BEING_IMPLEMENTED; + + STOP_HERE; + return NULL; + //struct virtgpu *gpu = BUFFER_TO_GPU(buffer); - NEXT; - NOT_IMPLEMENTED; - return (void *) 4096; + //return apir_buffer_get_base(gpu, (ggml_backend_buffer_t)buffer->context); } static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index 0d955014e0fcf..9a72139b4d2ed 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -88,7 +88,12 @@ ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, struct ggml_backe &props->caps.events ); - INFO("%s: async=%d, host_buffer=%d, buffer_from_host_ptr=%d, events=%d", + // ignore the actual backend answers and set it as we provide it in + // the API Remoting frontend + props->caps.host_buffer = true; + props->caps.buffer_from_host_ptr = false; + + INFO("%s: async=%d, host_buffer=%d!, buffer_from_host_ptr=%d!, events=%d", __func__, props->caps.async, props->caps.host_buffer, props->caps.buffer_from_host_ptr, props->caps.events); } @@ -99,12 +104,12 @@ ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { struct virtgpu *gpu = DEV_TO_GPU(dev); - apir_buffer_type_context_t ctx = apir_device_get_buffer_type(gpu); + apir_buffer_type_handle_t ctx = apir_device_get_buffer_type(gpu); static struct ggml_backend_buffer_type buft { /* .iface = */ ggml_backend_remoting_buffer_type_interface, /* .device = */ dev, - /* .context = */ ctx, + /* 
.context = */ (void *) ctx, }; return &buft; @@ -122,7 +127,22 @@ static ggml_backend_buffer_t ggml_backend_remoting_device_buffer_from_ptr(ggml_b return nullptr; } -const struct ggml_backend_device_i ggml_backend_remoting_device_i = { +static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_host_buffer_type(ggml_backend_dev_t dev) { + + static struct ggml_backend_buffer_type host_bufft = { + /* .iface = */ ggml_backend_remoting_host_buffer_type_interface, + /* .device = */ dev, + /* .context = */ nullptr, + }; + + // Make sure device 0 is initialized + //ggml_remoting_instance_init(); + //ggml_remoting_get_device(0); + + return &host_bufft; +} + +const struct ggml_backend_device_i ggml_backend_remoting_device_interface = { /* .get_name = */ ggml_backend_remoting_device_get_name, /* .get_description = */ ggml_backend_remoting_device_get_description, /* .get_memory = */ ggml_backend_remoting_device_get_memory, @@ -130,8 +150,8 @@ const struct ggml_backend_device_i ggml_backend_remoting_device_i = { /* .get_props = */ ggml_backend_remoting_device_get_props, /* .init_backend = */ ggml_backend_remoting_device_init, /* .get_buffer_type = */ ggml_backend_remoting_device_get_buffer_type, - /* .get_host_buffer_type = */ NULL, - /* .buffer_from_host_ptr = */ ggml_backend_remoting_device_buffer_from_ptr, + /* .get_host_buffer_type = */ ggml_backend_remoting_device_get_host_buffer_type, + /* .buffer_from_host_ptr = */ NULL, /* .supports_op = */ ggml_backend_remoting_device_supports_op, /* .supports_buft = */ ggml_backend_remoting_device_supports_buft, /* .offload_op = */ ggml_backend_remoting_device_offload_op, diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp new file mode 100644 index 0000000000000..3aef4b86e2b6a --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp @@ -0,0 +1,56 @@ +#include "ggml-remoting.h" + +#define BUFT_TO_GPU(name) \ + ((struct ggml_backend_remoting_device_context *) (name)->device->context)->gpu + +extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface; + +static ggml_backend_buffer_t +ggml_backend_remoting_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + BEING_IMPLEMENTED; + struct virtgpu *gpu = BUFT_TO_GPU(buft); + UNUSED(gpu); + + void *ctx = NULL; + + NOT_IMPLEMENTED; + + STOP_HERE; + return ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, ctx, size); +} + +static const char * +ggml_backend_remoting_host_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + UNUSED(buft); + + IMPLEMENTED; + + return "GUEST host buffer"; +} + +static size_t +ggml_backend_remoting_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + UNUSED(buft); + + NOT_IMPLEMENTED; + + return 4096; +} + +static bool +ggml_backend_remoting_host_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + UNUSED(buft); + + NOT_IMPLEMENTED; + + return true; +} + +const ggml_backend_buffer_type_i ggml_backend_remoting_host_buffer_type_interface = { + /* .get_name = */ ggml_backend_remoting_host_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_remoting_host_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_remoting_host_buffer_type_get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, + /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + }; diff --git 
a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp index 06bcb0310cbc6..eeac6c59db670 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp @@ -77,7 +77,7 @@ static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_ ctx->gpu = gpu; devices.push_back(new ggml_backend_device { - /* .iface = */ ggml_backend_remoting_device_i, + /* .iface = */ ggml_backend_remoting_device_interface, /* .reg = */ reg, /* .context = */ ctx, }); diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index 2230622abf35b..ecdfcc1f31384 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -43,7 +43,8 @@ struct ggml_backend_remoting_device_context { }; extern const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface; -extern const struct ggml_backend_device_i ggml_backend_remoting_device_i; +extern const struct ggml_backend_device_i ggml_backend_remoting_device_interface; +extern const ggml_backend_buffer_type_i ggml_backend_remoting_host_buffer_type_interface; ggml_backend_buffer_type_t ggml_backend_remoting_host_buffer_type(); ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params); diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp index 4c2a7b6c4de75..39c205edacef0 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp @@ -88,3 +88,18 @@ apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { return is_host; } + +apir_buffer_handle_t +apir_buffer_type_alloc_buffer(struct virtgpu *gpu, size_t size) { + UNUSED(gpu); + UNUSED(size); + + return 0; +} + +void * +apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle) { + UNUSED(gpu); + UNUSED(buffer_handle); + return NULL; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp index d25081f0d1634..7c241d71a1679 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp @@ -155,7 +155,7 @@ apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op) { return supports_op; } -apir_buffer_type_context_t +apir_buffer_type_handle_t apir_device_get_buffer_type(struct virtgpu *gpu) { struct vn_cs_encoder *encoder; struct vn_cs_decoder *decoder; @@ -164,13 +164,12 @@ apir_device_get_buffer_type(struct virtgpu *gpu) { REMOTE_CALL(gpu, encoder, decoder); - apir_buffer_type_context_t buffer_type_ctx; - vn_decode_apir_buffer_type_context_t(decoder, &buffer_type_ctx); + apir_buffer_type_handle_t buft_handle; + vn_decode_apir_buffer_type_handle_t(decoder, &buft_handle); - /* *** */ REMOTE_CALL_FINISH(gpu, encoder, decoder); - return buffer_type_ctx; + return buft_handle; } void diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index 5a9b3c15c82ba..521029c3bee9e 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -11,7 +11,7 @@ const char *apir_device_get_description(struct virtgpu *gpu); uint32_t apir_device_get_type(struct virtgpu *gpu); void 
apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total); bool apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op); -apir_buffer_type_context_t apir_device_get_buffer_type(struct virtgpu *gpu); +apir_buffer_type_handle_t apir_device_get_buffer_type(struct virtgpu *gpu); void apir_device_get_props(struct virtgpu *gpu, bool *async, bool *host_buffer, @@ -19,8 +19,9 @@ void apir_device_get_props(struct virtgpu *gpu, bool *events); /* buffer-type */ -// buffer_type_alloc_buffer const char *apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); size_t apir_buffer_type_get_alignment(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); size_t apir_buffer_type_get_max_size(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); bool apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); +apir_buffer_handle_t apir_buffer_type_alloc_buffer(struct virtgpu *gpu, size_t size); +void *apir_buffer_get_base(struct virtgpu *gpu, ggml_backend_buffer_t buffer); From 43af3a093de35e33daa2ea51fdfdd80e64f2b604 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 15 May 2025 14:11:59 +0200 Subject: [PATCH 052/117] implemnt alloc_buffer and get_base --- .../backend-dispatched-buffer-type.cpp | 28 +++++++++ .../backend-dispatched-device.cpp | 4 +- .../ggml-remotingbackend/backend-dispatched.h | 11 ++++ .../ggml-remotingbackend/shared/venus_cs.h | 28 +++++++++ .../shared/venus_cs_ggml.h | 41 ++++++++++--- .../ggml-backend-buffer-type.cpp | 32 +++++----- .../src/ggml-remotingfrontend/ggml-remoting.h | 23 ++----- .../virtgpu-forward-buffer-type.cpp | 60 +++++++++++++++---- .../ggml-remotingfrontend/virtgpu-forward.h | 4 +- 9 files changed, 175 insertions(+), 56 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp index 1d17a69f27056..cceec68064742 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp @@ -54,3 +54,31 @@ backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec return 0; } + +uint32_t +backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + ggml_backend_buffer_type_t buft; + buft = vn_decode_ggml_buft(dec); + + size_t size; + vn_decode_size_t(dec, &size); + + ggml_backend_buffer_t buffer = buft->iface.alloc_buffer(buft, size); + apir_buffer_handle_t *buffer_handle = (apir_buffer_handle_t *) buffer; + vn_encode_ggml_buffer_handle(enc, buffer_handle); + + return 0; +} + +uint32_t +backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + ggml_backend_buffer_t buffer; + buffer = vn_decode_ggml_buffer(dec); + + uintptr_t base = (uintptr_t) buffer->iface.get_base(buffer); + vn_encode_uintptr_t(enc, &base); + + INFO("%s: send base %p\n", __func__, (void *) base); + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp index 7062b061defbb..2db2e75816258 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp @@ -82,8 +82,8 @@ backend_device_get_buffer_type(struct vn_cs_encoder *enc, struct vn_cs_decoder * ggml_backend_buffer_type_t bufft = dev->iface.get_buffer_type(dev); - apir_buffer_type_context_t bufft_ctx = (apir_buffer_type_context_t) bufft; - 
vn_encode_apir_buffer_type_context_t(enc, &bufft_ctx); + apir_buffer_type_handle_t buft_handle = (apir_buffer_type_handle_t) bufft; + vn_encode_apir_buffer_type_handle_t(enc, &buft_handle); return 0; } diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h index 356742d3ba174..26e2762bf72b5 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.h +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -33,6 +33,10 @@ uint32_t backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_de uint32_t backend_buffer_type_get_alignment(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); uint32_t backend_buffer_type_get_max_size(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); uint32_t backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +uint32_t backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); + +/* buffer */ +uint32_t backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); static inline const char *backend_dispatch_command_name(ApirBackendCommandType type) { @@ -52,7 +56,10 @@ static inline const char *backend_dispatch_command_name(ApirBackendCommandType t case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT: return "backend_buffer_type_get_alignment"; case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE: return "backend_buffer_type_get_max_size"; case APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST: return "backend_buffer_type_is_host"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER: return "backend_buffer_type_alloc_buffer"; + /* buffer */ + case APIR_COMMAND_TYPE_BUFFER_GET_BASE: return "backend_buffer_get_base"; default: return "unknown"; } } @@ -73,4 +80,8 @@ static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATC [APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT] = backend_buffer_type_get_alignment, [APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE] = backend_buffer_type_get_max_size, [APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST] = backend_buffer_type_is_host, + [APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER] = backend_buffer_type_alloc_buffer, + + /* buffer */ + [APIR_COMMAND_TYPE_BUFFER_GET_BASE] = backend_buffer_get_base, }; diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h index bc9048f44e315..d2b85c8f82196 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -466,3 +466,31 @@ vn_decode_apir_buffer_type_handle_t(struct vn_cs_decoder *dec, apir_buffer_type_ { vn_decode(dec, sizeof(apir_buffer_type_handle_t), val, sizeof(apir_buffer_type_handle_t)); } + +/* apir_buffer_handle_t */ + +static inline void +vn_encode_apir_buffer_handle_t(struct vn_cs_encoder *enc, const apir_buffer_handle_t *val) +{ + vn_encode(enc, sizeof(apir_buffer_handle_t), val, sizeof(apir_buffer_handle_t)); +} + +static inline void +vn_decode_apir_buffer_handle_t(struct vn_cs_decoder *dec, apir_buffer_handle_t *val) +{ + vn_decode(dec, sizeof(apir_buffer_handle_t), val, sizeof(apir_buffer_handle_t)); +} + +/* uintptr_t */ + +static inline void +vn_encode_uintptr_t(struct vn_cs_encoder *enc, const uintptr_t *val) +{ + vn_encode(enc, sizeof(*val), val, sizeof(*val)); +} + +static inline void +vn_decode_uintptr_t(struct vn_cs_decoder *dec, uintptr_t *val) +{ + vn_decode(dec, sizeof(*val), val, sizeof(*val)); +} diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h 
b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h index 4302424aadce0..a587cad3b23bf 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h @@ -33,19 +33,44 @@ vn_decode_ggml_tensor_inplace(struct vn_cs_decoder *dec) { return op; } -static inline void -vn_encode_ggml_buft(struct vn_cs_encoder *enc, ggml_backend_buffer_type_t buft) { - size_t buft_ctx_size = sizeof(buft->context); +/* *** ggml_backend_buffer_type_t *** */ + +// ggml_backend_buffer_type_t is a POINTER (to a struct). +// Only the host pointer is shared between the host and guest. +// The guest stores it in `buft->context`. +// The host simply writes the pointer address in the buffer variable. - vn_cs_encoder_write(enc, buft_ctx_size, &buft->context, buft_ctx_size); + +static inline void +vn_encode_apir_buffer_type_handle_t(struct vn_cs_encoder *enc, apir_buffer_type_handle_t *handle) { + vn_cs_encoder_write(enc, sizeof(*handle), handle, sizeof(*handle)); } static inline ggml_backend_buffer_type_t vn_decode_ggml_buft(struct vn_cs_decoder *dec) { - ggml_backend_buffer_type_t buft; - size_t buft_size = sizeof(buft); + apir_buffer_type_handle_t handle; + + vn_cs_decoder_read(dec, sizeof(handle), &handle, sizeof(handle)); + + return (ggml_backend_buffer_type_t) handle; +} + +/* *** ggml_backend_type_t *** */ + +// ggml_backend_buffer_t is a POINTER. +// same logic as for ggml_backend_buffer_type_t + +static inline void +vn_encode_ggml_buffer_handle(struct vn_cs_encoder *enc, const apir_buffer_handle_t *handle) { + vn_cs_encoder_write(enc, sizeof(*handle), &handle, sizeof(*handle)); +} + +static inline ggml_backend_buffer_t +vn_decode_ggml_buffer(struct vn_cs_decoder *dec) { + ggml_backend_buffer_t buffer; + size_t buffer_ptr_size = sizeof(buffer); - vn_cs_decoder_read(dec, buft_size, &buft, buft_size); + vn_cs_decoder_read(dec, buffer_ptr_size, &buffer, buffer_ptr_size); - return buft; + return buffer; } diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp index 22f962ec27579..bc22310d277bf 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp @@ -4,19 +4,24 @@ ((struct ggml_backend_remoting_device_context *) (name)->device->context)->gpu #define BUFFER_TO_GPU(name) \ - ((struct ggml_backend_remoting_device_context *) (name)->dev->context)->gpu + ((struct ggml_backend_remoting_buffer_context *) (name)->context)->gpu extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface; static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - BEING_IMPLEMENTED; + IMPLEMENTED; struct virtgpu *gpu = BUFT_TO_GPU(buft); - UNUSED(gpu); - apir_buffer_handle_t handle = apir_buffer_type_alloc_buffer(gpu, size); + struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) malloc(sizeof(*context)); + if (!context) { + FATAL("Couldn't allocate the buffer context ..."); + } + + context->gpu = gpu; + context->handle = apir_buffer_type_alloc_buffer(gpu, buft, size); - return ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) handle, size); + return ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) context, size); } static const char * @@ -69,7 +74,7 @@ static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffe 
NEXT; NOT_IMPLEMENTED; - ggml_remoting_destroy_buffer(ctx->dev_buffer); + //ggml_remoting_destroy_buffer(ctx->dev_buffer); delete ctx; } @@ -85,14 +90,11 @@ static enum ggml_status ggml_backend_remoting_buffer_init_tensor(ggml_backend_bu static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) { UNUSED(buffer); - BEING_IMPLEMENTED; - - STOP_HERE; - return NULL; - //struct virtgpu *gpu = BUFFER_TO_GPU(buffer); + IMPLEMENTED; + struct virtgpu *gpu = BUFFER_TO_GPU(buffer); - //return apir_buffer_get_base(gpu, (ggml_backend_buffer_t)buffer->context); + return apir_buffer_get_base(gpu, ((struct ggml_backend_remoting_buffer_context *) buffer->context)->handle); } static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { @@ -175,9 +177,11 @@ static void ggml_remoting_buffer_memset_async(remoting_context& ctx, remoting_bu static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { NOT_IMPLEMENTED; - ggml_backend_remoting_buffer_context * ctx = (ggml_backend_remoting_buffer_context *)buffer->context; + UNUSED(buffer); + UNUSED(value); + //ggml_backend_remoting_buffer_context * ctx = (ggml_backend_remoting_buffer_context *)buffer->context; - ggml_remoting_buffer_memset(ctx->dev_buffer, 0, value, buffer->size); + //ggml_remoting_buffer_memset(ctx->dev_buffer, 0, value, buffer->size); } const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index ecdfcc1f31384..49ab2f34e0530 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -42,6 +42,12 @@ struct ggml_backend_remoting_device_context { struct virtgpu *gpu; }; +struct ggml_backend_remoting_buffer_context { + apir_buffer_handle_t handle; + + struct virtgpu *gpu; +}; + extern const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface; extern const struct ggml_backend_device_i ggml_backend_remoting_device_interface; extern const ggml_backend_buffer_type_i ggml_backend_remoting_host_buffer_type_interface; @@ -61,23 +67,6 @@ struct remoting_device_struct; typedef std::shared_ptr remoting_device; typedef std::weak_ptr remoting_device_ref; -struct ggml_backend_remoting_buffer_context { - remoting_device_ref device; - remoting_buffer dev_buffer; - std::string name; - - ggml_backend_remoting_buffer_context(remoting_device_ref device, remoting_buffer&& dev_buffer, std::string& name) : - name(name) { - UNUSED(device); - UNUSED(dev_buffer); - } - - ~ggml_backend_remoting_buffer_context() { - ggml_remoting_destroy_buffer(dev_buffer); - } -}; - - struct remoting_context_struct { int i; }; diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp index 39c205edacef0..f072f0cac81a7 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp @@ -1,6 +1,5 @@ #include "virtgpu-forward-impl.h" -// buffer_type_alloc_buffer const char * apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { struct vn_cs_encoder *encoder; @@ -8,7 +7,8 @@ apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME); - vn_encode_ggml_buft(encoder, buft); + 
apir_buffer_type_handle_t handle = (apir_buffer_type_handle_t) buft->context; + vn_encode_apir_buffer_handle_t(encoder, &handle); REMOTE_CALL(gpu, encoder, decoder); @@ -35,7 +35,8 @@ apir_buffer_type_get_alignment(struct virtgpu *gpu, ggml_backend_buffer_type_t b REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT); - vn_encode_ggml_buft(encoder, buft); + apir_buffer_type_handle_t handle = (apir_buffer_type_handle_t) buft->context; + vn_encode_apir_buffer_handle_t(encoder, &handle); REMOTE_CALL(gpu, encoder, decoder); @@ -56,7 +57,8 @@ apir_buffer_type_get_max_size(struct virtgpu *gpu, ggml_backend_buffer_type_t bu REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE); - vn_encode_ggml_buft(encoder, buft); + apir_buffer_type_handle_t handle = (apir_buffer_type_handle_t) buft->context; + vn_encode_apir_buffer_handle_t(encoder, &handle); REMOTE_CALL(gpu, encoder, decoder); @@ -77,7 +79,8 @@ apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST); - vn_encode_ggml_buft(encoder, buft); + apir_buffer_type_handle_t handle = (apir_buffer_type_handle_t) buft->context; + vn_encode_apir_buffer_handle_t(encoder, &handle); REMOTE_CALL(gpu, encoder, decoder); @@ -90,16 +93,47 @@ apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { } apir_buffer_handle_t -apir_buffer_type_alloc_buffer(struct virtgpu *gpu, size_t size) { - UNUSED(gpu); - UNUSED(size); +apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_backend_buffer_type_t buft, size_t size) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + INFO("%s: allocate device memory (%lu)\n", __func__, size); + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER); + + apir_buffer_type_handle_t buft_handle = (apir_buffer_type_handle_t) buft->context; + vn_encode_apir_buffer_handle_t(encoder, &buft_handle); + + vn_encode_size_t(encoder, &size); + + REMOTE_CALL(gpu, encoder, decoder); + + apir_buffer_handle_t buffer_handle; + vn_decode_apir_buffer_handle_t(decoder, &buffer_handle); + INFO("%s: received buffer handle %p\n", __func__, (void *) buffer_handle); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); - return 0; + return buffer_handle; } void * -apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle) { - UNUSED(gpu); - UNUSED(buffer_handle); - return NULL; +apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_handle_t handle) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_GET_BASE); + + vn_encode_apir_buffer_handle_t(encoder, &handle); + + REMOTE_CALL(gpu, encoder, decoder); + + uintptr_t base; + vn_decode_uintptr_t(decoder, &base); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + INFO("%s: received base %p\n", __func__, (void *) base); + + return (void *) base; } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index 521029c3bee9e..dda345d27c574 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -23,5 +23,5 @@ const char *apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_t size_t apir_buffer_type_get_alignment(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); size_t apir_buffer_type_get_max_size(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); bool 
apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); -apir_buffer_handle_t apir_buffer_type_alloc_buffer(struct virtgpu *gpu, size_t size); -void *apir_buffer_get_base(struct virtgpu *gpu, ggml_backend_buffer_t buffer); +apir_buffer_handle_t apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_backend_buffer_type_t buft, size_t size); +void *apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_handle_t handle); From 25f8d24d7c6e138382193994dd65a7d170c3851c Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 15 May 2025 14:29:16 +0200 Subject: [PATCH 053/117] buffer: clean ups --- .../ggml-backend-buffer-type.cpp | 126 ------------------ .../ggml-backend-buffer.cpp | 99 +++++++++++--- .../src/ggml-remotingfrontend/ggml-remoting.h | 1 + 3 files changed, 81 insertions(+), 145 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp index bc22310d277bf..3a3d445958504 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp @@ -3,11 +3,6 @@ #define BUFT_TO_GPU(name) \ ((struct ggml_backend_remoting_device_context *) (name)->device->context)->gpu -#define BUFFER_TO_GPU(name) \ - ((struct ggml_backend_remoting_buffer_context *) (name)->context)->gpu - -extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface; - static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { IMPLEMENTED; @@ -73,125 +68,4 @@ static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffe ggml_backend_remoting_buffer_context * ctx = (ggml_backend_remoting_buffer_context *)buffer->context; NEXT; NOT_IMPLEMENTED; - - //ggml_remoting_destroy_buffer(ctx->dev_buffer); - delete ctx; -} - -static enum ggml_status ggml_backend_remoting_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - UNUSED(buffer); - UNUSED(tensor); - - NEXT; - NOT_IMPLEMENTED; - STOP_HERE; - return GGML_STATUS_SUCCESS; -} - -static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) { - UNUSED(buffer); - IMPLEMENTED; - - struct virtgpu *gpu = BUFFER_TO_GPU(buffer); - - return apir_buffer_get_base(gpu, ((struct ggml_backend_remoting_buffer_context *) buffer->context)->handle); -} - -static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { - NOT_IMPLEMENTED; - - UNUSED(buffer); - UNUSED(tensor); - UNUSED(value); - UNUSED(offset); - UNUSED(size); -} - - -static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - - NOT_IMPLEMENTED; - -#if 0 - ggml_backend_remoting_buffer_context * buf_ctx = (ggml_backend_remoting_buffer_context *)buffer->context; - remoting_buffer buf = buf_ctx->dev_buffer; - - ggml_remoting_buffer_write(buf, remoting_tensor_offset(tensor) + tensor->view_offs + offset, data, size); -#else - UNUSED(buffer); - UNUSED(tensor); - UNUSED(data); - UNUSED(offset); - UNUSED(size); -#endif -} - -static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { - NOT_IMPLEMENTED; - -#if 0 - ggml_backend_remoting_buffer_context * buf_ctx = (ggml_backend_remoting_buffer_context *)buffer->context; - - remoting_buffer buf = 
buf_ctx->dev_buffer; - - ggml_remoting_buffer_read(buf, remoting_tensor_offset(tensor) + tensor->view_offs + offset, data, size); -#else - UNUSED(buffer); - UNUSED(tensor); - UNUSED(data); - UNUSED(offset); - UNUSED(size); -#endif } - - -static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { - NOT_IMPLEMENTED; - - return true; - - UNUSED(buffer); - UNUSED(src); - UNUSED(dst); -} - -static void ggml_remoting_buffer_memset(remoting_buffer& dst, size_t offset, uint32_t c, size_t size) { - NOT_IMPLEMENTED; - - UNUSED(dst); - UNUSED(c); - UNUSED(size); - UNUSED(offset); -} - -static void ggml_remoting_buffer_memset_async(remoting_context& ctx, remoting_buffer& dst, size_t offset, uint32_t c, size_t size) { - NOT_IMPLEMENTED; - - UNUSED(ctx); - UNUSED(dst); - UNUSED(c); - UNUSED(size); - UNUSED(offset); -} - -static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - NOT_IMPLEMENTED; - - UNUSED(buffer); - UNUSED(value); - //ggml_backend_remoting_buffer_context * ctx = (ggml_backend_remoting_buffer_context *)buffer->context; - - //ggml_remoting_buffer_memset(ctx->dev_buffer, 0, value, buffer->size); -} - -const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { - /* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer, - /* .get_base = */ ggml_backend_remoting_buffer_get_base, - /* .init_tensor = */ ggml_backend_remoting_buffer_init_tensor, - /* .memset_tensor = */ ggml_backend_remoting_buffer_memset_tensor, - /* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor, - /* .cpy_tensor = */ ggml_backend_remoting_buffer_cpy_tensor, - /* .clear = */ ggml_backend_remoting_buffer_clear, - /* .reset = */ NULL, -}; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index d4cd4e013f66c..25e4ed47c29a0 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -1,49 +1,110 @@ -#include - #include "ggml-remoting.h" -void ggml_remoting_destroy_buffer(remoting_buffer& buf) { +#define BUFFER_TO_GPU(name) \ + ((struct ggml_backend_remoting_buffer_context *) (name)->context)->gpu + +static enum ggml_status ggml_backend_remoting_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + UNUSED(buffer); + UNUSED(tensor); + + NEXT; NOT_IMPLEMENTED; + STOP_HERE; + return GGML_STATUS_SUCCESS; +} + +static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) { + UNUSED(buffer); + IMPLEMENTED; - UNUSED(buf); + struct virtgpu *gpu = BUFFER_TO_GPU(buffer); + + return apir_buffer_get_base(gpu, ((struct ggml_backend_remoting_buffer_context *) buffer->context)->handle); } -static void ggml_remoting_buffer_write(remoting_buffer& dst, size_t offset, const void * src, size_t size) { +static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { NOT_IMPLEMENTED; - UNUSED(dst); + UNUSED(buffer); + UNUSED(tensor); + UNUSED(value); UNUSED(offset); - UNUSED(src); UNUSED(size); } -static void ggml_remoting_buffer_read(remoting_buffer& src, size_t offset, void * dst, size_t size) { + +static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + NOT_IMPLEMENTED; - UNUSED(src); + 
UNUSED(buffer); + UNUSED(tensor); + UNUSED(data); UNUSED(offset); + UNUSED(size); +} + +static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + NOT_IMPLEMENTED; + + UNUSED(buffer); + UNUSED(tensor); + UNUSED(data); + UNUSED(offset); + UNUSED(size); +} + + +static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { + NOT_IMPLEMENTED; + + return true; + + UNUSED(buffer); + UNUSED(src); + UNUSED(dst); +} + +static void ggml_remoting_buffer_memset(remoting_buffer& dst, size_t offset, uint32_t c, size_t size) { + NOT_IMPLEMENTED; + UNUSED(dst); + UNUSED(c); UNUSED(size); + UNUSED(offset); } -static void ggml_remoting_buffer_copy_async(remoting_context& ctx, remoting_buffer& dst, size_t dst_offset, remoting_buffer& src, size_t src_offset, size_t size) { +static void ggml_remoting_buffer_memset_async(remoting_context& ctx, remoting_buffer& dst, size_t offset, uint32_t c, size_t size) { NOT_IMPLEMENTED; UNUSED(ctx); UNUSED(dst); - UNUSED(dst_offset); - UNUSED(src); - UNUSED(src_offset); + UNUSED(c); UNUSED(size); + UNUSED(offset); } -static void * const remoting_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT +static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + UNUSED(buffer); + UNUSED(value); -static uint64_t remoting_tensor_offset(const ggml_tensor * tensor) { NOT_IMPLEMENTED; +} + +static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffer) { + UNUSED(buffer); - if (tensor->view_src) { - return (uint8_t *) tensor->view_src->data - (uint8_t *) remoting_ptr_base; - } - return (uint8_t *) tensor->data - (uint8_t *) remoting_ptr_base; + NOT_IMPLEMENTED; } + +const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { + /* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer, + /* .get_base = */ ggml_backend_remoting_buffer_get_base, + /* .init_tensor = */ ggml_backend_remoting_buffer_init_tensor, + /* .memset_tensor = */ ggml_backend_remoting_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_remoting_buffer_cpy_tensor, + /* .clear = */ ggml_backend_remoting_buffer_clear, + /* .reset = */ NULL, +}; diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index 49ab2f34e0530..8072c0e356d48 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -51,6 +51,7 @@ struct ggml_backend_remoting_buffer_context { extern const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface; extern const struct ggml_backend_device_i ggml_backend_remoting_device_interface; extern const ggml_backend_buffer_type_i ggml_backend_remoting_host_buffer_type_interface; +extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface; ggml_backend_buffer_type_t ggml_backend_remoting_host_buffer_type(); ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params); From db107bb35fff93df18814a87a6f7b718873303e2 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 15 May 2025 16:58:25 +0200 Subject: [PATCH 054/117] Keep working on buffers --- ggml/src/ggml-remotingbackend/CMakeLists.txt | 1 + .../ggml-remotingbackend/backend-convert.h | 7 ++ .../backend-dispatched-buffer-type.cpp | 15 
---- .../backend-dispatched-buffer.cpp | 20 ++++++ .../backend-dispatched-device.cpp | 2 - .../ggml-remotingbackend/backend-dispatched.h | 3 +- .../shared/apir_backend.h | 2 + .../shared/venus_cs_ggml.h | 69 +++++++++++++++---- ggml/src/ggml-remotingfrontend/CMakeLists.txt | 1 + .../ggml-backend-buffer-type.cpp | 6 -- .../ggml-backend-buffer.cpp | 20 ++---- .../ggml-backend-device.cpp | 6 +- .../ggml-backend-host-buffer-type.cpp | 18 ++++- .../src/ggml-remotingfrontend/ggml-remoting.h | 15 +++- .../virtgpu-forward-buffer-type.cpp | 23 +------ .../virtgpu-forward-buffer.cpp | 22 ++++++ .../virtgpu-forward-impl.h | 1 + .../ggml-remotingfrontend/virtgpu-forward.h | 8 ++- 18 files changed, 155 insertions(+), 84 deletions(-) create mode 100644 ggml/src/ggml-remotingbackend/backend-convert.h create mode 100644 ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp create mode 100644 ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp diff --git a/ggml/src/ggml-remotingbackend/CMakeLists.txt b/ggml/src/ggml-remotingbackend/CMakeLists.txt index 17ca5e1f53a54..feca344c90a64 100644 --- a/ggml/src/ggml-remotingbackend/CMakeLists.txt +++ b/ggml/src/ggml-remotingbackend/CMakeLists.txt @@ -7,6 +7,7 @@ ggml_add_backend_library(ggml-remotingbackend backend.cpp backend-dispatched.cpp backend-dispatched-device.cpp + backend-dispatched-buffer.cpp backend-dispatched-buffer-type.cpp backend-utils.cpp shared/api_remoting.h diff --git a/ggml/src/ggml-remotingbackend/backend-convert.h b/ggml/src/ggml-remotingbackend/backend-convert.h new file mode 100644 index 0000000000000..e7d875cde7ee8 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-convert.h @@ -0,0 +1,7 @@ +#include "shared/apir_backend.h" + +static inline apir_buffer_handle_t +ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) { + // in the backend, the buffer handle is the buffer pointer + return (apir_buffer_handle_t) buffer; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp index cceec68064742..da8a50d67ccb7 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp @@ -6,8 +6,6 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" -#include "ggml-metal.h" - uint32_t backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { ggml_backend_buffer_type_t buft; @@ -69,16 +67,3 @@ backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder return 0; } - -uint32_t -backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { - ggml_backend_buffer_t buffer; - buffer = vn_decode_ggml_buffer(dec); - - uintptr_t base = (uintptr_t) buffer->iface.get_base(buffer); - vn_encode_uintptr_t(enc, &base); - - INFO("%s: send base %p\n", __func__, (void *) base); - - return 0; -} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp new file mode 100644 index 0000000000000..095a95f1a6fae --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp @@ -0,0 +1,20 @@ +#include +#include "backend-internal.h" +#include "backend-dispatched.h" + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" + +uint32_t +backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + ggml_backend_buffer_t buffer; + buffer = vn_decode_ggml_buffer(dec); + + uintptr_t 
base = (uintptr_t) buffer->iface.get_base(buffer); + vn_encode_uintptr_t(enc, &base); + + //INFO("%s: send base %p\n", __func__, (void *) base); + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp index 2db2e75816258..21b603a3160b6 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp @@ -6,8 +6,6 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" -#include "ggml-metal.h" - uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { UNUSED(dec); diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h index 26e2762bf72b5..460d3a1af4e05 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.h +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -6,11 +6,11 @@ #include #include "backend-utils.h" +#include "backend-convert.h" #include "shared/apir_backend.h" #include "shared/venus_cs.h" #include "shared/venus_cs_ggml.h" - uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_backend_init_fct_p); typedef uint32_t (*backend_dispatch_t)(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); @@ -60,6 +60,7 @@ static inline const char *backend_dispatch_command_name(ApirBackendCommandType t /* buffer */ case APIR_COMMAND_TYPE_BUFFER_GET_BASE: return "backend_buffer_get_base"; + default: return "unknown"; } } diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 644fae7938379..08433c014f2a3 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -43,6 +43,8 @@ typedef enum ApirBackendCommandType { APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE = 10, APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST = 11, APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER = 12, + + /* buffer */ APIR_COMMAND_TYPE_BUFFER_GET_BASE = 13, // last command_type index + 1 diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h index a587cad3b23bf..637a0a8368aad 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h @@ -1,19 +1,45 @@ // needs the ggml-backend-impl.h definition // needs venus_cs.h definition +// needs +// ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer); + +static inline void +vn_encode_ggml_buffer_handle(struct vn_cs_encoder *enc, const apir_buffer_handle_t *handle); + +static inline ggml_backend_buffer_t +vn_decode_ggml_buffer(struct vn_cs_decoder *dec); + static inline void -vn_encode_ggml_tensor(struct vn_cs_encoder *enc, const ggml_tensor *op) { - size_t tensor_size = sizeof(*op); +vn_encode_ggml_tensor(struct vn_cs_encoder *enc, const ggml_tensor *tensor) { + size_t tensor_size = sizeof(*tensor); - if (op->buffer || op->data || op->view_src || op->extra) { - FATAL("Cannot pass tensors with data"); + if (tensor->view_src) { + FATAL("Cannot pass tensors with view_src"); + } + if (tensor->extra) { + FATAL("Cannot pass tensors with extra"); + } + + if (tensor->src[0] && tensor->buffer) { + // not sure if the buffer needs to be updated inside the src tensors or not + FATAL("Cannot pass tensors with src and buffer"); } - vn_cs_encoder_write(enc, tensor_size, op, tensor_size); + vn_cs_encoder_write(enc, tensor_size, tensor, 
tensor_size); - for (int i = 0; op->src[i]; i++) { - const ggml_tensor *src_op = op->src[i]; - vn_cs_encoder_write(enc, tensor_size, src_op, tensor_size); + // tensor->data is a pointer inside the device buffer. No need to touch it + // tensor->buffer is a pointer to a buffer. Encoding the buffer handle in sequence. + // (could also make a copy of the tensor, and update locally.) + + if (tensor->buffer) { + apir_buffer_handle_t buffer_handle = ggml_buffer_to_apir_handle(tensor->buffer); + vn_encode_ggml_buffer_handle(enc, &buffer_handle); + } + + for (int i = 0; tensor->src[i]; i++) { + const ggml_tensor *src_tensor = tensor->src[i]; + vn_cs_encoder_write(enc, tensor_size, src_tensor, tensor_size); } } @@ -22,15 +48,20 @@ vn_decode_ggml_tensor_inplace(struct vn_cs_decoder *dec) { // it safe to remove the `const` qualifier here, we *do* want to // modify the shared memory data to fix the `src` pointers. - ggml_tensor *op = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); + ggml_tensor *tensor = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); + // tensor->data is a pointer inside the device buffer. No need to touch it + // tensor->buffer is a pointer to a buffer. Decode the buffer handle encoded in sequence. + if (tensor->buffer) { + tensor->buffer = vn_decode_ggml_buffer(dec); + } - for (int i = 0; op->src[i]; i++) { - ggml_tensor *src_op = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); - op->src[i] = src_op; // overwrite op->src[i] pointer with the actual location of the src tensor + for (int i = 0; tensor->src[i]; i++) { + ggml_tensor *src_tensor = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); + tensor->src[i] = src_tensor; // overwrite op->src[i] pointer with the actual location of the src tensor } - return op; + return tensor; } /* *** ggml_backend_buffer_type_t *** */ @@ -74,3 +105,15 @@ vn_decode_ggml_buffer(struct vn_cs_decoder *dec) { return buffer; } + +/* enum ggml_status */ + +static inline void +vn_encode_ggml_status(struct vn_cs_encoder *enc, const enum ggml_status *status) { + vn_cs_encoder_write(enc, sizeof(*status), &status, sizeof(*status)); +} + +static inline void +vn_decode_ggml_status(struct vn_cs_decoder *dec, enum ggml_status *status) { + vn_cs_decoder_read(dec, sizeof(*status), status, sizeof(*status)); +} diff --git a/ggml/src/ggml-remotingfrontend/CMakeLists.txt b/ggml/src/ggml-remotingfrontend/CMakeLists.txt index a2b3277584b38..b77a0254a7a6c 100644 --- a/ggml/src/ggml-remotingfrontend/CMakeLists.txt +++ b/ggml/src/ggml-remotingfrontend/CMakeLists.txt @@ -15,6 +15,7 @@ ggml_add_backend_library(ggml-remotingfrontend virtgpu-utils.cpp virtgpu-forward-device.cpp virtgpu-forward-buffer-type.cpp + virtgpu-forward-buffer.cpp virtgpu-forward-impl.h ../../include/ggml-remoting-frontend.h ) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp index 3a3d445958504..bb326570d975c 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp @@ -63,9 +63,3 @@ const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = { }; /****************************************************************************************/ - -static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_backend_remoting_buffer_context * ctx = 
(ggml_backend_remoting_buffer_context *)buffer->context; - NEXT; - NOT_IMPLEMENTED; -} diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index 25e4ed47c29a0..27a8efdea0d7e 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -3,28 +3,19 @@ #define BUFFER_TO_GPU(name) \ ((struct ggml_backend_remoting_buffer_context *) (name)->context)->gpu -static enum ggml_status ggml_backend_remoting_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - UNUSED(buffer); - UNUSED(tensor); - - NEXT; - NOT_IMPLEMENTED; - STOP_HERE; - return GGML_STATUS_SUCCESS; -} - static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) { - UNUSED(buffer); - IMPLEMENTED; + //IMPLEMENTED; struct virtgpu *gpu = BUFFER_TO_GPU(buffer); - return apir_buffer_get_base(gpu, ((struct ggml_backend_remoting_buffer_context *) buffer->context)->handle); + return apir_buffer_get_base(gpu, BUFFER_TO_HANDLE(buffer)); } static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { NOT_IMPLEMENTED; + STOP_HERE; + UNUSED(buffer); UNUSED(tensor); UNUSED(value); @@ -34,7 +25,6 @@ static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buf static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - NOT_IMPLEMENTED; UNUSED(buffer); @@ -100,7 +90,7 @@ static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffe const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { /* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer, /* .get_base = */ ggml_backend_remoting_buffer_get_base, - /* .init_tensor = */ ggml_backend_remoting_buffer_init_tensor, + /* .init_tensor = */ NULL, /* .memset_tensor = */ ggml_backend_remoting_buffer_memset_tensor, /* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor, /* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor, diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index 9a72139b4d2ed..a7d0d9eb69c5d 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -41,7 +41,7 @@ ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, s static bool ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { - IMPLEMENTED; + //IMPLEMENTED; struct virtgpu *gpu = DEV_TO_GPU(dev); @@ -135,9 +135,7 @@ static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_host_buffer_t /* .context = */ nullptr, }; - // Make sure device 0 is initialized - //ggml_remoting_instance_init(); - //ggml_remoting_get_device(0); + //IMPLEMENTED; return &host_bufft; } diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp index 3aef4b86e2b6a..847a1ec0500fc 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp @@ -33,6 +33,7 @@ ggml_backend_remoting_host_buffer_type_get_alignment(ggml_backend_buffer_type_t UNUSED(buft); NOT_IMPLEMENTED; + STOP_HERE; return 4096; } @@ -41,16 +42,27 @@ static bool 
ggml_backend_remoting_host_buffer_type_is_host(ggml_backend_buffer_type_t buft) { UNUSED(buft); - NOT_IMPLEMENTED; + IMPLEMENTED; + STOP_HERE; return true; } +static size_t +ggml_backend_remoting_host_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + UNUSED(buft); + + IMPLEMENTED; + STOP_HERE; + + return SIZE_MAX; +} + const ggml_backend_buffer_type_i ggml_backend_remoting_host_buffer_type_interface = { /* .get_name = */ ggml_backend_remoting_host_buffer_type_get_name, /* .alloc_buffer = */ ggml_backend_remoting_host_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_remoting_host_buffer_type_get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_max_size = */ ggml_backend_remoting_host_buffer_type_get_max_size, /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, - /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + /* .is_host = */ ggml_backend_remoting_host_buffer_type_is_host, }; diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index 8072c0e356d48..dc184c300f24d 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -10,6 +10,9 @@ #include "ggml-backend.h" #include "virtgpu.h" +#define BUFFER_TO_HANDLE(name) \ + ((struct ggml_backend_remoting_buffer_context *) (name)->context)->handle + #define NOT_IMPLEMENTED \ do { \ static bool first = true; \ @@ -29,8 +32,8 @@ #define STOP_HERE \ thks_bye() -#define IMPLEMENTED -// printf("INFO: ### reached implemented function %s\n", __func__) +#define IMPLEMENTED \ + printf("INFO: ### reached implemented function %s\n", __func__) #define RMT_LOG_DEBUG(msg) std::cerr << msg << std::endl @@ -48,6 +51,14 @@ struct ggml_backend_remoting_buffer_context { struct virtgpu *gpu; }; +static inline apir_buffer_handle_t ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) { + +// return buffer?0:1; + struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) buffer->context; + + return context->handle; +} + extern const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface; extern const struct ggml_backend_device_i ggml_backend_remoting_device_interface; extern const ggml_backend_buffer_type_i ggml_backend_remoting_host_buffer_type_interface; diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp index f072f0cac81a7..a8d5b351688ff 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp @@ -87,6 +87,8 @@ apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { bool is_host; vn_decode_bool_t(decoder, &is_host); + INFO("%s: buffer is host? 
%d", __func__, is_host); + REMOTE_CALL_FINISH(gpu, encoder, decoder); return is_host; @@ -116,24 +118,3 @@ apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_backend_buffer_type_t bu return buffer_handle; } - -void * -apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_handle_t handle) { - struct vn_cs_encoder *encoder; - struct vn_cs_decoder *decoder; - - REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_GET_BASE); - - vn_encode_apir_buffer_handle_t(encoder, &handle); - - REMOTE_CALL(gpu, encoder, decoder); - - uintptr_t base; - vn_decode_uintptr_t(decoder, &base); - - REMOTE_CALL_FINISH(gpu, encoder, decoder); - - INFO("%s: received base %p\n", __func__, (void *) base); - - return (void *) base; -} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp new file mode 100644 index 0000000000000..4ccadf98f1d7f --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp @@ -0,0 +1,22 @@ +#include "virtgpu-forward-impl.h" + +void * +apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_GET_BASE); + + vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); + + REMOTE_CALL(gpu, encoder, decoder); + + uintptr_t base; + vn_decode_uintptr_t(decoder, &base); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + //INFO("%s: received base %p\n", __func__, (void *) base); + + return (void *) base; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h index 4f9af992d70c9..a7ed708851d8f 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h @@ -1,4 +1,5 @@ #include "ggml-backend-impl.h" +#include "ggml-remoting.h" #include "virtgpu.h" #include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/apir_backend.h" #include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h" diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index dda345d27c574..074af30275621 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -23,5 +23,9 @@ const char *apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_t size_t apir_buffer_type_get_alignment(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); size_t apir_buffer_type_get_max_size(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); bool apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); -apir_buffer_handle_t apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_backend_buffer_type_t buft, size_t size); -void *apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_handle_t handle); +apir_buffer_handle_t apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_backend_buffer_type_t buffer_buft, size_t size); + +/* buffer */ + +void *apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle); +enum ggml_status apir_buffer_init_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, ggml_tensor *tensor); From 248f69590d475e04d2f2d3eb1ef7bc8e5d1036b1 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Fri, 16 May 2025 10:59:21 +0200 Subject: [PATCH 055/117] build.backend: build llama-run --- build.backend.sh 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.backend.sh b/build.backend.sh index b32c24b9ba035..086f7a4577ddd 100755 --- a/build.backend.sh +++ b/build.backend.sh @@ -4,7 +4,7 @@ rm -f READY_backend FAILED_backend echo "int isatty(int fd) { return 1; }" | gcc -O2 -fpic -shared -ldl -o /tmp/isatty.so -xc - export LD_PRELOAD=/tmp/isatty.so -cmake --build ../build.remoting-backend --parallel 8 --target llama-cli "$@" +cmake --build ../build.remoting-backend --parallel 8 --target llama-run "$@" if [[ $? == 0 ]]; then touch READY_backend From 2e70ad0965387831b5b35304a9cc335c08400128 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Fri, 16 May 2025 10:59:42 +0200 Subject: [PATCH 056/117] ggml: src: ggml-remotingbackend/shared/venus_cs: fix memory corruption caused by vn_decode ... --- ggml/src/ggml-remotingbackend/shared/venus_cs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h index d2b85c8f82196..c8149a5b58a29 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -444,13 +444,13 @@ vn_cs_decoder_alloc_array(struct vn_cs_decoder *dec, size_t size, size_t count) static inline void vn_encode_bool_t(struct vn_cs_encoder *enc, const bool *val) { - vn_encode(enc, sizeof(int), val, sizeof(int)); + vn_encode(enc, sizeof(int), val, sizeof(bool)); } static inline void vn_decode_bool_t(struct vn_cs_decoder *dec, bool *val) { - vn_decode(dec, sizeof(int), val, sizeof(int)); + vn_decode(dec, sizeof(int), val, sizeof(bool)); } /* apir_buffer_type_handle_t */ From 4d7d6dc0e4ce44fc811b8afeb26fe091207d3bbf Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Fri, 16 May 2025 11:00:30 +0200 Subject: [PATCH 057/117] ggml: src: ggml-remotingfrontend/ggml-backend-device: handcode the caps --- .../src/ggml-remotingfrontend/ggml-backend-device.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index a7d0d9eb69c5d..ef48bd6fae96e 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -74,24 +74,27 @@ static void ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { IMPLEMENTED; - struct virtgpu *gpu = DEV_TO_GPU(dev); - props->name = ggml_backend_remoting_device_get_name(dev); props->description = ggml_backend_remoting_device_get_description(dev); props->type = ggml_backend_remoting_device_get_type(dev); ggml_backend_remoting_device_get_memory(dev, &props->memory_free, &props->memory_total); +#if 0 + struct virtgpu *gpu = DEV_TO_GPU(dev); apir_device_get_props(gpu, &props->caps.async, &props->caps.host_buffer, &props->caps.buffer_from_host_ptr, &props->caps.events ); - +#else // ignore the actual backend answers and set it as we provide it in // the API Remoting frontend - props->caps.host_buffer = true; + props->caps.async = false; + props->caps.host_buffer = false; props->caps.buffer_from_host_ptr = false; + props->caps.events = false; +#endif INFO("%s: async=%d, host_buffer=%d!, buffer_from_host_ptr=%d!, events=%d", __func__, props->caps.async, props->caps.host_buffer, From 6f0578fd2a0002a3c62527a4bdb5444d6fb1c40e Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Fri, 16 May 2025 14:14:53 +0200 Subject: [PATCH 058/117] remoting: implement 
buffer_set_tensor --- .../backend-dispatched-buffer.cpp | 28 +++++++++++++++++++ .../ggml-remotingbackend/backend-dispatched.h | 3 ++ .../shared/apir_backend.h | 3 +- .../ggml-backend-buffer.cpp | 14 ++++++---- .../virtgpu-forward-buffer.cpp | 23 +++++++++++++++ .../ggml-remotingfrontend/virtgpu-forward.h | 2 ++ 6 files changed, 66 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp index 095a95f1a6fae..85b313f52b58e 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp @@ -18,3 +18,31 @@ backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { return 0; } + +uint32_t +backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + UNUSED(enc); + + ggml_backend_buffer_t buffer; + buffer = vn_decode_ggml_buffer(dec); + + ggml_tensor *tensor; + // safe to remove the const qualifier here + tensor = (ggml_tensor *) (uintptr_t) vn_decode_ggml_tensor_inplace(dec); + + void *data; + vn_decode_uintptr_t(dec, (uintptr_t *) &data); + + size_t offset; + vn_decode_size_t(dec, &offset); + + size_t size; + vn_decode_size_t(dec, &size); + + INFO("Calling (%p)->set_tensor(tensor=%p, data=%p, offset=%lu, size=%lu", + buffer, tensor, data, offset, size); + + //buffer->iface.set_tensor(buffer, tensor, data, offset, size); + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h index 460d3a1af4e05..ce8cbc98eea24 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.h +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -37,6 +37,7 @@ uint32_t backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_c /* buffer */ uint32_t backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +uint32_t backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); static inline const char *backend_dispatch_command_name(ApirBackendCommandType type) { @@ -60,6 +61,7 @@ static inline const char *backend_dispatch_command_name(ApirBackendCommandType t /* buffer */ case APIR_COMMAND_TYPE_BUFFER_GET_BASE: return "backend_buffer_get_base"; + case APIR_COMMAND_TYPE_BUFFER_SET_TENSOR: return "backend_buffer_set_tensor"; default: return "unknown"; } @@ -85,4 +87,5 @@ static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATC /* buffer */ [APIR_COMMAND_TYPE_BUFFER_GET_BASE] = backend_buffer_get_base, + [APIR_COMMAND_TYPE_BUFFER_SET_TENSOR] = backend_buffer_set_tensor, }; diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 08433c014f2a3..cbc181c0089d4 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -46,7 +46,8 @@ typedef enum ApirBackendCommandType { /* buffer */ APIR_COMMAND_TYPE_BUFFER_GET_BASE = 13, + APIR_COMMAND_TYPE_BUFFER_SET_TENSOR = 14, // last command_type index + 1 - APIR_BACKEND_DISPATCH_TABLE_COUNT = 14, + APIR_BACKEND_DISPATCH_TABLE_COUNT = 15, } ApirBackendCommandType; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index 27a8efdea0d7e..aa0730efa55b9 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -25,13 +25,15 @@ 
static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buf static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - NOT_IMPLEMENTED; + BEING_IMPLEMENTED; - UNUSED(buffer); - UNUSED(tensor); - UNUSED(data); - UNUSED(offset); - UNUSED(size); + struct virtgpu *gpu = BUFFER_TO_GPU(buffer); + + INFO("%s: data=%p, offset=%lu, size=%lu\n", __func__, data, offset, size); + + apir_buffer_set_tensor(gpu, BUFFER_TO_HANDLE(buffer), tensor, data, offset, size); + + return; } static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp index 4ccadf98f1d7f..550a849dcd2f0 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp @@ -20,3 +20,26 @@ apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle) { return (void *) base; } + +void +apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, + ggml_tensor *tensor, const void *data, size_t offset, size_t size) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + INFO("Calling (%p)->set_tensor(tensor=%p, data=%p, offset=%lu, size=%lu"); + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_SET_TENSOR); + + vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); + vn_encode_ggml_tensor(encoder, tensor); + vn_encode_uintptr_t(encoder, (uintptr_t *) &data); + vn_encode_size_t(encoder, &offset); + vn_encode_size_t(encoder, &size); + + REMOTE_CALL(gpu, encoder, decoder); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index 074af30275621..2790adbb62454 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -29,3 +29,5 @@ apir_buffer_handle_t apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_bac void *apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle); enum ggml_status apir_buffer_init_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, ggml_tensor *tensor); +void apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, + ggml_tensor *tensor, const void *data, size_t offset, size_t size); From 6f396ccc658c3ecc9cfa039114f209b5372a1b19 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Fri, 16 May 2025 14:15:06 +0200 Subject: [PATCH 059/117] remoting: improve --- .../ggml-remotingfrontend/ggml-backend-buffer-type.cpp | 2 +- ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp | 2 +- ggml/src/ggml-remotingfrontend/ggml-backend.cpp | 10 +++++----- .../virtgpu-forward-buffer-type.cpp | 5 ++--- .../ggml-remotingfrontend/virtgpu-forward-buffer.cpp | 3 ++- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp index bb326570d975c..631db50b309cc 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp @@ -21,7 +21,7 @@ ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, static const char * 
ggml_backend_remoting_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - IMPLEMENTED; + //IMPLEMENTED; struct virtgpu *gpu = BUFT_TO_GPU(buft); diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index aa0730efa55b9..069886358f6f0 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -58,7 +58,7 @@ static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer } static void ggml_remoting_buffer_memset(remoting_buffer& dst, size_t offset, uint32_t c, size_t size) { - NOT_IMPLEMENTED; + BEING_IMPLEMENTED; UNUSED(dst); UNUSED(c); diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp index 6c2f2b947e10b..4bd321b5fc5c9 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp @@ -9,15 +9,17 @@ static const char * ggml_backend_remoting_get_name(ggml_backend_t backend) { } static void ggml_backend_remoting_free(ggml_backend_t backend) { - UNUSED(backend); + IMPLEMENTED; - NOT_IMPLEMENTED; + delete backend; } static ggml_status ggml_backend_remoting_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { UNUSED(backend); UNUSED(cgraph); + NOT_IMPLEMENTED; + return GGML_STATUS_SUCCESS; } @@ -38,9 +40,7 @@ static ggml_backend_i ggml_backend_remoting_interface = { }; static ggml_guid_t ggml_backend_remoting_guid() { - static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b }; - - NOT_IMPLEMENTED; + static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x14, 0x03, 0x86, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b }; return &guid; } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp index a8d5b351688ff..645780715a133 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp @@ -19,7 +19,7 @@ apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) } vn_decode_char_array(decoder, string, string_size); - INFO("%s: Forward BUFT NAME --> %s", __func__, string); + //INFO("%s: Forward BUFT NAME --> %s", __func__, string); /* *** */ @@ -99,7 +99,7 @@ apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_backend_buffer_type_t bu struct vn_cs_encoder *encoder; struct vn_cs_decoder *decoder; - INFO("%s: allocate device memory (%lu)\n", __func__, size); + INFO("%s: allocate device memory (%lu)", __func__, size); REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER); @@ -112,7 +112,6 @@ apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_backend_buffer_type_t bu apir_buffer_handle_t buffer_handle; vn_decode_apir_buffer_handle_t(decoder, &buffer_handle); - INFO("%s: received buffer handle %p\n", __func__, (void *) buffer_handle); REMOTE_CALL_FINISH(gpu, encoder, decoder); diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp index 550a849dcd2f0..ad65804ab27b0 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp @@ -27,7 +27,8 @@ apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, struct vn_cs_encoder *encoder; struct vn_cs_decoder *decoder; - 
INFO("Calling (%p)->set_tensor(tensor=%p, data=%p, offset=%lu, size=%lu"); + INFO("Calling (%p)->set_tensor(tensor=%p, data=%p, offset=%lu, size=%lu", + buffer_handle, tensor, data, offset, size); REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_SET_TENSOR); From d40a3da85fc6c3eab5bfe386fa5a366c5ab6c2ff Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 19 May 2025 11:21:46 +0200 Subject: [PATCH 060/117] remotingbackend: accept the virgl context argument --- .../backend-dispatched-buffer-type.cpp | 15 ++++++--- .../backend-dispatched-buffer.cpp | 6 ++-- .../backend-dispatched-device.cpp | 25 ++++++++++----- .../ggml-remotingbackend/backend-dispatched.h | 32 +++++++++---------- .../ggml-remotingbackend/backend-internal.h | 2 +- ggml/src/ggml-remotingbackend/backend.cpp | 4 +-- .../shared/apir_backend.h | 16 +++++++++- .../shared/venus_cs_ggml.h | 12 +++++++ 8 files changed, 77 insertions(+), 35 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp index da8a50d67ccb7..f09592ea5df43 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp @@ -7,7 +7,8 @@ #include "ggml-backend.h" uint32_t -backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { +backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); ggml_backend_buffer_type_t buft; buft = vn_decode_ggml_buft(dec); @@ -21,7 +22,8 @@ backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *de } uint32_t -backend_buffer_type_get_alignment(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { +backend_buffer_type_get_alignment(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); ggml_backend_buffer_type_t buft; buft = vn_decode_ggml_buft(dec); @@ -32,7 +34,8 @@ backend_buffer_type_get_alignment(struct vn_cs_encoder *enc, struct vn_cs_decode } uint32_t -backend_buffer_type_get_max_size(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { +backend_buffer_type_get_max_size(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); ggml_backend_buffer_type_t buft; buft = vn_decode_ggml_buft(dec); @@ -43,7 +46,8 @@ backend_buffer_type_get_max_size(struct vn_cs_encoder *enc, struct vn_cs_decoder } uint32_t -backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { +backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); ggml_backend_buffer_type_t buft; buft = vn_decode_ggml_buft(dec); @@ -54,7 +58,8 @@ backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec } uint32_t -backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { +backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); ggml_backend_buffer_type_t buft; buft = vn_decode_ggml_buft(dec); diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp index 85b313f52b58e..ff35a492cf100 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp @@ -7,7 +7,8 @@ #include "ggml-backend.h" 
uint32_t -backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { +backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); ggml_backend_buffer_t buffer; buffer = vn_decode_ggml_buffer(dec); @@ -20,7 +21,8 @@ backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { } uint32_t -backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { +backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); UNUSED(enc); ggml_backend_buffer_t buffer; diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp index 21b603a3160b6..ba2ec479a95c0 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp @@ -6,7 +6,9 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" -uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { +uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(ctx); UNUSED(dec); int32_t dev_count = reg->iface.get_device_count(reg); @@ -15,7 +17,8 @@ uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_de return 0; } -uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { +uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); UNUSED(dec); const char *string = dev->iface.get_name(dev); @@ -28,7 +31,8 @@ uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder } uint32_t -backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { +backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); UNUSED(dec); const char *string = dev->iface.get_description(dev); @@ -41,7 +45,8 @@ backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder * } uint32_t -backend_device_get_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { +backend_device_get_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); UNUSED(dec); uint32_t type = dev->iface.get_type(dev); @@ -51,7 +56,8 @@ backend_device_get_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { } uint32_t -backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { +backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); UNUSED(dec); size_t free, total; @@ -64,7 +70,8 @@ backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) } uint32_t -backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { +backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); const ggml_tensor *op = vn_decode_ggml_tensor_inplace(dec); bool supports_op = dev->iface.supports_op(dev, op); @@ -75,7 +82,8 @@ backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) } uint32_t -backend_device_get_buffer_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { +backend_device_get_buffer_type(struct 
vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); UNUSED(dec); ggml_backend_buffer_type_t bufft = dev->iface.get_buffer_type(dev); @@ -87,7 +95,8 @@ backend_device_get_buffer_type(struct vn_cs_encoder *enc, struct vn_cs_decoder * } uint32_t -backend_device_get_props(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { +backend_device_get_props(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); UNUSED(dec); struct ggml_backend_dev_props props; diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h index ce8cbc98eea24..faa3dacfc2297 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.h +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -13,31 +13,31 @@ uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_backend_init_fct_p); -typedef uint32_t (*backend_dispatch_t)(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +typedef uint32_t (*backend_dispatch_t)(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); /* *** */ -uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); /* device */ -uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); -uint32_t backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); -uint32_t backend_device_get_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); -uint32_t backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); -uint32_t backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); -uint32_t backend_device_get_buffer_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); -uint32_t backend_device_get_props(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_get_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_get_buffer_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_get_props(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); /* buffer-type */ -uint32_t backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); -uint32_t backend_buffer_type_get_alignment(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); -uint32_t backend_buffer_type_get_max_size(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); -uint32_t backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); -uint32_t backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +uint32_t backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t 
backend_buffer_type_get_alignment(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_type_get_max_size(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); /* buffer */ -uint32_t backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); -uint32_t backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +uint32_t backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); static inline const char *backend_dispatch_command_name(ApirBackendCommandType type) { diff --git a/ggml/src/ggml-remotingbackend/backend-internal.h b/ggml/src/ggml-remotingbackend/backend-internal.h index 7fd803c2aa5dd..5c29e18d4596a 100644 --- a/ggml/src/ggml-remotingbackend/backend-internal.h +++ b/ggml/src/ggml-remotingbackend/backend-internal.h @@ -21,7 +21,7 @@ extern ggml_backend_dev_t dev; extern "C" { uint32_t apir_backend_initialize(); void apir_backend_deinit(void); - uint32_t apir_backend_dispatcher(uint32_t cmd_type, + uint32_t apir_backend_dispatcher(uint32_t cmd_type, struct virgl_apir_context *ctx, char *dec_cur, const char *dec_end, char *enc_cur, const char *enc_end, char **enc_cur_after); diff --git a/ggml/src/ggml-remotingbackend/backend.cpp b/ggml/src/ggml-remotingbackend/backend.cpp index c32353586a10b..c9d784941d514 100644 --- a/ggml/src/ggml-remotingbackend/backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend.cpp @@ -58,7 +58,7 @@ extern "C" { return backend_dispatch_initialize(ggml_backend_reg_fct, ggml_backend_init_fct); } - uint32_t apir_backend_dispatcher(uint32_t cmd_type, + uint32_t apir_backend_dispatcher(uint32_t cmd_type, struct virgl_apir_context *ctx, char *dec_cur, const char *dec_end, char *enc_cur, const char *enc_end, char **enc_cur_after) { @@ -82,7 +82,7 @@ extern "C" { } backend_dispatch_t forward_fct = apir_backend_dispatch_table[cmd_type]; - uint32_t ret = forward_fct(enc, dec); + uint32_t ret = forward_fct(enc, dec, ctx); *enc_cur_after = enc->cur; diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index cbc181c0089d4..96bbb59fda14c 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -20,7 +20,10 @@ typedef uintptr_t apir_buffer_handle_t; typedef uint32_t (*apir_backend_initialize_t)(void); typedef void (*apir_backend_deinit_t)(void); -typedef uint32_t (*apir_backend_dispatch_t)(uint32_t cmd_type, +struct vn_dispatch_context; +struct virgl_apir_context; + +typedef uint32_t (*apir_backend_dispatch_t)(uint32_t cmd_type, struct virgl_apir_context *ctx, char *dec_cur, const char *dec_end, char *enc_cur, const char *enc_end, char **enc_cur_after @@ -51,3 +54,14 @@ typedef enum ApirBackendCommandType { // last command_type index + 1 APIR_BACKEND_DISPATCH_TABLE_COUNT = 15, } ApirBackendCommandType; + + +struct virgl_apir_callbacks { + void *(*get_shmem_ptr)(struct vn_dispatch_context *ctx, uint32_t res_id); +} ; + +struct virgl_apir_context { + struct vn_dispatch_context *virgl_ctx; + 
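  /* The callbacks let a dispatch handler resolve a guest shmem res_id into a
   * host-mapped pointer without linking against virglrenderer directly.
   * Typical usage inside a handler, as backend_buffer_set_tensor does later in
   * this series (sketch of the call sequence, not a new API):
   *
   *   uint32_t shmem_res_id;
   *   vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id);
   *   void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id);
   *   if (!shmem_data)
   *     FATAL("Couldn't get the shmem addr from virgl :/");
   */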
+ struct virgl_apir_callbacks iface; +}; diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h index 637a0a8368aad..8a73537a45204 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h @@ -117,3 +117,15 @@ static inline void vn_decode_ggml_status(struct vn_cs_decoder *dec, enum ggml_status *status) { vn_cs_decoder_read(dec, sizeof(*status), status, sizeof(*status)); } + +/* vn_renderer_shmem */ + +static inline void +vn_encode_virtgpu_shmem_res_id(struct vn_cs_encoder *enc, uint32_t shmem_res_id) { + vn_encode_uint32_t(enc, &shmem_res_id); +} + +static inline void +vn_decode_virtgpu_shmem_res_id(struct vn_cs_decoder *dec, uint32_t *shmem_res_id) { + vn_decode_uint32_t(dec, shmem_res_id); +} From b815c1a3c5dcb1c41f991857699e7e59b4707389 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 19 May 2025 11:23:31 +0200 Subject: [PATCH 061/117] remotingfrontend: implement buffer_set_tensor with a guest shared page --- .../ggml-backend-buffer.cpp | 15 +++++++++++---- .../virtgpu-forward-buffer.cpp | 17 ++++++++++++++--- ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp | 1 + ggml/src/ggml-remotingfrontend/virtgpu-shm.h | 2 ++ 4 files changed, 28 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index 069886358f6f0..25f6e78436d8c 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -23,14 +23,21 @@ static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buf UNUSED(size); } - static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - BEING_IMPLEMENTED; + IMPLEMENTED_ONCE; struct virtgpu *gpu = BUFFER_TO_GPU(buffer); - +#if 0 INFO("%s: data=%p, offset=%lu, size=%lu\n", __func__, data, offset, size); - +#endif +#if 0 + void **addr = (void **)(uintptr_t)data; + for (int i = 0; i <= 10; i++) { + INFO("%s: %p | %llx", __func__, addr, *addr); + addr++; + } + INFO("\n"); +#endif apir_buffer_set_tensor(gpu, BUFFER_TO_HANDLE(buffer), tensor, data, offset, size); return; diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp index ad65804ab27b0..dc991f84c07cc 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp @@ -21,20 +21,29 @@ apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle) { return (void *) base; } + void apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, ggml_tensor *tensor, const void *data, size_t offset, size_t size) { struct vn_cs_encoder *encoder; struct vn_cs_decoder *decoder; - +#if 0 INFO("Calling (%p)->set_tensor(tensor=%p, data=%p, offset=%lu, size=%lu", buffer_handle, tensor, data, offset, size); - +#endif REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_SET_TENSOR); vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); vn_encode_ggml_tensor(encoder, tensor); - vn_encode_uintptr_t(encoder, (uintptr_t *) &data); + + struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, size); + if (!shmem) { + FATAL("Couldn't allocate the guest-host shared buffer :/"); + } + + memcpy(shmem->mmap_ptr, data, size); + 
vn_encode_virtgpu_shmem_res_id(encoder, shmem->res_id); + vn_encode_size_t(encoder, &offset); vn_encode_size_t(encoder, &size); @@ -42,5 +51,7 @@ apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, REMOTE_CALL_FINISH(gpu, encoder, decoder); + virtgpu_shmem_destroy(gpu, shmem->shmem); + return; } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp index bd1568add1752..d5e602c97be66 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp @@ -102,6 +102,7 @@ virtgpu_shmem_create(struct virtgpu *gpu, size_t size) shmem->base.mmap_ptr = ptr; shmem->base.refcount.count = 1; shmem->base.gem_handle = gem_handle; + shmem->base.shmem = shmem; return &shmem->base; } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-shm.h b/ggml/src/ggml-remotingfrontend/virtgpu-shm.h index 3bdc5ca700f1b..e5770b1916886 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-shm.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-shm.h @@ -25,6 +25,8 @@ struct vn_renderer_shmem { int64_t cache_timestamp; uint32_t gem_handle; + + struct virtgpu_shmem *shmem; }; struct vn_renderer_shmem *virtgpu_shmem_create(struct virtgpu *gpu, size_t size); From b24fbe79623cca592f26293f999b7e1a527afed3 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 19 May 2025 11:23:54 +0200 Subject: [PATCH 062/117] ggml: src: ggml-remotingbackend/backend-dispatched-buffer: implement buffer_set_tensor with the guest shared page --- .../backend-dispatched-buffer.cpp | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp index ff35a492cf100..c217cecbd2aa0 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp @@ -32,8 +32,8 @@ backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, // safe to remove the const qualifier here tensor = (ggml_tensor *) (uintptr_t) vn_decode_ggml_tensor_inplace(dec); - void *data; - vn_decode_uintptr_t(dec, (uintptr_t *) &data); + uint32_t shmem_res_id; + vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); size_t offset; vn_decode_size_t(dec, &offset); @@ -41,10 +41,26 @@ backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, size_t size; vn_decode_size_t(dec, &size); + void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id); + + if (!shmem_data) { + FATAL("Couldn't get the shmem addr from virgl :/"); + } + +#if 0 INFO("Calling (%p)->set_tensor(tensor=%p, data=%p, offset=%lu, size=%lu", - buffer, tensor, data, offset, size); + buffer, tensor, shmem_data, offset, size); +#endif +#if 0 + void **addr = (void **)(uintptr_t)shmem_data; + for (int i = 0; i <= 10; i++) { + INFO("%s: %p | %llx", __func__, addr, *addr); + addr++; + } + INFO("\n"); +#endif - //buffer->iface.set_tensor(buffer, tensor, data, offset, size); + buffer->iface.set_tensor(buffer, tensor, shmem_data, offset, size); return 0; } From 3a20164a1f163a58454fda5df2c8dee324613d42 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 19 May 2025 11:24:16 +0200 Subject: [PATCH 063/117] remotingfrontend: add more STOP_HERE calls --- ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp 
b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index 25f6e78436d8c..847a61297be8b 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -46,6 +46,8 @@ static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { NOT_IMPLEMENTED; + STOP_HERE; + UNUSED(buffer); UNUSED(tensor); UNUSED(data); @@ -57,6 +59,8 @@ static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { NOT_IMPLEMENTED; + STOP_HERE; + return true; UNUSED(buffer); @@ -76,6 +80,8 @@ static void ggml_remoting_buffer_memset(remoting_buffer& dst, size_t offset, uin static void ggml_remoting_buffer_memset_async(remoting_context& ctx, remoting_buffer& dst, size_t offset, uint32_t c, size_t size) { NOT_IMPLEMENTED; + STOP_HERE; + UNUSED(ctx); UNUSED(dst); UNUSED(c); @@ -88,12 +94,16 @@ static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uin UNUSED(value); NOT_IMPLEMENTED; + + STOP_HERE; } static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffer) { UNUSED(buffer); NOT_IMPLEMENTED; + + STOP_HERE; } const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { From c5608716989673f9d8cce82f2734917801cf2a78 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 19 May 2025 11:24:30 +0200 Subject: [PATCH 064/117] remotingfrontend: add IMPLEMENTED_ONCE --- ggml/src/ggml-remotingfrontend/ggml-remoting.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index dc184c300f24d..ecc1e98217378 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -32,9 +32,18 @@ #define STOP_HERE \ thks_bye() -#define IMPLEMENTED \ +#define IMPLEMENTED \ printf("INFO: ### reached implemented function %s\n", __func__) +#define IMPLEMENTED_ONCE \ + do { \ + static bool first = true; \ + if (first) { \ + printf("INFO: ### reached implemented function %s\n", __func__); \ + first = false; \ + } \ + } while(0) + #define RMT_LOG_DEBUG(msg) std::cerr << msg << std::endl struct ggml_backend_remoting_device_context { From 142924b3d173df4c473816610d40c2e44425d02a Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 19 May 2025 11:24:43 +0200 Subject: [PATCH 065/117] ggml: src: ggml-remotingfrontend/virtgpu-shm: reduce the verbosity --- ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp index d5e602c97be66..935b1028d2ab0 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp @@ -53,7 +53,7 @@ virtgpu_ioctl_map(struct virtgpu *gpu, uint32_t gem_handle, size_t size) .handle = gem_handle, .pad = 0, }; - printf("virtgpu_ioctl_map(%ld)\n", size); + if (virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_MAP, &args)) return NULL; @@ -61,7 +61,7 @@ virtgpu_ioctl_map(struct virtgpu *gpu, uint32_t gem_handle, size_t size) args.offset); if (ptr == MAP_FAILED) return NULL; - printf("virtgpu_ioctl_map(%ld) --> %p | %p\n", size, ptr, *(void 
**)ptr); + return ptr; } From 9913b7f34e075348e43ccf887d696fdca9da3046 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 19 May 2025 14:54:30 +0200 Subject: [PATCH 066/117] ggml: src: ggml-remotingfrontend/ggml-backend-reg: refactor to untight the reg and the device --- .../ggml-backend-reg.cpp | 40 ++++++++++++------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp index eeac6c59db670..8b5eb5bbb189b 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp @@ -40,24 +40,27 @@ static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) IMPLEMENTED; - struct virtgpu *gpu = apir_initialize(); - if (!gpu) { - WARNING("apir_initialize failed :/"); - return 0; - } - - return apir_device_get_count(gpu); + return ggml_backend_remoting_get_device_count(); } -static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) { - static std::vector devices; +static std::vector devices; +ggml_backend_dev_t ggml_backend_remoting_get_device(size_t device) { + GGML_ASSERT(device < devices.size()); + return devices[device]; +} + +static void ggml_backend_remoting_reg_init_devices(ggml_backend_reg_t reg) { IMPLEMENTED; + if (devices.size() > 0) { + INFO("%s: already initialized\n", __func__); + } + struct virtgpu *gpu = apir_initialize(); if (!gpu) { - WARNING("apir_initialize failed :/"); - return 0; + FATAL("apir_initialize failed :/"); + return; } static bool initialized = false; @@ -67,7 +70,7 @@ static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_ std::lock_guard lock(mutex); if (!initialized) { - for (size_t i = 0; i < ggml_backend_remoting_reg_get_device_count(reg); i++) { + for (int i = 0; i < ggml_backend_remoting_get_device_count(); i++) { ggml_backend_remoting_device_context *ctx = new ggml_backend_remoting_device_context; char desc[256] = "API Remoting device"; @@ -85,9 +88,14 @@ static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_ initialized = true; } } +} - GGML_ASSERT(device < devices.size()); - return devices[device]; +static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) { + UNUSED(reg); + + IMPLEMENTED; + + return ggml_backend_remoting_get_device(device); } static const char *ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) { @@ -109,6 +117,7 @@ ggml_backend_reg_t ggml_backend_remoting_frontend_reg() { FATAL("apir_initialize failed :/"); return NULL; } + static ggml_backend_reg reg = { /* .api_version = */ GGML_BACKEND_API_VERSION, /* .iface = */ ggml_backend_remoting_reg_i, @@ -116,5 +125,8 @@ ggml_backend_reg_t ggml_backend_remoting_frontend_reg() { }; RMT_LOG_DEBUG("ggml_backend_remoting_frontend_reg() hello :wave:"); + + ggml_backend_remoting_reg_init_devices(®); + return ® } From ede86288480281adef9ea8fff8685d88e7398e93 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 19 May 2025 14:57:13 +0200 Subject: [PATCH 067/117] ggml: src: ggml-remotingfrontend/ggml-remoting: remove draft code --- ggml/src/ggml-remotingfrontend/ggml-remoting.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index ecc1e98217378..0d8912741ba0b 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ 
b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -61,8 +61,6 @@ struct ggml_backend_remoting_buffer_context { }; static inline apir_buffer_handle_t ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) { - -// return buffer?0:1; struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) buffer->context; return context->handle; From 1927cf0a4a4cd71cee7858a1c0d77b729d1a292c Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 19 May 2025 15:13:45 +0200 Subject: [PATCH 068/117] remotingfrontend: add host buffer memory allocation --- .../ggml-backend-host-buffer-type.cpp | 56 +++++++++++++++++-- .../src/ggml-remotingfrontend/ggml-remoting.h | 6 ++ 2 files changed, 57 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp index 847a1ec0500fc..faf051fcc8e3a 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp @@ -5,18 +5,64 @@ extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface; +static void +ggml_backend_remoting_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { + BEING_IMPLEMENTED; + + void *ptr = buffer->context; + + if (ptr == nullptr) { + return; + } + struct ggml_backend_remoting_device_context *device_ctx = GET_DEVICE_CONTEXT(); + + struct vn_renderer_shmem *shmem; + size_t index; + + for (size_t i = 0; i < device_ctx->shared_memory.size(); i++) { + const uint8_t* addr = (const uint8_t*) std::get<0>(device_ctx->shared_memory[i]) /* ptr */; + const uint8_t* endr = addr + std::get<1>(device_ctx->shared_memory[i]) /* size */; + if (ptr >= addr && ptr < endr) { + shmem = std::get<2>(device_ctx->shared_memory[i]) /* shmem */; + index = i; + break; + } + } + + if (shmem == nullptr) { + WARNING("failed to free host shared memory: memory not in map\n"); + return; + } + + virtgpu_shmem_destroy(device_ctx->gpu, shmem->shmem); + + device_ctx->shared_memory.erase(device_ctx->shared_memory.begin() + index); +} + static ggml_backend_buffer_t ggml_backend_remoting_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { BEING_IMPLEMENTED; struct virtgpu *gpu = BUFT_TO_GPU(buft); - UNUSED(gpu); - void *ctx = NULL; + struct ggml_backend_remoting_device_context *device_ctx = GET_DEVICE_CONTEXT(); - NOT_IMPLEMENTED; + size += 32; // Behave like the CPU buffer type (dixit ggml-vulkan) - STOP_HERE; - return ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, ctx, size); + struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, size); + + if (!shmem) { + FATAL("Couldn't allocate the guest-host shared host buffer :/"); + } + + void *ptr = shmem->mmap_ptr; + + device_ctx->shared_memory.push_back(std::make_tuple(ptr, size, shmem)); + + ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); + buffer->buft = buft; + buffer->iface.free_buffer = ggml_backend_remoting_host_buffer_free_buffer; + + return buffer; } static const char * diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index 0d8912741ba0b..8715e60209e8c 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -13,6 +13,9 @@ #define BUFFER_TO_HANDLE(name) \ ((struct ggml_backend_remoting_buffer_context *) (name)->context)->handle +#define GET_DEVICE_CONTEXT() \ + (struct 
ggml_backend_remoting_device_context *) ggml_backend_remoting_get_device(0)->context \ + #define NOT_IMPLEMENTED \ do { \ static bool first = true; \ @@ -51,6 +54,8 @@ struct ggml_backend_remoting_device_context { std::string name; std::string description; + std::vector> shared_memory; + struct virtgpu *gpu; }; @@ -71,6 +76,7 @@ extern const struct ggml_backend_device_i ggml_backend_remoting_device_interface extern const ggml_backend_buffer_type_i ggml_backend_remoting_host_buffer_type_interface; extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface; +ggml_backend_dev_t ggml_backend_remoting_get_device(size_t device); ggml_backend_buffer_type_t ggml_backend_remoting_host_buffer_type(); ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params); ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev); From d3541665a8739870f70f4781240ed03ad134d9ad Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 20 May 2025 09:38:31 +0200 Subject: [PATCH 069/117] remoting: add clear buffer and get_tensor --- .../backend-dispatched-buffer.cpp | 53 ++++++++++++++++++- .../ggml-remotingbackend/backend-dispatched.h | 6 +++ .../shared/apir_backend.h | 4 +- .../ggml-remotingbackend/shared/venus_cs.h | 14 +++++ .../shared/venus_cs_ggml.h | 40 ++++++++++---- .../ggml-backend-buffer.cpp | 43 ++++----------- .../ggml-backend-device.cpp | 29 ++++++---- .../ggml-backend-host-buffer-type.cpp | 12 ++--- .../ggml-backend-reg.cpp | 2 +- .../ggml-remotingfrontend/ggml-backend.cpp | 3 +- .../virtgpu-forward-buffer.cpp | 44 ++++++++++++++- .../ggml-remotingfrontend/virtgpu-forward.h | 4 ++ 12 files changed, 189 insertions(+), 65 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp index c217cecbd2aa0..8dfce029af40e 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp @@ -52,7 +52,7 @@ backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, buffer, tensor, shmem_data, offset, size); #endif #if 0 - void **addr = (void **)(uintptr_t)shmem_data; + void **addr = (void **)(uintptr_t) shmem_data; for (int i = 0; i <= 10; i++) { INFO("%s: %p | %llx", __func__, addr, *addr); addr++; @@ -64,3 +64,54 @@ backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, return 0; } + +uint32_t +backend_buffer_get_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(enc); + + ggml_backend_buffer_t buffer; + buffer = vn_decode_ggml_buffer(dec); + + ggml_tensor *tensor; + // safe to remove the const qualifier here + tensor = (ggml_tensor *) (uintptr_t) vn_decode_ggml_tensor_inplace(dec); + + uint32_t shmem_res_id; + vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); + + size_t offset; + vn_decode_size_t(dec, &offset); + + size_t size; + vn_decode_size_t(dec, &size); + + void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id); + if (!shmem_data) { + FATAL("Couldn't get the shmem addr from virgl :/"); + } + + INFO("GET_TENSOR"); + + UNUSED(buffer); + UNUSED(tensor); + buffer->iface.get_tensor(buffer, tensor, shmem_data, offset, size); + + return 0; +} + +uint32_t +backend_buffer_clear(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(enc); + + ggml_backend_buffer_t buffer; + buffer = 
vn_decode_ggml_buffer(dec); + + uint8_t value; + vn_decode_uint8_t(dec, &value); + + buffer->iface.clear(buffer, value); + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h index faa3dacfc2297..76f1bb8a647b8 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.h +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -38,6 +38,8 @@ uint32_t backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_c /* buffer */ uint32_t backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); uint32_t backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_get_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_clear(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); static inline const char *backend_dispatch_command_name(ApirBackendCommandType type) { @@ -62,6 +64,8 @@ static inline const char *backend_dispatch_command_name(ApirBackendCommandType t /* buffer */ case APIR_COMMAND_TYPE_BUFFER_GET_BASE: return "backend_buffer_get_base"; case APIR_COMMAND_TYPE_BUFFER_SET_TENSOR: return "backend_buffer_set_tensor"; + case APIR_COMMAND_TYPE_BUFFER_GET_TENSOR: return "backend_buffer_get_tensor"; + case APIR_COMMAND_TYPE_BUFFER_CLEAR: return "backend_buffer_clear"; default: return "unknown"; } @@ -88,4 +92,6 @@ static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATC /* buffer */ [APIR_COMMAND_TYPE_BUFFER_GET_BASE] = backend_buffer_get_base, [APIR_COMMAND_TYPE_BUFFER_SET_TENSOR] = backend_buffer_set_tensor, + [APIR_COMMAND_TYPE_BUFFER_GET_TENSOR] = backend_buffer_get_tensor, + [APIR_COMMAND_TYPE_BUFFER_CLEAR] = backend_buffer_clear, }; diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 96bbb59fda14c..f3eff8874ed90 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -50,9 +50,11 @@ typedef enum ApirBackendCommandType { /* buffer */ APIR_COMMAND_TYPE_BUFFER_GET_BASE = 13, APIR_COMMAND_TYPE_BUFFER_SET_TENSOR = 14, + APIR_COMMAND_TYPE_BUFFER_GET_TENSOR = 15, + APIR_COMMAND_TYPE_BUFFER_CLEAR = 16, // last command_type index + 1 - APIR_BACKEND_DISPATCH_TABLE_COUNT = 15, + APIR_BACKEND_DISPATCH_TABLE_COUNT = 17, } ApirBackendCommandType; diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h index c8149a5b58a29..82c4091fded09 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -116,6 +116,20 @@ vn_encode(struct vn_cs_encoder *enc, size_t size, const void *data, size_t data_ * typed encode/decode */ +/* uint8_t */ + +static inline void +vn_encode_uint8_t(struct vn_cs_encoder *enc, const uint8_t *val) +{ + vn_encode(enc, sizeof(int), val, sizeof(*val)); +} + +static inline void +vn_decode_uint8_t(struct vn_cs_decoder *dec, uint8_t *val) +{ + vn_decode(dec, sizeof(int), val, sizeof(*val)); +} + /* uint64_t */ static inline size_t diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h index 8a73537a45204..2aa87b62fb338 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h +++ 
b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h @@ -14,16 +14,17 @@ static inline void vn_encode_ggml_tensor(struct vn_cs_encoder *enc, const ggml_tensor *tensor) { size_t tensor_size = sizeof(*tensor); - if (tensor->view_src) { - FATAL("Cannot pass tensors with view_src"); - } if (tensor->extra) { FATAL("Cannot pass tensors with extra"); } if (tensor->src[0] && tensor->buffer) { - // not sure if the buffer needs to be updated inside the src tensors or not - FATAL("Cannot pass tensors with src and buffer"); + static int first = 1; + if (first) { + // not sure if the buffer needs to be updated inside the src tensors or not + WARNING("Cannot pass tensors with src and buffer"); + first = 0; + } } vn_cs_encoder_write(enc, tensor_size, tensor, tensor_size); @@ -37,9 +38,20 @@ vn_encode_ggml_tensor(struct vn_cs_encoder *enc, const ggml_tensor *tensor) { vn_encode_ggml_buffer_handle(enc, &buffer_handle); } + if (tensor->view_src) { + vn_cs_encoder_write(enc, tensor_size, tensor->view_src, tensor_size); + } + for (int i = 0; tensor->src[i]; i++) { - const ggml_tensor *src_tensor = tensor->src[i]; - vn_cs_encoder_write(enc, tensor_size, src_tensor, tensor_size); + const ggml_tensor *tensor_src = tensor->src[i]; + vn_cs_encoder_write(enc, tensor_size, tensor_src, tensor_size); + +#if 0 + if (tensor_src->buffer) { + apir_buffer_handle_t src_buffer_handle = ggml_buffer_to_apir_handle(tensor_src->buffer); + vn_encode_ggml_buffer_handle(enc, &src_buffer_handle); + } +#endif } } @@ -56,9 +68,19 @@ vn_decode_ggml_tensor_inplace(struct vn_cs_decoder *dec) { tensor->buffer = vn_decode_ggml_buffer(dec); } + if (tensor->view_src) { + ggml_tensor *tensor_view_src = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); + tensor->view_src = tensor_view_src; + } + for (int i = 0; tensor->src[i]; i++) { - ggml_tensor *src_tensor = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); - tensor->src[i] = src_tensor; // overwrite op->src[i] pointer with the actual location of the src tensor + ggml_tensor *tensor_src = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); + tensor->src[i] = tensor_src; // overwrite op->src[i] pointer with the actual location of the src tensor +#if 0 + if (tensor_src->buffer) { + tensor_src->buffer = vn_decode_ggml_buffer(dec); + } +#endif } return tensor; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index 847a61297be8b..ed2c749958279 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -44,15 +44,10 @@ static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer } static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { - NOT_IMPLEMENTED; - - STOP_HERE; + IMPLEMENTED_ONCE; + struct virtgpu *gpu = BUFFER_TO_GPU(buffer); - UNUSED(buffer); - UNUSED(tensor); - UNUSED(data); - UNUSED(offset); - UNUSED(size); + apir_buffer_get_tensor(gpu, BUFFER_TO_HANDLE(buffer), tensor, data, offset, size); } @@ -68,34 +63,14 @@ static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer UNUSED(dst); } -static void ggml_remoting_buffer_memset(remoting_buffer& dst, size_t offset, uint32_t c, size_t size) { - BEING_IMPLEMENTED; - - UNUSED(dst); - UNUSED(c); - UNUSED(size); - UNUSED(offset); -} - -static void 
ggml_remoting_buffer_memset_async(remoting_context& ctx, remoting_buffer& dst, size_t offset, uint32_t c, size_t size) { - NOT_IMPLEMENTED; - - STOP_HERE; - - UNUSED(ctx); - UNUSED(dst); - UNUSED(c); - UNUSED(size); - UNUSED(offset); -} - static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - UNUSED(buffer); - UNUSED(value); + IMPLEMENTED; - NOT_IMPLEMENTED; + struct virtgpu *gpu = BUFFER_TO_GPU(buffer); - STOP_HERE; + apir_buffer_clear(gpu, BUFFER_TO_HANDLE(buffer), value); + + return; } static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffer) { @@ -103,7 +78,7 @@ static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffe NOT_IMPLEMENTED; - STOP_HERE; + STOP_HERE; } const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index ef48bd6fae96e..ad8be7d0bc69e 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -5,7 +5,7 @@ static const char * ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) { - IMPLEMENTED; + IMPLEMENTED_ONCE; struct virtgpu *gpu = DEV_TO_GPU(dev); @@ -41,7 +41,7 @@ ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, s static bool ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { - //IMPLEMENTED; + IMPLEMENTED_ONCE; struct virtgpu *gpu = DEV_TO_GPU(dev); @@ -50,24 +50,33 @@ ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tens static bool ggml_backend_remoting_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + IMPLEMENTED_ONCE; + +#if 1 + bool supported = buft->device == dev; + if (!supported) { + //WARNING("%s: unsupported buffer type (%s). 
Double check.", __func__, buft->iface.get_name(buft)); + } + + return supported; +#else UNUSED(dev); UNUSED(buft); - NOT_IMPLEMENTED; - return true; +#endif } static bool ggml_backend_remoting_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { - const int min_batch_size = 32; + IMPLEMENTED_ONCE; - NOT_IMPLEMENTED; + UNUSED(dev); + UNUSED(op); - return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) || - (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID); + // related to supports_buft, need to confirm - UNUSED(dev); + return false; // same as ggml-metal } static void @@ -103,7 +112,7 @@ ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, struct ggml_backe ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { - IMPLEMENTED; + IMPLEMENTED_ONCE; struct virtgpu *gpu = DEV_TO_GPU(dev); diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp index faf051fcc8e3a..a355e9aebbbf9 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp @@ -41,7 +41,7 @@ ggml_backend_remoting_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { static ggml_backend_buffer_t ggml_backend_remoting_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - BEING_IMPLEMENTED; + IMPLEMENTED; struct virtgpu *gpu = BUFT_TO_GPU(buft); struct ggml_backend_remoting_device_context *device_ctx = GET_DEVICE_CONTEXT(); @@ -69,7 +69,7 @@ static const char * ggml_backend_remoting_host_buffer_type_get_name(ggml_backend_buffer_type_t buft) { UNUSED(buft); - IMPLEMENTED; + IMPLEMENTED_ONCE; return "GUEST host buffer"; } @@ -78,18 +78,16 @@ static size_t ggml_backend_remoting_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { UNUSED(buft); - NOT_IMPLEMENTED; - STOP_HERE; + IMPLEMENTED_ONCE; - return 4096; + return 64; // not 100% sure ... 
} static bool ggml_backend_remoting_host_buffer_type_is_host(ggml_backend_buffer_type_t buft) { UNUSED(buft); - IMPLEMENTED; - STOP_HERE; + IMPLEMENTED_ONCE; return true; } diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp index 8b5eb5bbb189b..ca98528fd7e08 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp @@ -54,7 +54,7 @@ static void ggml_backend_remoting_reg_init_devices(ggml_backend_reg_t reg) { IMPLEMENTED; if (devices.size() > 0) { - INFO("%s: already initialized\n", __func__); + INFO("%s: already initialized", __func__); } struct virtgpu *gpu = apir_initialize(); diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp index 4bd321b5fc5c9..97a0e53856df9 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp @@ -3,7 +3,7 @@ static const char * ggml_backend_remoting_get_name(ggml_backend_t backend) { UNUSED(backend); - NOT_IMPLEMENTED; + IMPLEMENTED; return "API Remoting backend"; } @@ -18,6 +18,7 @@ static ggml_status ggml_backend_remoting_graph_compute(ggml_backend_t backend, g UNUSED(backend); UNUSED(cgraph); + NEXT; NOT_IMPLEMENTED; return GGML_STATUS_SUCCESS; diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp index dc991f84c07cc..f6c2ccb33b9e9 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp @@ -21,7 +21,6 @@ apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle) { return (void *) base; } - void apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, ggml_tensor *tensor, const void *data, size_t offset, size_t size) { @@ -55,3 +54,46 @@ apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, return; } + +void +apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, + const ggml_tensor *tensor, void *data, size_t offset, size_t size) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_GET_TENSOR); + + vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); + vn_encode_ggml_tensor(encoder, tensor); + struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, size); + if (!shmem) { + FATAL("Couldn't allocate the guest-host shared buffer :/"); + } + vn_encode_virtgpu_shmem_res_id(encoder, shmem->res_id); + vn_encode_size_t(encoder, &offset); + vn_encode_size_t(encoder, &size); + + REMOTE_CALL(gpu, encoder, decoder); + + memcpy(data, shmem->mmap_ptr, size); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + virtgpu_shmem_destroy(gpu, shmem->shmem); +} + +void +apir_buffer_clear(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, + uint8_t value) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_CLEAR); + + vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); + vn_encode_uint8_t(encoder, &value); + + REMOTE_CALL(gpu, encoder, decoder); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index 2790adbb62454..6511850e666e9 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ 
b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -31,3 +31,7 @@ void *apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_handle_t buffer_hand enum ggml_status apir_buffer_init_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, ggml_tensor *tensor); void apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, ggml_tensor *tensor, const void *data, size_t offset, size_t size); +void apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, + const ggml_tensor *tensor, void *data, size_t offset, size_t size); +void apir_buffer_clear(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, + uint8_t value); From 49bb02063ebb89d20a0c03b876575b818493a005 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 20 May 2025 10:49:32 +0200 Subject: [PATCH 070/117] remoting: add skeleton for graph_compute method --- ggml/src/ggml-remotingbackend/CMakeLists.txt | 1 + .../backend-dispatched-backend.cpp | 21 ++++++++++++++++++ .../ggml-remotingbackend/backend-dispatched.h | 8 +++++++ .../ggml-remotingbackend/backend-internal.h | 1 + .../shared/apir_backend.h | 5 ++++- .../shared/venus_cs_ggml.h | 16 ++++++++++++++ ggml/src/ggml-remotingfrontend/CMakeLists.txt | 1 + .../ggml-backend-device.cpp | 3 --- .../ggml-remotingfrontend/ggml-backend.cpp | 7 +++--- .../src/ggml-remotingfrontend/ggml-remoting.h | 3 +++ .../virtgpu-forward-backend.cpp | 22 +++++++++++++++++++ .../ggml-remotingfrontend/virtgpu-forward.h | 4 ++++ 12 files changed, 85 insertions(+), 7 deletions(-) create mode 100644 ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp create mode 100644 ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp diff --git a/ggml/src/ggml-remotingbackend/CMakeLists.txt b/ggml/src/ggml-remotingbackend/CMakeLists.txt index feca344c90a64..3d7255faf237f 100644 --- a/ggml/src/ggml-remotingbackend/CMakeLists.txt +++ b/ggml/src/ggml-remotingbackend/CMakeLists.txt @@ -6,6 +6,7 @@ message(STATUS "Enable API Remoting backend") ggml_add_backend_library(ggml-remotingbackend backend.cpp backend-dispatched.cpp + backend-dispatched-backend.cpp backend-dispatched-device.cpp backend-dispatched-buffer.cpp backend-dispatched-buffer-type.cpp diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp new file mode 100644 index 0000000000000..9a6fb941469aa --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp @@ -0,0 +1,21 @@ +#include +#include "backend-internal.h" +#include "backend-dispatched.h" + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" + +uint32_t +backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(enc); + + ggml_cgraph *cgraph = vn_decode_ggml_cgraph(dec); + + ggml_status status = bck->iface.graph_compute(bck, cgraph); + + vn_encode_ggml_status(enc, &status); + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h index 76f1bb8a647b8..5464f56baf152 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.h +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -41,6 +41,9 @@ uint32_t backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decod uint32_t backend_buffer_get_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); uint32_t backend_buffer_clear(struct vn_cs_encoder *enc, 
struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +/* backend */ +uint32_t backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); + static inline const char *backend_dispatch_command_name(ApirBackendCommandType type) { switch (type) { @@ -67,6 +70,8 @@ static inline const char *backend_dispatch_command_name(ApirBackendCommandType t case APIR_COMMAND_TYPE_BUFFER_GET_TENSOR: return "backend_buffer_get_tensor"; case APIR_COMMAND_TYPE_BUFFER_CLEAR: return "backend_buffer_clear"; + /* backend */ + case APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE: return "backend_graph_compute"; default: return "unknown"; } } @@ -94,4 +99,7 @@ static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATC [APIR_COMMAND_TYPE_BUFFER_SET_TENSOR] = backend_buffer_set_tensor, [APIR_COMMAND_TYPE_BUFFER_GET_TENSOR] = backend_buffer_get_tensor, [APIR_COMMAND_TYPE_BUFFER_CLEAR] = backend_buffer_clear, + + /* backend */ + [APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE] = backend_graph_compute, }; diff --git a/ggml/src/ggml-remotingbackend/backend-internal.h b/ggml/src/ggml-remotingbackend/backend-internal.h index 5c29e18d4596a..41bc42dbc0e36 100644 --- a/ggml/src/ggml-remotingbackend/backend-internal.h +++ b/ggml/src/ggml-remotingbackend/backend-internal.h @@ -8,6 +8,7 @@ extern ggml_backend_reg_t reg; extern ggml_backend_dev_t dev; +extern ggml_backend_t bck; #define NOT_IMPLEMENTED \ do { \ diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index f3eff8874ed90..2608909b7541e 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -53,8 +53,11 @@ typedef enum ApirBackendCommandType { APIR_COMMAND_TYPE_BUFFER_GET_TENSOR = 15, APIR_COMMAND_TYPE_BUFFER_CLEAR = 16, + /* backend */ + APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE = 17, + // last command_type index + 1 - APIR_BACKEND_DISPATCH_TABLE_COUNT = 17, + APIR_BACKEND_DISPATCH_TABLE_COUNT = 18, } ApirBackendCommandType; diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h index 2aa87b62fb338..c769d9d860a20 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h @@ -151,3 +151,19 @@ static inline void vn_decode_virtgpu_shmem_res_id(struct vn_cs_decoder *dec, uint32_t *shmem_res_id) { vn_decode_uint32_t(dec, shmem_res_id); } + +/* ggml_cgraph */ + +static inline void +vn_encode_ggml_cgraph(struct vn_cs_encoder *enc, ggml_cgraph *cgraph) { + UNUSED(enc); + UNUSED(cgraph); + +} + +static inline ggml_cgraph * +vn_decode_ggml_cgraph(struct vn_cs_decoder *dec) { + UNUSED(dec); + + return NULL; +} diff --git a/ggml/src/ggml-remotingfrontend/CMakeLists.txt b/ggml/src/ggml-remotingfrontend/CMakeLists.txt index b77a0254a7a6c..a13f48b7ef81b 100644 --- a/ggml/src/ggml-remotingfrontend/CMakeLists.txt +++ b/ggml/src/ggml-remotingfrontend/CMakeLists.txt @@ -16,6 +16,7 @@ ggml_add_backend_library(ggml-remotingfrontend virtgpu-forward-device.cpp virtgpu-forward-buffer-type.cpp virtgpu-forward-buffer.cpp + virtgpu-forward-backend.cpp virtgpu-forward-impl.h ../../include/ggml-remoting-frontend.h ) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index ad8be7d0bc69e..6d51643962d80 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ 
b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -1,8 +1,5 @@ #include "ggml-remoting.h" -#define DEV_TO_GPU(name) \ - ((struct ggml_backend_remoting_device_context *) (name)->context)->gpu - static const char * ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) { IMPLEMENTED_ONCE; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp index 97a0e53856df9..190fde76a30bc 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp @@ -18,10 +18,11 @@ static ggml_status ggml_backend_remoting_graph_compute(ggml_backend_t backend, g UNUSED(backend); UNUSED(cgraph); - NEXT; - NOT_IMPLEMENTED; + struct virtgpu *gpu = DEV_TO_GPU(backend->device); - return GGML_STATUS_SUCCESS; + BEING_IMPLEMENTED; + + return apir_backend_graph_compute(gpu, cgraph); } static ggml_backend_i ggml_backend_remoting_interface = { diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index 8715e60209e8c..d51afbe19dc78 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -10,6 +10,9 @@ #include "ggml-backend.h" #include "virtgpu.h" +#define DEV_TO_GPU(name) \ + ((struct ggml_backend_remoting_device_context *) (name)->context)->gpu + #define BUFFER_TO_HANDLE(name) \ ((struct ggml_backend_remoting_buffer_context *) (name)->context)->handle diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp new file mode 100644 index 0000000000000..4a8214265209c --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp @@ -0,0 +1,22 @@ +#include "virtgpu-forward-impl.h" + +ggml_status +apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph) { + ggml_status status; + UNUSED(cgraph); + + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE); + + vn_encode_ggml_cgraph(encoder, cgraph); + + REMOTE_CALL(gpu, encoder, decoder); + + vn_decode_ggml_status(decoder, &status); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return status; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index 6511850e666e9..d59cd754eb803 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -35,3 +35,7 @@ void apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_han const ggml_tensor *tensor, void *data, size_t offset, size_t size); void apir_buffer_clear(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, uint8_t value); + +/* backend */ + +ggml_status apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph); From 8edd5e6a9af276a20ec39d908a5b13147e2d394c Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 20 May 2025 11:55:16 +0200 Subject: [PATCH 071/117] remoting: continue the compute_graph skeleton --- .../backend-dispatched-backend.cpp | 14 ++++++- .../ggml-remotingbackend/shared/venus_cs.h | 25 ++++++++++++ .../shared/venus_cs_ggml.h | 40 +++++++++++++++++-- .../virtgpu-forward-backend.cpp | 14 ++++++- 4 files changed, 87 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp index 9a6fb941469aa..60f5708528d12 100644 --- 
a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp @@ -11,7 +11,19 @@ backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, stru UNUSED(ctx); UNUSED(enc); - ggml_cgraph *cgraph = vn_decode_ggml_cgraph(dec); + uint32_t shmem_res_id; + vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); + + const void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id); + if (!shmem_data) { + FATAL("Couldn't get the shmem addr from virgl :/"); + } + size_t shmem_size; + vn_decode_size_t(dec, &shmem_size); + + struct vn_cs_decoder secondary_dec = vn_cs_new_decoder((const char *) shmem_data, shmem_size); + + ggml_cgraph *cgraph = vn_decode_ggml_cgraph(dec, &secondary_dec); ggml_status status = bck->iface.graph_compute(bck, cgraph); diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h index 82c4091fded09..a780cb6e40fda 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -20,6 +20,31 @@ struct vn_cs_decoder { const char* end; }; +/* + * new encoder and decoder + */ + +static struct vn_cs_decoder +vn_cs_new_decoder(const char *ptr, size_t size) { + struct vn_cs_decoder dec = { + .cur = ptr, + .end = ptr + size, + }; + + return dec; +} + +static struct vn_cs_encoder +vn_cs_new_encoder(char *ptr, size_t size) { + struct vn_cs_encoder enc = { + .cur = ptr, + .start = ptr, + .end = ptr + size, + }; + + return enc; +} + /* * encode peek */ diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h index c769d9d860a20..3a72b906a2634 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h @@ -154,16 +154,48 @@ vn_decode_virtgpu_shmem_res_id(struct vn_cs_decoder *dec, uint32_t *shmem_res_id /* ggml_cgraph */ +static inline size_t +vn_encode_sizeof_ggml_cgraph(ggml_cgraph *cgraph) { + return sizeof(*cgraph); +} + static inline void -vn_encode_ggml_cgraph(struct vn_cs_encoder *enc, ggml_cgraph *cgraph) { +vn_encode_ggml_cgraph(struct vn_cs_encoder *enc, ggml_cgraph *cgraph, struct vn_cs_encoder *secondary_enc) { UNUSED(enc); UNUSED(cgraph); + if (cgraph->n_leafs) { + FATAL("Cannot pass cgraphs with leaves"); + } + if (cgraph->size) { + FATAL("Cannot pass cgraphs with size"); + } + if (cgraph->grads) { + FATAL("Cannot pass cgraphs with grads"); + } + if (cgraph->grad_accs) { + FATAL("Cannot pass cgraphs with grad_accs"); + } + if (cgraph->visited_hash_set.size || cgraph->visited_hash_set.used || cgraph->visited_hash_set.keys) { + FATAL("Cannot pass cgraphs with visited_hash_set"); + } + + if (!secondary_enc) { + return; + } + + size_t cgraph_size = sizeof(*cgraph); + vn_cs_encoder_write(enc, cgraph_size, cgraph, cgraph_size); } static inline ggml_cgraph * -vn_decode_ggml_cgraph(struct vn_cs_decoder *dec) { - UNUSED(dec); +vn_decode_ggml_cgraph(struct vn_cs_decoder *dec, struct vn_cs_decoder *secondary_dec) { + // it safe to remove the `const` qualifier here, we *do* want to + // modify the shared memory data to fix the `src` pointers. 
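  /*
   * Sketch of the intended flow, inferred from backend_graph_compute() in
   * backend-dispatched-backend.cpp earlier in this patch (an assumption, not a
   * separate API): the host resolves the shmem_res_id it received, wraps the
   * mapping in a decoder, and passes it here as `secondary_dec`:
   *
   *   const void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id);
   *   struct vn_cs_decoder secondary_dec =
   *       vn_cs_new_decoder((const char *) shmem_data, shmem_size);
   *
   * Everything "used in place" below therefore points into that guest-host
   * shared mapping and stays valid only while the mapping does.
   */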
+ ggml_cgraph *cgraph = (ggml_cgraph *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_cgraph)); - return NULL; + if (!secondary_dec) { + return NULL; + } + return cgraph; } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp index 4a8214265209c..1ce0ad0280c29 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp @@ -10,7 +10,17 @@ apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph) { REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE); - vn_encode_ggml_cgraph(encoder, cgraph); + size_t size = vn_encode_sizeof_ggml_cgraph(cgraph); + struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, size); + if (!shmem) { + FATAL("Couldn't allocate the guest-host shared buffer :/"); + } + vn_encode_size_t(encoder, &size); + + char *shmem_data = (char *) shmem->mmap_ptr; + struct vn_cs_encoder secondary_enc = vn_cs_new_encoder(shmem_data, size); + + vn_encode_ggml_cgraph(encoder, cgraph, &secondary_enc); REMOTE_CALL(gpu, encoder, decoder); @@ -18,5 +28,7 @@ apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph) { REMOTE_CALL_FINISH(gpu, encoder, decoder); + virtgpu_shmem_destroy(gpu, shmem->shmem); + return status; } From 372e6d06798e74132bc9d8d4752dd203a82da8f0 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 20 May 2025 15:11:17 +0200 Subject: [PATCH 072/117] Continue the skeleton --- .../backend-dispatched-backend.cpp | 9 ++- .../ggml-remotingbackend/shared/venus_cs.h | 1 + .../shared/venus_cs_ggml.h | 62 ++++++++++++++++--- .../virtgpu-forward-backend.cpp | 8 ++- 4 files changed, 66 insertions(+), 14 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp index 60f5708528d12..72c01c7fa2777 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp @@ -13,19 +13,22 @@ backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, stru uint32_t shmem_res_id; vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); - + INFO("Receive shmem id %d", shmem_res_id); const void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id); if (!shmem_data) { FATAL("Couldn't get the shmem addr from virgl :/"); } size_t shmem_size; vn_decode_size_t(dec, &shmem_size); - + INFO("Receive shmem size %ld", shmem_size); struct vn_cs_decoder secondary_dec = vn_cs_new_decoder((const char *) shmem_data, shmem_size); ggml_cgraph *cgraph = vn_decode_ggml_cgraph(dec, &secondary_dec); - ggml_status status = bck->iface.graph_compute(bck, cgraph); + ggml_status status = GGML_STATUS_SUCCESS; + status = bck->iface.graph_compute(bck, cgraph); + + INFO("Send status %u", status); vn_encode_ggml_status(enc, &status); diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h index a780cb6e40fda..510cbd6fcce66 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -90,6 +90,7 @@ vn_cs_decoder_use_inplace(struct vn_cs_decoder *dec, return addr; } + /* * read/write */ diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h index 3a72b906a2634..e254228017531 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h +++ 
b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h @@ -86,6 +86,23 @@ vn_decode_ggml_tensor_inplace(struct vn_cs_decoder *dec) { return tensor; } +/* tensor array */ + +static inline void +vn_encode_ggml_tensor_array(struct vn_cs_encoder *enc, ggml_tensor **addr, size_t count) +{ + size_t buffer_size = sizeof(*addr) * count; + vn_encode(enc, buffer_size, addr, buffer_size); +} + +static inline ggml_tensor ** +vn_decode_ggml_tensor_array_inplace(struct vn_cs_decoder *dec, size_t count) +{ + size_t buffer_size = sizeof(ggml_tensor*) * count; + + return (ggml_tensor **)(uintptr_t) vn_cs_decoder_use_inplace(dec, buffer_size); +} + /* *** ggml_backend_buffer_type_t *** */ // ggml_backend_buffer_type_t is a POINTER (to a struct). @@ -132,7 +149,7 @@ vn_decode_ggml_buffer(struct vn_cs_decoder *dec) { static inline void vn_encode_ggml_status(struct vn_cs_encoder *enc, const enum ggml_status *status) { - vn_cs_encoder_write(enc, sizeof(*status), &status, sizeof(*status)); + vn_cs_encoder_write(enc, sizeof(*status), status, sizeof(*status)); } static inline void @@ -155,8 +172,29 @@ vn_decode_virtgpu_shmem_res_id(struct vn_cs_decoder *dec, uint32_t *shmem_res_id /* ggml_cgraph */ static inline size_t -vn_encode_sizeof_ggml_cgraph(ggml_cgraph *cgraph) { - return sizeof(*cgraph); +vn_encode_sizeof_ggml_cgraph_data(ggml_cgraph *cgraph) { + /* must match the encoding of vn_encode_ggml_cgraph and vn_encode_ggml_tensor */ + size_t size = 0; + + size += sizeof(ggml_tensor*) * cgraph->n_nodes; + + size_t tensor_size = sizeof(ggml_tensor); + INFO("tensor_size: %lu", tensor_size); + size += tensor_size * cgraph->n_nodes; + + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor *tensor = cgraph->nodes[i]; + if (tensor->buffer) { + size += sizeof(apir_buffer_handle_t); + } + if (tensor->view_src) { + size += tensor_size; + } + for (int j = 0; tensor->src[j]; j++) { + size += tensor_size; + } + } + return size; } static inline void @@ -180,12 +218,15 @@ vn_encode_ggml_cgraph(struct vn_cs_encoder *enc, ggml_cgraph *cgraph, struct vn_ FATAL("Cannot pass cgraphs with visited_hash_set"); } - if (!secondary_enc) { - return; - } - size_t cgraph_size = sizeof(*cgraph); vn_cs_encoder_write(enc, cgraph_size, cgraph, cgraph_size); + + vn_encode_ggml_tensor_array(secondary_enc, cgraph->nodes, cgraph->n_nodes); + + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor *tensor = cgraph->nodes[i]; + vn_encode_ggml_tensor(secondary_enc, tensor); + } } static inline ggml_cgraph * @@ -194,8 +235,11 @@ vn_decode_ggml_cgraph(struct vn_cs_decoder *dec, struct vn_cs_decoder *secondary // modify the shared memory data to fix the `src` pointers. 
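  /*
   * Layout note, inferred from vn_encode_ggml_cgraph() above (a sketch, not a
   * normative wire format): the ggml_cgraph struct itself travels on the
   * primary stream, while the secondary (shmem) stream holds, back to back,
   *
   *   ggml_tensor *nodes[n_nodes];                    // the node pointer array
   *   <node 0 as written by vn_encode_ggml_tensor()>  // struct + buffer handle
   *   <node 1 ...>                                    // + view_src/src copies
   *
   * which is why the decoder below first re-points cgraph->nodes at the
   * in-place array, then overwrites each entry with the address of the
   * corresponding tensor copy decoded from the same stream.
   */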
ggml_cgraph *cgraph = (ggml_cgraph *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_cgraph)); - if (!secondary_dec) { - return NULL; + cgraph->nodes = vn_decode_ggml_tensor_array_inplace(secondary_dec, cgraph->n_nodes); + + for (int i = 0; i < cgraph->n_nodes; i++) { + cgraph->nodes[i] = (ggml_tensor *)(uintptr_t) vn_decode_ggml_tensor_inplace(secondary_dec); } + return cgraph; } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp index 1ce0ad0280c29..8d18c18f8e2b5 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp @@ -2,7 +2,6 @@ ggml_status apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph) { - ggml_status status; UNUSED(cgraph); struct vn_cs_encoder *encoder; @@ -10,11 +9,14 @@ apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph) { REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE); - size_t size = vn_encode_sizeof_ggml_cgraph(cgraph); + size_t size = vn_encode_sizeof_ggml_cgraph_data(cgraph); struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, size); if (!shmem) { FATAL("Couldn't allocate the guest-host shared buffer :/"); } + INFO("Send shmem ID %d", shmem->res_id); + vn_encode_virtgpu_shmem_res_id(encoder, shmem->res_id); + INFO("Send shmem size %lu", size); vn_encode_size_t(encoder, &size); char *shmem_data = (char *) shmem->mmap_ptr; @@ -24,7 +26,9 @@ apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph) { REMOTE_CALL(gpu, encoder, decoder); + ggml_status status = GGML_STATUS_ABORTED; vn_decode_ggml_status(decoder, &status); + INFO("Received status %u", status); REMOTE_CALL_FINISH(gpu, encoder, decoder); From 1a82665297f8d7760f74ec96f3c05292acf04166 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 20 May 2025 15:44:40 +0200 Subject: [PATCH 073/117] remoting: recursively encode/decode the tensors --- .../backend-dispatched-buffer.cpp | 4 +- .../backend-dispatched-device.cpp | 2 +- .../shared/apir_backend.h | 5 + .../ggml-remotingbackend/shared/venus_cs.h | 7 +- .../shared/venus_cs_ggml.h | 107 ++++++++---------- .../src/ggml-remotingfrontend/ggml-remoting.h | 3 + .../virtgpu-forward-buffer.cpp | 4 +- .../virtgpu-forward-device.cpp | 3 +- 8 files changed, 69 insertions(+), 66 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp index 8dfce029af40e..d181937f55256 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp @@ -30,7 +30,7 @@ backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, ggml_tensor *tensor; // safe to remove the const qualifier here - tensor = (ggml_tensor *) (uintptr_t) vn_decode_ggml_tensor_inplace(dec); + tensor = (ggml_tensor *) (uintptr_t) vn_decode_ggml_tensor_inplace(dec, TENSOR_MAX_DEPTH_BUFFER_SET_TENSOR); uint32_t shmem_res_id; vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); @@ -75,7 +75,7 @@ backend_buffer_get_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, ggml_tensor *tensor; // safe to remove the const qualifier here - tensor = (ggml_tensor *) (uintptr_t) vn_decode_ggml_tensor_inplace(dec); + tensor = (ggml_tensor *) (uintptr_t) vn_decode_ggml_tensor_inplace(dec, TENSOR_MAX_DEPTH_BUFFER_GET_TENSOR); uint32_t shmem_res_id; vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); diff 
--git a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp index ba2ec479a95c0..72d159bb676b9 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp @@ -72,7 +72,7 @@ backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, uint32_t backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { UNUSED(ctx); - const ggml_tensor *op = vn_decode_ggml_tensor_inplace(dec); + const ggml_tensor *op = vn_decode_ggml_tensor_inplace(dec, TENSOR_MAX_DEPTH_DEVICE_SUPPORTS_OP); bool supports_op = dev->iface.supports_op(dev, op); diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 2608909b7541e..14b0c21240547 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -70,3 +70,8 @@ struct virgl_apir_context { struct virgl_apir_callbacks iface; }; + +#define TENSOR_MAX_DEPTH_DEVICE_SUPPORTS_OP 2 +#define TENSOR_MAX_DEPTH_BUFFER_GET_TENSOR 2 +#define TENSOR_MAX_DEPTH_BUFFER_SET_TENSOR 2 +#define TENSOR_MAX_DEPTH_CGRAPH_DATA 10 diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h index 510cbd6fcce66..fdfb498576347 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -105,7 +105,7 @@ vn_cs_decoder_read(struct vn_cs_decoder *dec, dec->cur += size; } -static inline void +static inline char * vn_cs_encoder_write(struct vn_cs_encoder *enc, size_t size, const void *val, @@ -114,9 +114,12 @@ vn_cs_encoder_write(struct vn_cs_encoder *enc, assert(val_size <= size); assert(size <= ((size_t) (enc->end - enc->cur))); + char *write_addr = enc->cur; /* we should not rely on the compiler to optimize away memcpy... 
*/ - memcpy(enc->cur, val, val_size); + memcpy(write_addr, val, val_size); enc->cur += size; + + return write_addr; } /* diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h index e254228017531..e4350971c76fa 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h @@ -10,24 +10,37 @@ vn_encode_ggml_buffer_handle(struct vn_cs_encoder *enc, const apir_buffer_handle static inline ggml_backend_buffer_t vn_decode_ggml_buffer(struct vn_cs_decoder *dec); +/* ggml_tensor */ + +static inline size_t +vn_encode_sizeof_ggml_tensor(const ggml_tensor *tensor, int depth_to_go) { + /* must match the encoding vn_encode_ggml_tensor */ + size_t size = 0; + size_t tensor_size = sizeof(ggml_tensor); + + size += tensor_size; // the main tensor + + if (depth_to_go != 0) { + if (tensor->view_src) { + size += vn_encode_sizeof_ggml_tensor(tensor->view_src, depth_to_go-1); + } + + for (int i = 0; tensor->src[i]; i++) { + size += vn_encode_sizeof_ggml_tensor(tensor->src[i], depth_to_go-1); + } + } + return size; +} + static inline void -vn_encode_ggml_tensor(struct vn_cs_encoder *enc, const ggml_tensor *tensor) { +vn_encode_ggml_tensor(struct vn_cs_encoder *enc, const ggml_tensor *tensor, int depth_to_go) { size_t tensor_size = sizeof(*tensor); if (tensor->extra) { FATAL("Cannot pass tensors with extra"); } - if (tensor->src[0] && tensor->buffer) { - static int first = 1; - if (first) { - // not sure if the buffer needs to be updated inside the src tensors or not - WARNING("Cannot pass tensors with src and buffer"); - first = 0; - } - } - - vn_cs_encoder_write(enc, tensor_size, tensor, tensor_size); + ggml_tensor *cs_tensor = (ggml_tensor *) vn_cs_encoder_write(enc, tensor_size, tensor, tensor_size); // tensor->data is a pointer inside the device buffer. No need to touch it // tensor->buffer is a pointer to a buffer. Encoding the buffer handle in sequence. @@ -35,52 +48,40 @@ vn_encode_ggml_tensor(struct vn_cs_encoder *enc, const ggml_tensor *tensor) { if (tensor->buffer) { apir_buffer_handle_t buffer_handle = ggml_buffer_to_apir_handle(tensor->buffer); - vn_encode_ggml_buffer_handle(enc, &buffer_handle); + cs_tensor->buffer = (ggml_backend_buffer *) buffer_handle; } - if (tensor->view_src) { - vn_cs_encoder_write(enc, tensor_size, tensor->view_src, tensor_size); - } - - for (int i = 0; tensor->src[i]; i++) { - const ggml_tensor *tensor_src = tensor->src[i]; - vn_cs_encoder_write(enc, tensor_size, tensor_src, tensor_size); + if (depth_to_go != 0) { + if (tensor->view_src) { + vn_encode_ggml_tensor(enc, tensor->view_src, depth_to_go-1); + } -#if 0 - if (tensor_src->buffer) { - apir_buffer_handle_t src_buffer_handle = ggml_buffer_to_apir_handle(tensor_src->buffer); - vn_encode_ggml_buffer_handle(enc, &src_buffer_handle); + for (int i = 0; tensor->src[i]; i++) { + vn_encode_ggml_tensor(enc, tensor->src[i], depth_to_go-1); } -#endif } } -static inline const ggml_tensor * -vn_decode_ggml_tensor_inplace(struct vn_cs_decoder *dec) { +static inline ggml_tensor * +vn_decode_ggml_tensor_inplace(struct vn_cs_decoder *dec, int depth_to_go) { // it safe to remove the `const` qualifier here, we *do* want to // modify the shared memory data to fix the `src` pointers. ggml_tensor *tensor = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); // tensor->data is a pointer inside the device buffer. No need to touch it - // tensor->buffer is a pointer to a buffer. 
Decode the buffer handle encoded in sequence. - if (tensor->buffer) { - tensor->buffer = vn_decode_ggml_buffer(dec); - } + // tensor->buffer has already been updated to the correct pointer - if (tensor->view_src) { - ggml_tensor *tensor_view_src = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); - tensor->view_src = tensor_view_src; - } + if (depth_to_go != 0) { + if (tensor->view_src) { + ggml_tensor *tensor_view_src = vn_decode_ggml_tensor_inplace(dec, depth_to_go-1); + tensor->view_src = tensor_view_src; + } - for (int i = 0; tensor->src[i]; i++) { - ggml_tensor *tensor_src = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); - tensor->src[i] = tensor_src; // overwrite op->src[i] pointer with the actual location of the src tensor -#if 0 - if (tensor_src->buffer) { - tensor_src->buffer = vn_decode_ggml_buffer(dec); + for (int i = 0; tensor->src[i]; i++) { + ggml_tensor *tensor_src_i = vn_decode_ggml_tensor_inplace(dec, depth_to_go-1); + tensor->src[i] = tensor_src_i; } -#endif } return tensor; @@ -176,24 +177,16 @@ vn_encode_sizeof_ggml_cgraph_data(ggml_cgraph *cgraph) { /* must match the encoding of vn_encode_ggml_cgraph and vn_encode_ggml_tensor */ size_t size = 0; - size += sizeof(ggml_tensor*) * cgraph->n_nodes; + // don't include the `ggml_cgraph`, only it's data - size_t tensor_size = sizeof(ggml_tensor); - INFO("tensor_size: %lu", tensor_size); - size += tensor_size * cgraph->n_nodes; + // include the array of tensors + size += sizeof(ggml_tensor*) * cgraph->n_nodes; + // include the size of all the tensors for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor *tensor = cgraph->nodes[i]; - if (tensor->buffer) { - size += sizeof(apir_buffer_handle_t); - } - if (tensor->view_src) { - size += tensor_size; - } - for (int j = 0; tensor->src[j]; j++) { - size += tensor_size; - } + size += vn_encode_sizeof_ggml_tensor(cgraph->nodes[i], TENSOR_MAX_DEPTH_CGRAPH_DATA); } + return size; } @@ -225,7 +218,7 @@ vn_encode_ggml_cgraph(struct vn_cs_encoder *enc, ggml_cgraph *cgraph, struct vn_ for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor *tensor = cgraph->nodes[i]; - vn_encode_ggml_tensor(secondary_enc, tensor); + vn_encode_ggml_tensor(secondary_enc, tensor, TENSOR_MAX_DEPTH_CGRAPH_DATA); } } @@ -238,7 +231,7 @@ vn_decode_ggml_cgraph(struct vn_cs_decoder *dec, struct vn_cs_decoder *secondary cgraph->nodes = vn_decode_ggml_tensor_array_inplace(secondary_dec, cgraph->n_nodes); for (int i = 0; i < cgraph->n_nodes; i++) { - cgraph->nodes[i] = (ggml_tensor *)(uintptr_t) vn_decode_ggml_tensor_inplace(secondary_dec); + cgraph->nodes[i] = (ggml_tensor *)(uintptr_t) vn_decode_ggml_tensor_inplace(secondary_dec, TENSOR_MAX_DEPTH_CGRAPH_DATA); } return cgraph; diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index d51afbe19dc78..6a8bf2ea75713 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -71,6 +71,9 @@ struct ggml_backend_remoting_buffer_context { static inline apir_buffer_handle_t ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) { struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) buffer->context; + if (!context) { + return 0; + } return context->handle; } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp index f6c2ccb33b9e9..cfe89a19b761e 100644 --- 
a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp @@ -33,7 +33,7 @@ apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_SET_TENSOR); vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); - vn_encode_ggml_tensor(encoder, tensor); + vn_encode_ggml_tensor(encoder, tensor, TENSOR_MAX_DEPTH_BUFFER_SET_TENSOR); struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, size); if (!shmem) { @@ -64,7 +64,7 @@ apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_GET_TENSOR); vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); - vn_encode_ggml_tensor(encoder, tensor); + vn_encode_ggml_tensor(encoder, tensor, TENSOR_MAX_DEPTH_BUFFER_GET_TENSOR); struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, size); if (!shmem) { FATAL("Couldn't allocate the guest-host shared buffer :/"); diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp index 7c241d71a1679..4c7b1da767068 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp @@ -140,8 +140,7 @@ apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op) { REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP); - vn_encode_ggml_tensor(encoder, op); - + vn_encode_ggml_tensor(encoder, op, TENSOR_MAX_DEPTH_DEVICE_SUPPORTS_OP); REMOTE_CALL(gpu, encoder, decoder); From 6ce806bc13b9fe875b9f199a18679d56d30a229e Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 20 May 2025 17:57:42 +0200 Subject: [PATCH 074/117] keep working --- ggml/src/ggml-remotingbackend/CMakeLists.txt | 1 + .../backend-dispatched-backend.cpp | 12 +- .../backend-dispatched-buffer.cpp | 9 +- .../backend-dispatched-device.cpp | 3 +- .../shared/venus_cs_ggml.h | 146 ++++-------------- ggml/src/ggml-remotingfrontend/CMakeLists.txt | 1 + .../ggml-remotingfrontend/ggml-backend.cpp | 4 +- .../virtgpu-forward-buffer.cpp | 6 +- .../virtgpu-forward-device.cpp | 14 +- 9 files changed, 64 insertions(+), 132 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/CMakeLists.txt b/ggml/src/ggml-remotingbackend/CMakeLists.txt index 3d7255faf237f..f9a63ef60a445 100644 --- a/ggml/src/ggml-remotingbackend/CMakeLists.txt +++ b/ggml/src/ggml-remotingbackend/CMakeLists.txt @@ -14,6 +14,7 @@ ggml_add_backend_library(ggml-remotingbackend shared/api_remoting.h shared/apir_backend.h shared/venus_cs.h + shared/venus_cs_ggml-rpc.cpp ) target_compile_options(ggml-remotingbackend PRIVATE -std=c++20) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp index 72c01c7fa2777..61619f4c94f6b 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp @@ -13,22 +13,24 @@ backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, stru uint32_t shmem_res_id; vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); - INFO("Receive shmem id %d", shmem_res_id); + const void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id); if (!shmem_data) { FATAL("Couldn't get the shmem addr from virgl :/"); } size_t shmem_size; vn_decode_size_t(dec, &shmem_size); - INFO("Receive shmem size %ld", shmem_size); + 
struct vn_cs_decoder secondary_dec = vn_cs_new_decoder((const char *) shmem_data, shmem_size); ggml_cgraph *cgraph = vn_decode_ggml_cgraph(dec, &secondary_dec); ggml_status status = GGML_STATUS_SUCCESS; - status = bck->iface.graph_compute(bck, cgraph); - - INFO("Send status %u", status); + if (false) { + status = bck->iface.graph_compute(bck, cgraph); + } else { + WARNING("SKIPPING backend->graph_compute()"); + } vn_encode_ggml_status(enc, &status); diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp index d181937f55256..70d86677d15b1 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp @@ -30,7 +30,7 @@ backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, ggml_tensor *tensor; // safe to remove the const qualifier here - tensor = (ggml_tensor *) (uintptr_t) vn_decode_ggml_tensor_inplace(dec, TENSOR_MAX_DEPTH_BUFFER_SET_TENSOR); + tensor = (ggml_tensor *) (uintptr_t) vn_decode_ggml_tensor(dec); uint32_t shmem_res_id; vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); @@ -73,9 +73,10 @@ backend_buffer_get_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, ggml_backend_buffer_t buffer; buffer = vn_decode_ggml_buffer(dec); - ggml_tensor *tensor; + + const ggml_tensor *tensor; // safe to remove the const qualifier here - tensor = (ggml_tensor *) (uintptr_t) vn_decode_ggml_tensor_inplace(dec, TENSOR_MAX_DEPTH_BUFFER_GET_TENSOR); + tensor = vn_decode_ggml_tensor(dec); uint32_t shmem_res_id; vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); @@ -91,8 +92,6 @@ backend_buffer_get_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, FATAL("Couldn't get the shmem addr from virgl :/"); } - INFO("GET_TENSOR"); - UNUSED(buffer); UNUSED(tensor); buffer->iface.get_tensor(buffer, tensor, shmem_data, offset, size); diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp index 72d159bb676b9..863c2698779e7 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp @@ -72,7 +72,8 @@ backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, uint32_t backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { UNUSED(ctx); - const ggml_tensor *op = vn_decode_ggml_tensor_inplace(dec, TENSOR_MAX_DEPTH_DEVICE_SUPPORTS_OP); + + const ggml_tensor *op = vn_decode_ggml_tensor(dec); bool supports_op = dev->iface.supports_op(dev, op); diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h index e4350971c76fa..61f3a810ebc01 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h @@ -1,6 +1,8 @@ // needs the ggml-backend-impl.h definition // needs venus_cs.h definition +#include "venus_cs_ggml-rpc.h" + // needs // ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer); @@ -10,98 +12,43 @@ vn_encode_ggml_buffer_handle(struct vn_cs_encoder *enc, const apir_buffer_handle static inline ggml_backend_buffer_t vn_decode_ggml_buffer(struct vn_cs_decoder *dec); -/* ggml_tensor */ - -static inline size_t -vn_encode_sizeof_ggml_tensor(const ggml_tensor *tensor, int depth_to_go) { - /* must match the encoding vn_encode_ggml_tensor */ - size_t size = 0; - 
size_t tensor_size = sizeof(ggml_tensor); - - size += tensor_size; // the main tensor - - if (depth_to_go != 0) { - if (tensor->view_src) { - size += vn_encode_sizeof_ggml_tensor(tensor->view_src, depth_to_go-1); - } - - for (int i = 0; tensor->src[i]; i++) { - size += vn_encode_sizeof_ggml_tensor(tensor->src[i], depth_to_go-1); - } - } - return size; -} +/* rpc_tensor */ static inline void -vn_encode_ggml_tensor(struct vn_cs_encoder *enc, const ggml_tensor *tensor, int depth_to_go) { - size_t tensor_size = sizeof(*tensor); - - if (tensor->extra) { - FATAL("Cannot pass tensors with extra"); - } - - ggml_tensor *cs_tensor = (ggml_tensor *) vn_cs_encoder_write(enc, tensor_size, tensor, tensor_size); - - // tensor->data is a pointer inside the device buffer. No need to touch it - // tensor->buffer is a pointer to a buffer. Encoding the buffer handle in sequence. - // (could also make a copy of the tensor, and update locally.) - - if (tensor->buffer) { - apir_buffer_handle_t buffer_handle = ggml_buffer_to_apir_handle(tensor->buffer); - cs_tensor->buffer = (ggml_backend_buffer *) buffer_handle; - } - - if (depth_to_go != 0) { - if (tensor->view_src) { - vn_encode_ggml_tensor(enc, tensor->view_src, depth_to_go-1); - } - - for (int i = 0; tensor->src[i]; i++) { - vn_encode_ggml_tensor(enc, tensor->src[i], depth_to_go-1); - } - } +vn_encode_rcp_tensor(struct vn_cs_encoder *enc, const rpc_tensor *rpc_tensor) { + size_t rpc_tensor_size = sizeof(*rpc_tensor); + vn_encode(enc, rpc_tensor_size, rpc_tensor, rpc_tensor_size); } -static inline ggml_tensor * -vn_decode_ggml_tensor_inplace(struct vn_cs_decoder *dec, int depth_to_go) { - - // it safe to remove the `const` qualifier here, we *do* want to - // modify the shared memory data to fix the `src` pointers. - ggml_tensor *tensor = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); - - // tensor->data is a pointer inside the device buffer. 
No need to touch it - // tensor->buffer has already been updated to the correct pointer +static inline rpc_tensor * +vn_decode_rpc_tensor_inplace(struct vn_cs_decoder *dec) { + size_t rpc_tensor_size = sizeof(rpc_tensor); - if (depth_to_go != 0) { - if (tensor->view_src) { - ggml_tensor *tensor_view_src = vn_decode_ggml_tensor_inplace(dec, depth_to_go-1); - tensor->view_src = tensor_view_src; - } - - for (int i = 0; tensor->src[i]; i++) { - ggml_tensor *tensor_src_i = vn_decode_ggml_tensor_inplace(dec, depth_to_go-1); - tensor->src[i] = tensor_src_i; - } - } - - return tensor; + return (rpc_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, rpc_tensor_size); } -/* tensor array */ +/* ggml_tensor */ static inline void -vn_encode_ggml_tensor_array(struct vn_cs_encoder *enc, ggml_tensor **addr, size_t count) -{ - size_t buffer_size = sizeof(*addr) * count; - vn_encode(enc, buffer_size, addr, buffer_size); +vn_encode_ggml_tensor(struct vn_cs_encoder *enc, const ggml_tensor *tensor) { + rpc_tensor serialized = serialize_tensor(tensor); + + vn_encode_rcp_tensor(enc, &serialized); } -static inline ggml_tensor ** -vn_decode_ggml_tensor_array_inplace(struct vn_cs_decoder *dec, size_t count) -{ - size_t buffer_size = sizeof(ggml_tensor*) * count; +static inline const ggml_tensor * +vn_decode_ggml_tensor(struct vn_cs_decoder *dec) { + const rpc_tensor *rpc_tensor = vn_decode_rpc_tensor_inplace(dec); + struct ggml_init_params params { + /*.mem_size =*/ ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + struct ggml_context * ctx = ggml_init(params); - return (ggml_tensor **)(uintptr_t) vn_cs_decoder_use_inplace(dec, buffer_size); + const ggml_tensor *tensor = deserialize_tensor(ctx, rpc_tensor); + + return tensor; } /* *** ggml_backend_buffer_type_t *** */ @@ -186,7 +133,7 @@ vn_encode_sizeof_ggml_cgraph_data(ggml_cgraph *cgraph) { for (int i = 0; i < cgraph->n_nodes; i++) { size += vn_encode_sizeof_ggml_tensor(cgraph->nodes[i], TENSOR_MAX_DEPTH_CGRAPH_DATA); } - + INFO("SIZEOF(cgraph) --> %lu", size); return size; } @@ -194,45 +141,16 @@ static inline void vn_encode_ggml_cgraph(struct vn_cs_encoder *enc, ggml_cgraph *cgraph, struct vn_cs_encoder *secondary_enc) { UNUSED(enc); UNUSED(cgraph); - - if (cgraph->n_leafs) { - FATAL("Cannot pass cgraphs with leaves"); - } - if (cgraph->size) { - FATAL("Cannot pass cgraphs with size"); - } - if (cgraph->grads) { - FATAL("Cannot pass cgraphs with grads"); - } - if (cgraph->grad_accs) { - FATAL("Cannot pass cgraphs with grad_accs"); - } - if (cgraph->visited_hash_set.size || cgraph->visited_hash_set.used || cgraph->visited_hash_set.keys) { - FATAL("Cannot pass cgraphs with visited_hash_set"); - } - - size_t cgraph_size = sizeof(*cgraph); - vn_cs_encoder_write(enc, cgraph_size, cgraph, cgraph_size); - - vn_encode_ggml_tensor_array(secondary_enc, cgraph->nodes, cgraph->n_nodes); - - for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor *tensor = cgraph->nodes[i]; - vn_encode_ggml_tensor(secondary_enc, tensor, TENSOR_MAX_DEPTH_CGRAPH_DATA); - } + UNUSED(secondary_enc); } static inline ggml_cgraph * vn_decode_ggml_cgraph(struct vn_cs_decoder *dec, struct vn_cs_decoder *secondary_dec) { // it safe to remove the `const` qualifier here, we *do* want to // modify the shared memory data to fix the `src` pointers. 
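  /*
   * The in-place cgraph decode below is being dropped: tensors now go through
   * the ggml-rpc style rpc_tensor records (see serialize_tensor() above), and
   * the following commit rebuilds whole graphs host-side with
   * deserialize_graph() / create_node(). A rough picture of that flattening
   * (a sketch of the scheme, not additional API):
   *
   *   rpc_tensor r = serialize_tensor(t);   // r.id     = (uint64_t) t
   *                                         // r.src[i] = (uint64_t) t->src[i]
   *                                         // r.buffer = BUFFER_TO_HANDLE(t->buffer)
   *   // host side: create_node(r.id, ...) looks the id up in tensor_ptrs and
   *   // recursively re-links src[] and view_src into fresh ggml_tensor objects.
   */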
- ggml_cgraph *cgraph = (ggml_cgraph *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_cgraph)); - - cgraph->nodes = vn_decode_ggml_tensor_array_inplace(secondary_dec, cgraph->n_nodes); - for (int i = 0; i < cgraph->n_nodes; i++) { - cgraph->nodes[i] = (ggml_tensor *)(uintptr_t) vn_decode_ggml_tensor_inplace(secondary_dec, TENSOR_MAX_DEPTH_CGRAPH_DATA); - } + UNUSED(dec); + UNUSED(secondary_dec); - return cgraph; + return NULL; } diff --git a/ggml/src/ggml-remotingfrontend/CMakeLists.txt b/ggml/src/ggml-remotingfrontend/CMakeLists.txt index a13f48b7ef81b..e0b305fc26c3f 100644 --- a/ggml/src/ggml-remotingfrontend/CMakeLists.txt +++ b/ggml/src/ggml-remotingfrontend/CMakeLists.txt @@ -19,6 +19,7 @@ ggml_add_backend_library(ggml-remotingfrontend virtgpu-forward-backend.cpp virtgpu-forward-impl.h ../../include/ggml-remoting-frontend.h + ../ggml-remotingbackend/shared/venus_cs_ggml-rpc.cpp ) target_link_libraries(ggml-remotingfrontend PUBLIC drm) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp index 190fde76a30bc..05383ff99f0a5 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp @@ -20,7 +20,9 @@ static ggml_status ggml_backend_remoting_graph_compute(ggml_backend_t backend, g struct virtgpu *gpu = DEV_TO_GPU(backend->device); - BEING_IMPLEMENTED; + IMPLEMENTED; + + STOP_HERE; return apir_backend_graph_compute(gpu, cgraph); } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp index cfe89a19b761e..1a95f2f4721e5 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp @@ -26,6 +26,7 @@ apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, ggml_tensor *tensor, const void *data, size_t offset, size_t size) { struct vn_cs_encoder *encoder; struct vn_cs_decoder *decoder; + #if 0 INFO("Calling (%p)->set_tensor(tensor=%p, data=%p, offset=%lu, size=%lu", buffer_handle, tensor, data, offset, size); @@ -33,7 +34,7 @@ apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_SET_TENSOR); vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); - vn_encode_ggml_tensor(encoder, tensor, TENSOR_MAX_DEPTH_BUFFER_SET_TENSOR); + vn_encode_ggml_tensor(encoder, tensor); struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, size); if (!shmem) { @@ -64,7 +65,8 @@ apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_GET_TENSOR); vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); - vn_encode_ggml_tensor(encoder, tensor, TENSOR_MAX_DEPTH_BUFFER_GET_TENSOR); + vn_encode_ggml_tensor(encoder, tensor); + struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, size); if (!shmem) { FATAL("Couldn't allocate the guest-host shared buffer :/"); diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp index 4c7b1da767068..5ee2c01dd50ab 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp @@ -135,23 +135,29 @@ apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total) { bool apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op) { +#if 1 + /* ggml-rpc cheats it 
like this */ + /* with the current implementation of serialize_tensor, the src/view aren't properly passed */ + UNUSED(gpu); + UNUSED(op); + + return true; +#else struct vn_cs_encoder *encoder; struct vn_cs_decoder *decoder; - REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP); - vn_encode_ggml_tensor(encoder, op, TENSOR_MAX_DEPTH_DEVICE_SUPPORTS_OP); + vn_encode_ggml_tensor(encoder, op); REMOTE_CALL(gpu, encoder, decoder); bool supports_op; vn_decode_bool_t(decoder, &supports_op); - /* *** */ - REMOTE_CALL_FINISH(gpu, encoder, decoder); return supports_op; +#endif } apir_buffer_type_handle_t From 6fc0c88e029a499dc925aa764fa46071ef17c204 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 21 May 2025 14:11:36 +0200 Subject: [PATCH 075/117] start using the ggml-rpc serialization methods --- ggml/src/ggml-remotingbackend/CMakeLists.txt | 2 +- .../backend-dispatched-backend.cpp | 16 +- .../backend-dispatched-buffer-type.cpp | 4 + .../ggml-remotingbackend/shared/venus_cs.h | 6 + .../shared/venus_cs_ggml-rpc.cpp | 167 ++++++++++++++++++ .../shared/venus_cs_ggml-rpc.h | 43 +++++ .../shared/venus_cs_ggml.h | 48 ++--- .../venus_cs_ggml-rpc-back.cpp | 97 ++++++++++ ggml/src/ggml-remotingfrontend/CMakeLists.txt | 2 +- .../ggml-backend-buffer-type.cpp | 8 +- .../ggml-backend-device.cpp | 2 +- .../ggml-remotingfrontend/ggml-backend.cpp | 6 +- .../src/ggml-remotingfrontend/ggml-remoting.h | 3 + .../venus_cs_ggml-rpc-front.cpp | 84 +++++++++ .../virtgpu-forward-backend.cpp | 20 ++- .../ggml-remotingfrontend/virtgpu-utils.cpp | 5 + .../src/ggml-remotingfrontend/virtgpu-utils.h | 1 + 17 files changed, 461 insertions(+), 53 deletions(-) create mode 100644 ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.cpp create mode 100644 ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.h create mode 100644 ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp create mode 100644 ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp diff --git a/ggml/src/ggml-remotingbackend/CMakeLists.txt b/ggml/src/ggml-remotingbackend/CMakeLists.txt index f9a63ef60a445..7e374d395f68c 100644 --- a/ggml/src/ggml-remotingbackend/CMakeLists.txt +++ b/ggml/src/ggml-remotingbackend/CMakeLists.txt @@ -14,7 +14,7 @@ ggml_add_backend_library(ggml-remotingbackend shared/api_remoting.h shared/apir_backend.h shared/venus_cs.h - shared/venus_cs_ggml-rpc.cpp + venus_cs_ggml-rpc-back.cpp ) target_compile_options(ggml-remotingbackend PRIVATE -std=c++20) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp index 61619f4c94f6b..f34a5b8c4d645 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp @@ -18,19 +18,15 @@ backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, stru if (!shmem_data) { FATAL("Couldn't get the shmem addr from virgl :/"); } - size_t shmem_size; - vn_decode_size_t(dec, &shmem_size); + size_t cgraph_size; + vn_decode_size_t(dec, &cgraph_size); - struct vn_cs_decoder secondary_dec = vn_cs_new_decoder((const char *) shmem_data, shmem_size); + struct vn_cs_decoder secondary_dec = vn_cs_new_decoder((const char *) shmem_data, cgraph_size); - ggml_cgraph *cgraph = vn_decode_ggml_cgraph(dec, &secondary_dec); + ggml_cgraph *cgraph = vn_decode_ggml_cgraph(&secondary_dec, cgraph_size); - ggml_status status = GGML_STATUS_SUCCESS; - if (false) { - status = bck->iface.graph_compute(bck, cgraph); - } else { - 
WARNING("SKIPPING backend->graph_compute()"); - } + ggml_status status; + status = bck->iface.graph_compute(bck, cgraph); vn_encode_ggml_status(enc, &status); diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp index f09592ea5df43..8c3349a367dfc 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp @@ -70,5 +70,9 @@ backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder apir_buffer_handle_t *buffer_handle = (apir_buffer_handle_t *) buffer; vn_encode_ggml_buffer_handle(enc, buffer_handle); + if (buffer) { + track_backend_buffer(buffer); + } + return 0; } diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h index fdfb498576347..93af8fd287c81 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -205,6 +205,12 @@ vn_decode_uint64_t_array(struct vn_cs_decoder *dec, uint64_t *val, uint32_t coun vn_decode(dec, size, val, size); } +static inline const uint64_t * +vn_decode_uint64_t_array_inplace(struct vn_cs_decoder *dec, uint32_t count) +{ + return (uint64_t *)(uintptr_t) vn_cs_decoder_use_inplace(dec, count * sizeof(uint64_t)); +} + /* int32_t */ static inline size_t diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.cpp b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.cpp new file mode 100644 index 0000000000000..196cd70958745 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.cpp @@ -0,0 +1,167 @@ +#include +#include +#include +#include + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "venus_cs_ggml-rpc.h" + +std::unordered_set backend_buffers; + +void +track_backend_buffer(ggml_backend_buffer_t buffer) { + backend_buffers.insert(buffer); +} + +rpc_tensor +serialize_tensor(const ggml_tensor * tensor) { + rpc_tensor result; + result.id = reinterpret_cast(tensor); + result.type = tensor->type; + if (tensor->buffer) { + ggml_backend_buffer_t buffer = tensor->buffer; + + result.buffer = BUFFER_TO_HANDLE(buffer); + } else { + result.buffer = 0; + } + for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) { + result.ne[i] = tensor->ne[i]; + result.nb[i] = tensor->nb[i]; + } + result.op = tensor->op; + for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) { + result.op_params[i] = tensor->op_params[i]; + } + result.flags = tensor->flags; + for (uint32_t i = 0; i < GGML_MAX_SRC; i++) { + result.src[i] = reinterpret_cast(tensor->src[i]); + } + result.view_src = reinterpret_cast(tensor->view_src); + result.view_offs = tensor->view_offs; + result.data = reinterpret_cast(tensor->data); + snprintf(result.name, GGML_MAX_NAME, "%s", tensor->name); + return result; +} + +ggml_tensor * +deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor) { + ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type, + tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); + for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) { + result->nb[i] = tensor->nb[i]; + } + result->buffer = reinterpret_cast(tensor->buffer); + if (result->buffer && backend_buffers.find(result->buffer) == backend_buffers.end()) { + printf("WARNING: BUFFER NOT FOUND | %p\n", (void *)result->buffer); + result->buffer = nullptr; + } + + if (result->buffer) { + // require that the tensor data does not go beyond the buffer 
end + uint64_t tensor_size = (uint64_t) ggml_nbytes(result); + uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer); + uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer); + GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow + GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size); + } + + result->op = (ggml_op) tensor->op; + for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) { + result->op_params[i] = tensor->op_params[i]; + } + result->flags = tensor->flags; + result->data = reinterpret_cast(tensor->data); + ggml_set_name(result, tensor->name); + return result; +} + +void +add_tensor(ggml_tensor * tensor, std::vector & tensors, std::unordered_set & visited) { + if (tensor == nullptr) { + return; + } + if (visited.find(tensor) != visited.end()) { + return; + } + visited.insert(tensor); + for (int i = 0; i < GGML_MAX_SRC; i++) { + add_tensor(tensor->src[i], tensors, visited); + } + add_tensor(tensor->view_src, tensors, visited); + tensors.push_back(serialize_tensor(tensor)); +} + +void +serialize_graph(const ggml_cgraph * cgraph, std::vector & output) { + uint32_t n_nodes = cgraph->n_nodes; + std::vector tensors; + std::unordered_set visited; + for (uint32_t i = 0; i < n_nodes; i++) { + add_tensor(cgraph->nodes[i], tensors, visited); + } + // serialization format: + // | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) | + uint32_t n_tensors = tensors.size(); + int output_size = sizeof(uint32_t) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t) + n_tensors * sizeof(rpc_tensor); + output.resize(output_size, 0); + memcpy(output.data(), &n_nodes, sizeof(n_nodes)); + for (uint32_t i = 0; i < n_nodes; i++) { + memcpy(output.data() + sizeof(n_nodes) + i * sizeof(uint64_t), &cgraph->nodes[i], sizeof(uint64_t)); + } + uint32_t * out_ntensors = (uint32_t *)(output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t)); + *out_ntensors = n_tensors; + rpc_tensor * out_tensors = (rpc_tensor *)(output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t)); + memcpy(out_tensors, tensors.data(), n_tensors * sizeof(rpc_tensor)); +} + +ggml_tensor * +create_node(uint64_t id, + struct ggml_context * ctx, + const std::unordered_map & tensor_ptrs, + std::unordered_map & tensor_map) { + if (id == 0) { + return nullptr; + } + if (tensor_map.find(id) != tensor_map.end()) { + return tensor_map[id]; + } + const rpc_tensor * tensor = tensor_ptrs.at(id); + struct ggml_tensor * result = deserialize_tensor(ctx, tensor); + if (result == nullptr) { + return nullptr; + } + tensor_map[id] = result; + for (int i = 0; i < GGML_MAX_SRC; i++) { + result->src[i] = create_node(tensor->src[i], ctx, tensor_ptrs, tensor_map); + } + result->view_src = create_node(tensor->view_src, ctx, tensor_ptrs, tensor_map); + result->view_offs = tensor->view_offs; + return result; +} + +ggml_cgraph * +deserialize_graph(uint32_t n_nodes, uint32_t n_tensors, const rpc_tensor * tensors, const uint64_t * nodes) { + size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false); + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + struct ggml_context * ctx = ggml_init(params); + struct ggml_cgraph * graph = ggml_new_graph_custom(ctx, n_nodes, false); + graph->n_nodes = n_nodes; + std::unordered_map 
tensor_ptrs; + for (uint32_t i = 0; i < n_tensors; i++) { + tensor_ptrs[tensors[i].id] = &tensors[i]; + } + std::unordered_map tensor_map; + for (uint32_t i = 0; i < n_nodes; i++) { + int64_t id; + memcpy(&id, &nodes[i], sizeof(id)); + graph->nodes[i] = create_node(id, ctx, tensor_ptrs, tensor_map); + } + + return graph; +} diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.h new file mode 100644 index 0000000000000..a50405a479221 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.h @@ -0,0 +1,43 @@ +#include +#include +#include + +// ggml_tensor is serialized into rpc_tensor +struct rpc_tensor { + uint64_t id; + uint32_t type; + uint64_t buffer; + uint32_t ne[GGML_MAX_DIMS]; + uint32_t nb[GGML_MAX_DIMS]; + uint32_t op; + int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)]; + int32_t flags; + uint64_t src[GGML_MAX_SRC]; + uint64_t view_src; + uint64_t view_offs; + uint64_t data; + char name[GGML_MAX_NAME]; + + char padding[4]; +}; + +/* frontend */ + +rpc_tensor serialize_tensor(const ggml_tensor * tensor); + +void serialize_graph(const ggml_cgraph * cgraph, std::vector & output); + +/* backend */ + +void track_backend_buffer(ggml_backend_buffer_t buffer); + +void add_tensor(ggml_tensor * tensor, std::vector & tensors, std::unordered_set & visited); + +ggml_tensor *deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor); + +ggml_tensor *create_node(uint64_t id, + struct ggml_context * ctx, + const std::unordered_map & tensor_ptrs, + std::unordered_map & tensor_map); + +ggml_cgraph *deserialize_graph(uint32_t n_nodes, uint32_t n_tensors, const rpc_tensor * tensors, const uint64_t * nodes); diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h index 61f3a810ebc01..c32ac91650e4d 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h @@ -27,6 +27,13 @@ vn_decode_rpc_tensor_inplace(struct vn_cs_decoder *dec) { return (rpc_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, rpc_tensor_size); } +static inline rpc_tensor * +vn_decode_rpc_tensor_array_inplace(struct vn_cs_decoder *dec, uint32_t n_tensors) { + size_t rpc_tensor_size = sizeof(rpc_tensor) * n_tensors; + + return (rpc_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, rpc_tensor_size); +} + /* ggml_tensor */ static inline void @@ -120,37 +127,30 @@ vn_decode_virtgpu_shmem_res_id(struct vn_cs_decoder *dec, uint32_t *shmem_res_id /* ggml_cgraph */ static inline size_t -vn_encode_sizeof_ggml_cgraph_data(ggml_cgraph *cgraph) { - /* must match the encoding of vn_encode_ggml_cgraph and vn_encode_ggml_tensor */ - size_t size = 0; +vn_serialize_ggml_cgraph(ggml_cgraph *cgraph, std::vector & cgraph_data) { + serialize_graph(cgraph, cgraph_data); - // don't include the `ggml_cgraph`, only it's data - - // include the array of tensors - size += sizeof(ggml_tensor*) * cgraph->n_nodes; - - // include the size of all the tensors - for (int i = 0; i < cgraph->n_nodes; i++) { - size += vn_encode_sizeof_ggml_tensor(cgraph->nodes[i], TENSOR_MAX_DEPTH_CGRAPH_DATA); - } - INFO("SIZEOF(cgraph) --> %lu", size); - return size; + return cgraph_data.size(); } static inline void -vn_encode_ggml_cgraph(struct vn_cs_encoder *enc, ggml_cgraph *cgraph, struct vn_cs_encoder *secondary_enc) { - UNUSED(enc); - UNUSED(cgraph); - UNUSED(secondary_enc); +vn_encode_cgraph_data(struct vn_cs_encoder *enc, std::vector & 
cgraph_data) { + size_t cgraph_size = cgraph_data.size(); + + vn_encode(enc, cgraph_size, cgraph_data.data(), cgraph_size); } static inline ggml_cgraph * -vn_decode_ggml_cgraph(struct vn_cs_decoder *dec, struct vn_cs_decoder *secondary_dec) { - // it safe to remove the `const` qualifier here, we *do* want to - // modify the shared memory data to fix the `src` pointers. +vn_decode_ggml_cgraph(struct vn_cs_decoder *dec, size_t cgraph_size) { + UNUSED(cgraph_size); + + uint32_t n_nodes; + vn_decode_uint32_t(dec, &n_nodes); + const uint64_t * nodes = vn_decode_uint64_t_array_inplace(dec, n_nodes); - UNUSED(dec); - UNUSED(secondary_dec); + uint32_t n_tensors; + vn_decode_uint32_t(dec, &n_tensors); + const rpc_tensor *tensors = vn_decode_rpc_tensor_array_inplace(dec, n_tensors); - return NULL; + return deserialize_graph(n_nodes, n_tensors, tensors, nodes); } diff --git a/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp b/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp new file mode 100644 index 0000000000000..663160f48f061 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp @@ -0,0 +1,97 @@ +#include +#include +#include +#include + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "shared/venus_cs_ggml-rpc.h" + +std::unordered_set backend_buffers; + +void +track_backend_buffer(ggml_backend_buffer_t buffer) { + backend_buffers.insert(buffer); +} + +ggml_tensor * +deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor) { + ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type, + tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); + for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) { + result->nb[i] = tensor->nb[i]; + } + result->buffer = reinterpret_cast(tensor->buffer); + if (result->buffer && backend_buffers.find(result->buffer) == backend_buffers.end()) { + printf("WARNING: BUFFER NOT FOUND | %p\n", (void *)result->buffer); + result->buffer = nullptr; + } + + if (result->buffer) { + // require that the tensor data does not go beyond the buffer end + uint64_t tensor_size = (uint64_t) ggml_nbytes(result); + uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer); + uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer); + GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow + GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size); + } + + result->op = (ggml_op) tensor->op; + for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) { + result->op_params[i] = tensor->op_params[i]; + } + result->flags = tensor->flags; + result->data = reinterpret_cast(tensor->data); + ggml_set_name(result, tensor->name); + return result; +} + +ggml_tensor * +create_node(uint64_t id, + struct ggml_context * ctx, + const std::unordered_map & tensor_ptrs, + std::unordered_map & tensor_map) { + if (id == 0) { + return nullptr; + } + if (tensor_map.find(id) != tensor_map.end()) { + return tensor_map[id]; + } + const rpc_tensor * tensor = tensor_ptrs.at(id); + struct ggml_tensor * result = deserialize_tensor(ctx, tensor); + if (result == nullptr) { + return nullptr; + } + tensor_map[id] = result; + for (int i = 0; i < GGML_MAX_SRC; i++) { + result->src[i] = create_node(tensor->src[i], ctx, tensor_ptrs, tensor_map); + } + result->view_src = create_node(tensor->view_src, ctx, tensor_ptrs, tensor_map); + result->view_offs = tensor->view_offs; + return result; +} + +ggml_cgraph * 
+deserialize_graph(uint32_t n_nodes, uint32_t n_tensors, const rpc_tensor * tensors, const uint64_t * nodes) { + size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false); + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + struct ggml_context * ctx = ggml_init(params); + struct ggml_cgraph * graph = ggml_new_graph_custom(ctx, n_nodes, false); + graph->n_nodes = n_nodes; + std::unordered_map tensor_ptrs; + for (uint32_t i = 0; i < n_tensors; i++) { + tensor_ptrs[tensors[i].id] = &tensors[i]; + } + std::unordered_map tensor_map; + for (uint32_t i = 0; i < n_nodes; i++) { + int64_t id; + memcpy(&id, &nodes[i], sizeof(id)); + graph->nodes[i] = create_node(id, ctx, tensor_ptrs, tensor_map); + } + + return graph; +} diff --git a/ggml/src/ggml-remotingfrontend/CMakeLists.txt b/ggml/src/ggml-remotingfrontend/CMakeLists.txt index e0b305fc26c3f..15b338f730176 100644 --- a/ggml/src/ggml-remotingfrontend/CMakeLists.txt +++ b/ggml/src/ggml-remotingfrontend/CMakeLists.txt @@ -19,7 +19,7 @@ ggml_add_backend_library(ggml-remotingfrontend virtgpu-forward-backend.cpp virtgpu-forward-impl.h ../../include/ggml-remoting-frontend.h - ../ggml-remotingbackend/shared/venus_cs_ggml-rpc.cpp + venus_cs_ggml-rpc-front.cpp ) target_link_libraries(ggml-remotingfrontend PUBLIC drm) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp index 631db50b309cc..6343ce50b88a3 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp @@ -16,12 +16,14 @@ ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, context->gpu = gpu; context->handle = apir_buffer_type_alloc_buffer(gpu, buft, size); - return ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) context, size); + ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) context, size); + + return buffer; } static const char * ggml_backend_remoting_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - //IMPLEMENTED; + IMPLEMENTED_ONCE; struct virtgpu *gpu = BUFT_TO_GPU(buft); @@ -59,7 +61,7 @@ const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = { /* .get_alignment = */ ggml_backend_remoting_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_remoting_buffer_type_get_max_size, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .is_host = */ ggml_backend_remoting_buffer_type_is_host, + /* .is_host = */ NULL, }; /****************************************************************************************/ diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index 6d51643962d80..e1faad1a1f7fd 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -157,7 +157,7 @@ const struct ggml_backend_device_i ggml_backend_remoting_device_interface = { /* .get_props = */ ggml_backend_remoting_device_get_props, /* .init_backend = */ ggml_backend_remoting_device_init, /* .get_buffer_type = */ ggml_backend_remoting_device_get_buffer_type, - /* .get_host_buffer_type = */ ggml_backend_remoting_device_get_host_buffer_type, + /* .get_host_buffer_type = */ NULL, /* .buffer_from_host_ptr = */ NULL, /* .supports_op = */ 
ggml_backend_remoting_device_supports_op, /* .supports_buft = */ ggml_backend_remoting_device_supports_buft, diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp index 05383ff99f0a5..dacf0e3f1a597 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp @@ -3,7 +3,7 @@ static const char * ggml_backend_remoting_get_name(ggml_backend_t backend) { UNUSED(backend); - IMPLEMENTED; + IMPLEMENTED_ONCE; return "API Remoting backend"; } @@ -20,9 +20,7 @@ static ggml_status ggml_backend_remoting_graph_compute(ggml_backend_t backend, g struct virtgpu *gpu = DEV_TO_GPU(backend->device); - IMPLEMENTED; - - STOP_HERE; + IMPLEMENTED_ONCE; return apir_backend_graph_compute(gpu, cgraph); } diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index 6a8bf2ea75713..1bb004a7cc961 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -38,6 +38,9 @@ #define STOP_HERE \ thks_bye() +#define BREAKPOINT \ + breakpoint() + #define IMPLEMENTED \ printf("INFO: ### reached implemented function %s\n", __func__) diff --git a/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp b/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp new file mode 100644 index 0000000000000..d9b43f0222705 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp @@ -0,0 +1,84 @@ +#include +#include +#include +#include + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "../ggml-remotingbackend/shared/venus_cs_ggml-rpc.h" + +#include "ggml-remoting.h" + +rpc_tensor +serialize_tensor(const ggml_tensor * tensor) { + rpc_tensor result; + result.id = reinterpret_cast(tensor); + result.type = tensor->type; + if (tensor->buffer) { + ggml_backend_buffer_t buffer = tensor->buffer; + + result.buffer = BUFFER_TO_HANDLE(buffer); + if (result.buffer < 0x600000000000 || result.buffer > 0x700000000000) { + INFO("pass buffer handle %p", result.buffer); + BREAKPOINT; + } + } else { + result.buffer = 0; + } + for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) { + result.ne[i] = tensor->ne[i]; + result.nb[i] = tensor->nb[i]; + } + result.op = tensor->op; + for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) { + result.op_params[i] = tensor->op_params[i]; + } + result.flags = tensor->flags; + for (uint32_t i = 0; i < GGML_MAX_SRC; i++) { + result.src[i] = reinterpret_cast(tensor->src[i]); + } + result.view_src = reinterpret_cast(tensor->view_src); + result.view_offs = tensor->view_offs; + result.data = reinterpret_cast(tensor->data); + snprintf(result.name, GGML_MAX_NAME, "%s", tensor->name); + return result; +} + +void +add_tensor(ggml_tensor * tensor, std::vector & tensors, std::unordered_set & visited) { + if (tensor == nullptr) { + return; + } + if (visited.find(tensor) != visited.end()) { + return; + } + visited.insert(tensor); + for (int i = 0; i < GGML_MAX_SRC; i++) { + add_tensor(tensor->src[i], tensors, visited); + } + add_tensor(tensor->view_src, tensors, visited); + tensors.push_back(serialize_tensor(tensor)); +} + +void +serialize_graph(const ggml_cgraph * cgraph, std::vector & output) { + uint32_t n_nodes = cgraph->n_nodes; + std::vector tensors; + std::unordered_set visited; + for (uint32_t i = 0; i < n_nodes; i++) { + add_tensor(cgraph->nodes[i], tensors, visited); + } + // serialization format: + // | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t) | 
n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) | + uint32_t n_tensors = tensors.size(); + int output_size = sizeof(uint32_t) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t) + n_tensors * sizeof(rpc_tensor); + output.resize(output_size, 0); + memcpy(output.data(), &n_nodes, sizeof(n_nodes)); + for (uint32_t i = 0; i < n_nodes; i++) { + memcpy(output.data() + sizeof(n_nodes) + i * sizeof(uint64_t), &cgraph->nodes[i], sizeof(uint64_t)); + } + uint32_t * out_ntensors = (uint32_t *)(output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t)); + *out_ntensors = n_tensors; + rpc_tensor * out_tensors = (rpc_tensor *)(output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t)); + memcpy(out_tensors, tensors.data(), n_tensors * sizeof(rpc_tensor)); +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp index 8d18c18f8e2b5..51399edfd1dbc 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp @@ -9,26 +9,28 @@ apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph) { REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE); - size_t size = vn_encode_sizeof_ggml_cgraph_data(cgraph); - struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, size); + std::vector cgraph_data; + size_t cgraph_size = vn_serialize_ggml_cgraph(cgraph, cgraph_data); + + struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, cgraph_size); if (!shmem) { - FATAL("Couldn't allocate the guest-host shared buffer :/"); + FATAL("Couldn't allocate the guest-host shared buffer for passing the cgraph :/"); } - INFO("Send shmem ID %d", shmem->res_id); + //INFO("Send shmem ID %d", shmem->res_id); vn_encode_virtgpu_shmem_res_id(encoder, shmem->res_id); - INFO("Send shmem size %lu", size); - vn_encode_size_t(encoder, &size); + //INFO("Send shmem size %lu", cgraph_size); + vn_encode_size_t(encoder, &cgraph_size); char *shmem_data = (char *) shmem->mmap_ptr; - struct vn_cs_encoder secondary_enc = vn_cs_new_encoder(shmem_data, size); + struct vn_cs_encoder secondary_enc = vn_cs_new_encoder(shmem_data, cgraph_size); - vn_encode_ggml_cgraph(encoder, cgraph, &secondary_enc); + vn_encode_cgraph_data(&secondary_enc, cgraph_data); REMOTE_CALL(gpu, encoder, decoder); ggml_status status = GGML_STATUS_ABORTED; vn_decode_ggml_status(decoder, &status); - INFO("Received status %u", status); + //INFO("Received status %u", status); REMOTE_CALL_FINISH(gpu, encoder, decoder); diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp index cedd31ddaaf9c..833f0e4680103 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp @@ -193,3 +193,8 @@ void thks_bye () { exit(0); } } + +void breakpoint() { + // break here + INFO("breakpoint here :)"); +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h index a6bd5df92ea6f..77a79ebb029ca 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h @@ -29,6 +29,7 @@ #define p_atomic_read(_v) __atomic_load_n((_v), __ATOMIC_ACQUIRE) void thks_bye(); +void breakpoint(); inline void INFO(const char *format, ...) 
{ From c927b34323764635c613778abc4d38056d07e2b4 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 21 May 2025 16:52:33 +0200 Subject: [PATCH 076/117] remoting: implement the free_buffer function --- .../backend-dispatched-buffer.cpp | 13 +++++++++++++ .../src/ggml-remotingbackend/backend-dispatched.h | 3 +++ .../ggml-remotingbackend/shared/apir_backend.h | 5 +++-- .../ggml-remotingfrontend/ggml-backend-buffer.cpp | 6 ++++-- .../virtgpu-forward-buffer.cpp | 15 +++++++++++++++ ggml/src/ggml-remotingfrontend/virtgpu-forward.h | 1 + 6 files changed, 39 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp index 70d86677d15b1..782391f8ae4c1 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp @@ -114,3 +114,16 @@ backend_buffer_clear(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struc return 0; } + +uint32_t +backend_buffer_free_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(enc); + + ggml_backend_buffer_t buffer; + buffer = vn_decode_ggml_buffer(dec); + + buffer->iface.free_buffer(buffer); + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h index 5464f56baf152..d8d86fc3f67f5 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.h +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -40,6 +40,7 @@ uint32_t backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder uint32_t backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); uint32_t backend_buffer_get_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); uint32_t backend_buffer_clear(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_free_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); /* backend */ uint32_t backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); @@ -69,6 +70,7 @@ static inline const char *backend_dispatch_command_name(ApirBackendCommandType t case APIR_COMMAND_TYPE_BUFFER_SET_TENSOR: return "backend_buffer_set_tensor"; case APIR_COMMAND_TYPE_BUFFER_GET_TENSOR: return "backend_buffer_get_tensor"; case APIR_COMMAND_TYPE_BUFFER_CLEAR: return "backend_buffer_clear"; + case APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER: return "backend_buffer_free_buffer"; /* backend */ case APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE: return "backend_graph_compute"; @@ -99,6 +101,7 @@ static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATC [APIR_COMMAND_TYPE_BUFFER_SET_TENSOR] = backend_buffer_set_tensor, [APIR_COMMAND_TYPE_BUFFER_GET_TENSOR] = backend_buffer_get_tensor, [APIR_COMMAND_TYPE_BUFFER_CLEAR] = backend_buffer_clear, + [APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER] = backend_buffer_free_buffer, /* backend */ [APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE] = backend_graph_compute, diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 14b0c21240547..1f39d063f8468 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -52,12 +52,13 @@ typedef enum 
ApirBackendCommandType { APIR_COMMAND_TYPE_BUFFER_SET_TENSOR = 14, APIR_COMMAND_TYPE_BUFFER_GET_TENSOR = 15, APIR_COMMAND_TYPE_BUFFER_CLEAR = 16, + APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER = 17, /* backend */ - APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE = 17, + APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE = 18, // last command_type index + 1 - APIR_BACKEND_DISPATCH_TABLE_COUNT = 18, + APIR_BACKEND_DISPATCH_TABLE_COUNT = 19, } ApirBackendCommandType; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index ed2c749958279..5a5ead36a762e 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -76,9 +76,11 @@ static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uin static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffer) { UNUSED(buffer); - NOT_IMPLEMENTED; + IMPLEMENTED_ONCE; - STOP_HERE; + struct virtgpu *gpu = BUFFER_TO_GPU(buffer); + + apir_buffer_free_buffer(gpu, BUFFER_TO_HANDLE(buffer)); } const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp index 1a95f2f4721e5..83f402bdd0dd4 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp @@ -99,3 +99,18 @@ apir_buffer_clear(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, REMOTE_CALL_FINISH(gpu, encoder, decoder); } + + +void +apir_buffer_free_buffer(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER); + + vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); + + REMOTE_CALL(gpu, encoder, decoder); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index d59cd754eb803..15885dfc12304 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -35,6 +35,7 @@ void apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_han const ggml_tensor *tensor, void *data, size_t offset, size_t size); void apir_buffer_clear(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, uint8_t value); +void apir_buffer_free_buffer(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle); /* backend */ From f29aa560e9b81c066889eb08546e786b2f3dd9a2 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 21 May 2025 17:13:53 +0200 Subject: [PATCH 077/117] remoting: highlight the hot path --- .../src/ggml-remotingfrontend/ggml-backend-device.cpp | 11 ++++++++--- ggml/src/ggml-remotingfrontend/ggml-backend.cpp | 2 +- ggml/src/ggml-remotingfrontend/ggml-remoting.h | 2 +- run.remoting.sh | 3 ++- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index e1faad1a1f7fd..67294fcfdd5de 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -38,16 +38,21 @@ ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, s static bool ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * 
op) { - IMPLEMENTED_ONCE; +#if 1 + UNUSED(dev); + UNUSED(op); + return true; // same as ggml-rpc +#else struct virtgpu *gpu = DEV_TO_GPU(dev); return apir_device_supports_op(gpu, op); +#endif } static bool ggml_backend_remoting_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - IMPLEMENTED_ONCE; + //IMPLEMENTED_ONCE; #if 1 bool supported = buft->device == dev; @@ -66,7 +71,7 @@ ggml_backend_remoting_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_ static bool ggml_backend_remoting_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { - IMPLEMENTED_ONCE; + //IMPLEMENTED_ONCE; UNUSED(dev); UNUSED(op); diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp index dacf0e3f1a597..e4be758af84b3 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp @@ -3,7 +3,7 @@ static const char * ggml_backend_remoting_get_name(ggml_backend_t backend) { UNUSED(backend); - IMPLEMENTED_ONCE; + //IMPLEMENTED_ONCE; return "API Remoting backend"; } diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index 1bb004a7cc961..f5f51335563ea 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -49,7 +49,7 @@ static bool first = true; \ if (first) { \ printf("INFO: ### reached implemented function %s\n", __func__); \ - first = false; \ + first = true; \ } \ } while(0) diff --git a/run.remoting.sh b/run.remoting.sh index b7175a78aab4c..00253e8f818f1 100755 --- a/run.remoting.sh +++ b/run.remoting.sh @@ -7,7 +7,8 @@ else fi MODEL="$HOME/models/llama3.2" -PROMPT="say nothing" +#PROMPT="say nothing" +PROMPT="tell what's Apple metal API" $prefix \ ../build.remoting-frontend/bin/llama-run \ --ngl 99 \ From 14f327ab1489c624920b52083ea5c1d302073023 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 22 May 2025 09:45:51 +0200 Subject: [PATCH 078/117] remoting: fix the warnings and mute the debug logs when not in debug mode --- ggml/src/ggml-remotingbackend/shared/venus_cs.h | 9 +++++++++ .../ggml-remotingfrontend/ggml-backend-buffer.cpp | 2 +- .../ggml-backend-host-buffer-type.cpp | 2 +- ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp | 2 +- ggml/src/ggml-remotingfrontend/ggml-remoting.h | 12 +++++++++++- ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp | 3 +++ ggml/src/ggml-remotingfrontend/virtgpu-utils.h | 5 +++++ ggml/src/ggml-remotingfrontend/virtgpu.cpp | 4 ++++ 8 files changed, 35 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h index 93af8fd287c81..2c8723fbfe1a6 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -165,6 +165,9 @@ static inline size_t vn_sizeof_uint64_t(const uint64_t *val) { assert(sizeof(*val) == 8); +#ifdef NDEBUG + UNUSED(val); +#endif return 8; } @@ -217,6 +220,9 @@ static inline size_t vn_sizeof_int32_t(const int32_t *val) { assert(sizeof(*val) == 4); +#ifdef NDEBUG + UNUSED(val); +#endif return 4; } @@ -327,6 +333,9 @@ static inline size_t vn_sizeof_uint32_t(const uint32_t *val) { assert(sizeof(*val) == 4); +#ifdef NDEBUG + UNUSED(val); +#endif return 4; } diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index 5a5ead36a762e..1f2db27c6c472 100644 --- 
a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -64,7 +64,7 @@ static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer } static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - IMPLEMENTED; + IMPLEMENTED_ONCE; struct virtgpu *gpu = BUFFER_TO_GPU(buffer); diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp index a355e9aebbbf9..20159faf3cae9 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp @@ -16,7 +16,7 @@ ggml_backend_remoting_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { } struct ggml_backend_remoting_device_context *device_ctx = GET_DEVICE_CONTEXT(); - struct vn_renderer_shmem *shmem; + struct vn_renderer_shmem *shmem = nullptr; size_t index; for (size_t i = 0; i < device_ctx->shared_memory.size(); i++) { diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp index ca98528fd7e08..055c9b0e10dbb 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp @@ -124,7 +124,7 @@ ggml_backend_reg_t ggml_backend_remoting_frontend_reg() { /* .context = */ gpu, }; - RMT_LOG_DEBUG("ggml_backend_remoting_frontend_reg() hello :wave:"); + INFO("ggml_backend_remoting_frontend_reg() hello :wave:"); ggml_backend_remoting_reg_init_devices(®); diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index f5f51335563ea..e13d16b4ad799 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -41,17 +41,27 @@ #define BREAKPOINT \ breakpoint() +#ifndef NDEBUG #define IMPLEMENTED \ printf("INFO: ### reached implemented function %s\n", __func__) +#else +#define IMPLEMENTED \ + do {} while(0) +#endif +#ifndef NDEBUG #define IMPLEMENTED_ONCE \ do { \ static bool first = true; \ if (first) { \ printf("INFO: ### reached implemented function %s\n", __func__); \ - first = true; \ + first = false; \ } \ } while(0) +#else +#define IMPLEMENTED_ONCE \ + do {} while(0) +#endif #define RMT_LOG_DEBUG(msg) std::cerr << msg << std::endl diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp index 935b1028d2ab0..617702b8eca0c 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp @@ -43,6 +43,9 @@ virtgpu_ioctl_gem_close(struct virtgpu *gpu, uint32_t gem_handle) const int ret = virtgpu_ioctl(gpu, DRM_IOCTL_GEM_CLOSE, &args); assert(!ret); +#ifdef NDEBUG + UNUSED(ret); +#endif } static void * diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h index 77a79ebb029ca..6b69ebc6329ca 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h @@ -31,6 +31,7 @@ void thks_bye(); void breakpoint(); +#ifndef NDEBUG inline void INFO(const char *format, ...) { va_list argptr; @@ -39,6 +40,10 @@ INFO(const char *format, ...) { fprintf(stderr, "\n"); va_end(argptr); } +#else +inline void +INFO(...) {} +#endif inline void WARNING(const char *format, ...) 
{ diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp index 58d70ddda28ff..be54353ed3b1c 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -71,6 +71,10 @@ create_virtgpu() { result = virtgpu_init_context(gpu); assert(result == APIR_SUCCESS); +#ifdef NDEBUG + UNUSED(result); +#endif + virtgpu_init_shmem_blob_mem(gpu); gpu->reply_shmem = virtgpu_shmem_create(gpu, 16384); From e80e48056c75e7ba128072fd56c7fcf7ac51f0ef Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 22 May 2025 09:46:13 +0200 Subject: [PATCH 079/117] scripts: make it easier to build and run in prod mode --- build.backend.sh | 17 +++++- .../src/ggml-remotingfrontend/virtgpu-shm.cpp | 4 +- ggml/src/ggml-remotingfrontend/virtgpu.cpp | 3 +- prepare.backend.sh | 5 +- run.remoting.sh | 55 +++++++++++++++---- 5 files changed, 67 insertions(+), 17 deletions(-) diff --git a/build.backend.sh b/build.backend.sh index 086f7a4577ddd..863f98e3524a3 100755 --- a/build.backend.sh +++ b/build.backend.sh @@ -4,7 +4,22 @@ rm -f READY_backend FAILED_backend echo "int isatty(int fd) { return 1; }" | gcc -O2 -fpic -shared -ldl -o /tmp/isatty.so -xc - export LD_PRELOAD=/tmp/isatty.so -cmake --build ../build.remoting-backend --parallel 8 --target llama-run "$@" +if [[ "${PERF_MODE:-}" ]]; then + FLAVOR="-prod" +else + FLAVOR="" +fi + +if [[ "$FLAVOR" == "-prod" ]]; then + cat <base.mmap_ptr, shmem->base.mmap_size); - virtgpu_ioctl_gem_close(gpu, shmem->gem_handle); + munmap(shmem->base.mmap_ptr, shmem->base.mmap_size); + virtgpu_ioctl_gem_close(gpu, shmem->gem_handle); } struct vn_renderer_shmem * diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp index be54353ed3b1c..b595bb735a9f9 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -208,8 +208,7 @@ virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev) drmFreeVersion(version); - if (VN_DEBUG(INIT)) - vn_log(gpu->instance, "using DRM device %s", node_path); + INFO(gpu->instance, "using DRM device %s", node_path); return APIR_SUCCESS; } diff --git a/prepare.backend.sh b/prepare.backend.sh index a51f2465b6733..76e30fe31cfa4 100755 --- a/prepare.backend.sh +++ b/prepare.backend.sh @@ -1,5 +1,6 @@ -cmake -S . -B ../build.remoting-backend \ +cmake -S . -B ../build.remoting-backend-prod \ -DGGML_REMOTINGBACKEND=ON \ -DGGML_NATIVE=OFF \ - -DCMAKE_BUILD_TYPE=Debug \ "$@" + +# -DCMAKE_BUILD_TYPE=Debug \ diff --git a/run.remoting.sh b/run.remoting.sh index 00253e8f818f1..9a2a77f054210 100755 --- a/run.remoting.sh +++ b/run.remoting.sh @@ -1,17 +1,52 @@ #! 
/bin/bash -clear +#clear if [[ ${1:-} == "gdb" ]]; then prefix="gdb --args" else prefix="" fi -MODEL="$HOME/models/llama3.2" -#PROMPT="say nothing" -PROMPT="tell what's Apple metal API" -$prefix \ - ../build.remoting-frontend/bin/llama-run \ - --ngl 99 \ - --verbose \ - "$MODEL" \ - "$PROMPT" +if [[ "${PERF_MODE:-}" ]]; then + FLAVOR="-prod" +else + FLAVOR="" +fi + +MODEL=${MODEL:-llama3.2} + +if [[ "$FLAVOR" == "-prod" ]]; then + cat < Date: Thu, 22 May 2025 11:27:53 +0200 Subject: [PATCH 080/117] remotingfrontend: always prepare a shared memory for data --- .../virtgpu-forward-backend.cpp | 18 +++++++--- .../virtgpu-forward-buffer.cpp | 33 ++++++++++++++----- ggml/src/ggml-remotingfrontend/virtgpu.cpp | 11 +++++-- ggml/src/ggml-remotingfrontend/virtgpu.h | 1 + 4 files changed, 47 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp index 51399edfd1dbc..61c7fc7ac9839 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp @@ -12,10 +12,17 @@ apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph) { std::vector cgraph_data; size_t cgraph_size = vn_serialize_ggml_cgraph(cgraph, cgraph_data); - struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, cgraph_size); - if (!shmem) { - FATAL("Couldn't allocate the guest-host shared buffer for passing the cgraph :/"); + struct vn_renderer_shmem *shmem; + if (cgraph_size > gpu->data_shmem->mmap_size) { + shmem = virtgpu_shmem_create(gpu, cgraph_size); + WARNING("%s: 0x%lx | %dkB | %dMB", __func__, cgraph_size, (int)cgraph_size/1024, (int)cgraph_size/1024/1024); + if (!shmem) { + FATAL("Couldn't allocate the guest-host shared buffer :/"); + } + } else { + shmem = gpu->data_shmem; } + //INFO("Send shmem ID %d", shmem->res_id); vn_encode_virtgpu_shmem_res_id(encoder, shmem->res_id); //INFO("Send shmem size %lu", cgraph_size); @@ -34,7 +41,8 @@ apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph) { REMOTE_CALL_FINISH(gpu, encoder, decoder); - virtgpu_shmem_destroy(gpu, shmem->shmem); - + if (shmem != gpu->data_shmem) { + virtgpu_shmem_destroy(gpu, shmem->shmem); + } return status; } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp index 83f402bdd0dd4..f7c88a3634e87 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp @@ -36,9 +36,15 @@ apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); vn_encode_ggml_tensor(encoder, tensor); - struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, size); - if (!shmem) { - FATAL("Couldn't allocate the guest-host shared buffer :/"); + struct vn_renderer_shmem *shmem; + if (size > gpu->data_shmem->mmap_size) { + shmem = virtgpu_shmem_create(gpu, size); + WARNING("%s: 0x%lx | %dkB | %dMB", __func__, size, (int)size/1024, (int)size/1024/1024); + if (!shmem) { + FATAL("Couldn't allocate the guest-host shared buffer :/"); + } + } else { + shmem = gpu->data_shmem; } memcpy(shmem->mmap_ptr, data, size); @@ -51,7 +57,9 @@ apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, REMOTE_CALL_FINISH(gpu, encoder, decoder); - virtgpu_shmem_destroy(gpu, shmem->shmem); + if (shmem != gpu->data_shmem) { + virtgpu_shmem_destroy(gpu, 
shmem->shmem); + } return; } @@ -67,10 +75,17 @@ apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); vn_encode_ggml_tensor(encoder, tensor); - struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, size); - if (!shmem) { - FATAL("Couldn't allocate the guest-host shared buffer :/"); + struct vn_renderer_shmem *shmem; + if (size > gpu->data_shmem->mmap_size) { + shmem = virtgpu_shmem_create(gpu, size); + WARNING("%s: 0x%lx | %dkB | %dMB", __func__, size, (int)size/1024, (int)size/1024/1024); + if (!shmem) { + FATAL("Couldn't allocate the guest-host shared buffer :/"); + } + } else { + shmem = gpu->data_shmem; } + vn_encode_virtgpu_shmem_res_id(encoder, shmem->res_id); vn_encode_size_t(encoder, &offset); vn_encode_size_t(encoder, &size); @@ -81,7 +96,9 @@ apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, REMOTE_CALL_FINISH(gpu, encoder, decoder); - virtgpu_shmem_destroy(gpu, shmem->shmem); + if (shmem != gpu->data_shmem) { + virtgpu_shmem_destroy(gpu, shmem->shmem); + } } void diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp index b595bb735a9f9..ec9813815cc90 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -77,10 +77,15 @@ create_virtgpu() { virtgpu_init_shmem_blob_mem(gpu); - gpu->reply_shmem = virtgpu_shmem_create(gpu, 16384); + gpu->reply_shmem = virtgpu_shmem_create(gpu, 0x4000); + gpu->data_shmem = virtgpu_shmem_create(gpu, 0x13b0000); // 19MiB if (!gpu->reply_shmem) { - FATAL("%s: failed to create the reply shared memory page :/", __func__); + FATAL("%s: failed to create the shared reply memory pages :/", __func__); + } + + if (!gpu->data_shmem) { + FATAL("%s: failed to create the shared data memory pages :/", __func__); } struct vn_cs_encoder *encoder; @@ -208,7 +213,7 @@ virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev) drmFreeVersion(version); - INFO(gpu->instance, "using DRM device %s", node_path); + INFO("using DRM device %s", node_path); return APIR_SUCCESS; } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.h b/ggml/src/ggml-remotingfrontend/virtgpu.h index 5ab934ec7fb78..26933c8a6eda4 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu.h @@ -88,6 +88,7 @@ struct virtgpu { /* KP */ struct vn_renderer_shmem *reply_shmem; + struct vn_renderer_shmem *data_shmem; }; From 03935ee27c0cfc0bf0ca1a40bbe86939ee1e01f5 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 22 May 2025 16:40:21 +0200 Subject: [PATCH 081/117] remoting: release device buffers on exit --- .../backend-dispatched-buffer-type.cpp | 1 + .../backend-dispatched-buffer.cpp | 5 +++++ .../ggml-remotingbackend/backend-dispatched.cpp | 4 ++++ ggml/src/ggml-remotingbackend/backend.cpp | 10 ++++++++++ .../ggml-remotingbackend/shared/apir_backend.h | 2 +- .../shared/venus_cs_ggml-rpc.h | 2 ++ .../venus_cs_ggml-rpc-back.cpp | 16 ++++++++++++++++ .../virtgpu-forward-buffer.cpp | 2 +- ggml/src/ggml-remotingfrontend/virtgpu.cpp | 2 +- 9 files changed, 41 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp index 8c3349a367dfc..a796e9c1114a7 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp @@ -66,6 +66,7 @@ 
backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder size_t size; vn_decode_size_t(dec, &size); + WARNING("NEED TO ALLOCATE FROM PTR INSTEAD"); ggml_backend_buffer_t buffer = buft->iface.alloc_buffer(buft, size); apir_buffer_handle_t *buffer_handle = (apir_buffer_handle_t *) buffer; vn_encode_ggml_buffer_handle(enc, buffer_handle); diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp index 782391f8ae4c1..ea9f31ad1a634 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp @@ -123,6 +123,11 @@ backend_buffer_free_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, ggml_backend_buffer_t buffer; buffer = vn_decode_ggml_buffer(dec); + if (!untrack_backend_buffer(buffer)) { + WARNING("%s: unknown buffer %p", (void *) buffer); + return 1; + } + buffer->iface.free_buffer(buffer); return 0; diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp index 73be488e6c0f7..6781e108200c2 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp @@ -35,5 +35,9 @@ uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_ba return APIR_BACKEND_INITIALIZE_BACKEND_FAILED; } + size_t free, total; + dev->iface.get_memory(dev, &free, &total); + WARNING("%s: free memory: %ld MB\n", __func__, (size_t) free/1024/1024); + return APIR_BACKEND_INITIALIZE_SUCCESSS; } diff --git a/ggml/src/ggml-remotingbackend/backend.cpp b/ggml/src/ggml-remotingbackend/backend.cpp index c9d784941d514..22a60681d4447 100644 --- a/ggml/src/ggml-remotingbackend/backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend.cpp @@ -18,6 +18,16 @@ static void *backend_library_handle = NULL; extern "C" { void apir_backend_deinit(void) { + auto buffers = get_track_backend_buffers(); + for (const auto& buffer: buffers) { + untrack_backend_buffer(buffer); + buffer->iface.free_buffer(buffer); + } + + size_t free, total; + dev->iface.get_memory(dev, &free, &total); + WARNING("%s: free memory: %ld MB\n", __func__, (size_t) free/1024/1024); + if (backend_library_handle) { INFO("%s: The GGML backend library was loaded. 
Unloading it.", __func__); dlclose(backend_library_handle); diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 1f39d063f8468..8ab79b4cbe39f 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -1,6 +1,6 @@ #pragma once -#define APIR_LIBRARY_PATH "/Users/kevinpouget/remoting/llama_cpp/build.remoting-backend/bin/libggml-remotingbackend.dylib" +#define APIR_LIBRARY_PATH "/Users/kevinpouget/remoting/llama_cpp/build.remoting-backend-prod/bin/libggml-remotingbackend.dylib" #define APIR_INITIALIZE_FCT_NAME "apir_backend_initialize" #define APIR_DEINIT_FCT_NAME "apir_backend_deinit" #define APIR_DISPATCH_FCT_NAME "apir_backend_dispatcher" diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.h index a50405a479221..96402287af7fc 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.h @@ -30,6 +30,8 @@ void serialize_graph(const ggml_cgraph * cgraph, std::vector & output); /* backend */ void track_backend_buffer(ggml_backend_buffer_t buffer); +bool untrack_backend_buffer(ggml_backend_buffer_t buffer); +std::unordered_set get_track_backend_buffers(); void add_tensor(ggml_tensor * tensor, std::vector & tensors, std::unordered_set & visited); diff --git a/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp b/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp index 663160f48f061..58a142ae93d5b 100644 --- a/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp +++ b/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp @@ -14,6 +14,22 @@ track_backend_buffer(ggml_backend_buffer_t buffer) { backend_buffers.insert(buffer); } +bool +untrack_backend_buffer(ggml_backend_buffer_t buffer) { + auto it = backend_buffers.find(buffer); + if (it == backend_buffers.end()) { + return false; + } + + backend_buffers.erase(it); + return true; +} + +std::unordered_set +get_track_backend_buffers() { + return backend_buffers; +} + ggml_tensor * deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor) { ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type, diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp index f7c88a3634e87..7452dd48ad4ea 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp @@ -39,7 +39,7 @@ apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, struct vn_renderer_shmem *shmem; if (size > gpu->data_shmem->mmap_size) { shmem = virtgpu_shmem_create(gpu, size); - WARNING("%s: 0x%lx | %dkB | %dMB", __func__, size, (int)size/1024, (int)size/1024/1024); + //WARNING("%s: 0x%lx | %dkB | %dMB", __func__, size, (int)size/1024, (int)size/1024/1024); if (!shmem) { FATAL("Couldn't allocate the guest-host shared buffer :/"); } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp index ec9813815cc90..39ed7b3a99f95 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -78,7 +78,7 @@ create_virtgpu() { virtgpu_init_shmem_blob_mem(gpu); gpu->reply_shmem = virtgpu_shmem_create(gpu, 0x4000); - gpu->data_shmem = virtgpu_shmem_create(gpu, 0x13b0000); // 19MiB + gpu->data_shmem = 
virtgpu_shmem_create(gpu, 0x1830000); // 24MiB if (!gpu->reply_shmem) { FATAL("%s: failed to create the shared reply memory pages :/", __func__); From 67d405d73da1d24d652e7d50ffb62d92381ea3d1 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 26 May 2025 10:02:52 +0200 Subject: [PATCH 082/117] remoting: refactor the buffer context --- .../ggml-remotingbackend/backend-convert.h | 6 ++- .../backend-dispatched-buffer-type.cpp | 26 ++++++++++--- .../shared/apir_backend.h | 11 +++++- .../ggml-remotingbackend/shared/venus_cs.h | 10 ++--- .../shared/venus_cs_ggml.h | 9 +++-- .../ggml-backend-buffer-type.cpp | 2 +- .../ggml-backend-buffer.cpp | 10 ++--- .../src/ggml-remotingfrontend/ggml-remoting.h | 18 +++------ .../venus_cs_ggml-rpc-front.cpp | 2 +- .../virtgpu-forward-buffer-type.cpp | 31 ++++++++++----- .../virtgpu-forward-buffer.cpp | 38 +++++++++++++------ .../ggml-remotingfrontend/virtgpu-forward.h | 14 +++---- 12 files changed, 112 insertions(+), 65 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-convert.h b/ggml/src/ggml-remotingbackend/backend-convert.h index e7d875cde7ee8..4b56a222f02da 100644 --- a/ggml/src/ggml-remotingbackend/backend-convert.h +++ b/ggml/src/ggml-remotingbackend/backend-convert.h @@ -1,7 +1,9 @@ #include "shared/apir_backend.h" -static inline apir_buffer_handle_t +#define BUFFER_TO_HOST_HANDLE(name) ggml_buffer_to_apir_handle(name) + +static inline apir_buffer_host_handle_t ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) { // in the backend, the buffer handle is the buffer pointer - return (apir_buffer_handle_t) buffer; + return (apir_buffer_host_handle_t) buffer; } diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp index a796e9c1114a7..0f577da1f7711 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp @@ -60,16 +60,32 @@ backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec uint32_t backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { UNUSED(ctx); +#if APIR_ALLOC_FROM_HOST_PTR + uint32_t shmem_res_id; + vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); + + void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id); + if (!shmem_data) { + FATAL("Couldn't get the shmem addr from virgl :/"); + } +#else ggml_backend_buffer_type_t buft; buft = vn_decode_ggml_buft(dec); - +#endif size_t size; vn_decode_size_t(dec, &size); - WARNING("NEED TO ALLOCATE FROM PTR INSTEAD"); - ggml_backend_buffer_t buffer = buft->iface.alloc_buffer(buft, size); - apir_buffer_handle_t *buffer_handle = (apir_buffer_handle_t *) buffer; - vn_encode_ggml_buffer_handle(enc, buffer_handle); + ggml_backend_buffer_t buffer; +#if APIR_ALLOC_FROM_HOST_PTR + WARNING("USING FROM_HOST_PTR\n\n"); + buffer = dev->iface.buffer_from_host_ptr(dev, shmem_data, size, size); +#else + WARNING("USING ALLOC_BUFFER"); + buffer = buft->iface.alloc_buffer(buft, size); + WARNING("USING ALLOC_BUFFER--> %p", buffer); +#endif + + vn_encode_ggml_buffer(enc, buffer); if (buffer) { track_backend_buffer(buffer); diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 8ab79b4cbe39f..c9d1b71af95b9 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -14,8 +14,17 @@ 
#define APIR_BACKEND_FORWARD_INDEX_INVALID 6 +#define APIR_ALLOC_FROM_HOST_PTR 0 + typedef uintptr_t apir_buffer_type_handle_t; -typedef uintptr_t apir_buffer_handle_t; +typedef uintptr_t apir_buffer_host_handle_t; + +typedef struct { + apir_buffer_host_handle_t host_handle; +#if APIR_ALLOC_FROM_HOST_PTR + struct vn_renderer_shmem *shmem; +#endif +} apir_buffer_context_t; typedef uint32_t (*apir_backend_initialize_t)(void); typedef void (*apir_backend_deinit_t)(void); diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h index 2c8723fbfe1a6..81dd5b8fb17ca 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -525,18 +525,18 @@ vn_decode_apir_buffer_type_handle_t(struct vn_cs_decoder *dec, apir_buffer_type_ vn_decode(dec, sizeof(apir_buffer_type_handle_t), val, sizeof(apir_buffer_type_handle_t)); } -/* apir_buffer_handle_t */ +/* apir_buffer_host_handle_t */ static inline void -vn_encode_apir_buffer_handle_t(struct vn_cs_encoder *enc, const apir_buffer_handle_t *val) +vn_encode_apir_buffer_host_handle_t(struct vn_cs_encoder *enc, const apir_buffer_host_handle_t *val) { - vn_encode(enc, sizeof(apir_buffer_handle_t), val, sizeof(apir_buffer_handle_t)); + vn_encode(enc, sizeof(apir_buffer_host_handle_t), val, sizeof(apir_buffer_host_handle_t)); } static inline void -vn_decode_apir_buffer_handle_t(struct vn_cs_decoder *dec, apir_buffer_handle_t *val) +vn_decode_apir_buffer_host_handle_t(struct vn_cs_decoder *dec, apir_buffer_host_handle_t *val) { - vn_decode(dec, sizeof(apir_buffer_handle_t), val, sizeof(apir_buffer_handle_t)); + vn_decode(dec, sizeof(apir_buffer_host_handle_t), val, sizeof(apir_buffer_host_handle_t)); } /* uintptr_t */ diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h index c32ac91650e4d..e0844113d9eb0 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h @@ -4,10 +4,10 @@ #include "venus_cs_ggml-rpc.h" // needs -// ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer); +// ggml_buffer_to_apir_host_handle(ggml_backend_buffer_t buffer); static inline void -vn_encode_ggml_buffer_handle(struct vn_cs_encoder *enc, const apir_buffer_handle_t *handle); +vn_encode_ggml_buffer_host_handle(struct vn_cs_encoder *enc, const apir_buffer_host_handle_t *handle); static inline ggml_backend_buffer_t vn_decode_ggml_buffer(struct vn_cs_decoder *dec); @@ -86,8 +86,9 @@ vn_decode_ggml_buft(struct vn_cs_decoder *dec) { // same logic as for ggml_backend_buffer_type_t static inline void -vn_encode_ggml_buffer_handle(struct vn_cs_encoder *enc, const apir_buffer_handle_t *handle) { - vn_cs_encoder_write(enc, sizeof(*handle), &handle, sizeof(*handle)); +vn_encode_ggml_buffer(struct vn_cs_encoder *enc, const ggml_backend_buffer_t buffer) { + apir_buffer_host_handle_t handle = BUFFER_TO_HOST_HANDLE(buffer); + vn_cs_encoder_write(enc, sizeof(handle), &handle, sizeof(handle)); } static inline ggml_backend_buffer_t diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp index 6343ce50b88a3..775238d501374 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp @@ -14,7 +14,7 @@ ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, } context->gpu = gpu; - 
context->handle = apir_buffer_type_alloc_buffer(gpu, buft, size); + context->apir_context = apir_buffer_type_alloc_buffer(gpu, buft, size); ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) context, size); diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index 1f2db27c6c472..99bdbdaca2275 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -8,7 +8,7 @@ static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer struct virtgpu *gpu = BUFFER_TO_GPU(buffer); - return apir_buffer_get_base(gpu, BUFFER_TO_HANDLE(buffer)); + return apir_buffer_get_base(gpu, BUFFER_TO_APIR_CONTEXT(buffer)); } static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { @@ -38,7 +38,7 @@ static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer } INFO("\n"); #endif - apir_buffer_set_tensor(gpu, BUFFER_TO_HANDLE(buffer), tensor, data, offset, size); + apir_buffer_set_tensor(gpu, BUFFER_TO_APIR_CONTEXT(buffer), tensor, data, offset, size); return; } @@ -47,7 +47,7 @@ static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer IMPLEMENTED_ONCE; struct virtgpu *gpu = BUFFER_TO_GPU(buffer); - apir_buffer_get_tensor(gpu, BUFFER_TO_HANDLE(buffer), tensor, data, offset, size); + apir_buffer_get_tensor(gpu, BUFFER_TO_APIR_CONTEXT(buffer), tensor, data, offset, size); } @@ -68,7 +68,7 @@ static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uin struct virtgpu *gpu = BUFFER_TO_GPU(buffer); - apir_buffer_clear(gpu, BUFFER_TO_HANDLE(buffer), value); + apir_buffer_clear(gpu, BUFFER_TO_APIR_CONTEXT(buffer), value); return; } @@ -80,7 +80,7 @@ static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffe struct virtgpu *gpu = BUFFER_TO_GPU(buffer); - apir_buffer_free_buffer(gpu, BUFFER_TO_HANDLE(buffer)); + apir_buffer_free_buffer(gpu, BUFFER_TO_APIR_CONTEXT(buffer)); } const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index e13d16b4ad799..0ffee92f0ec8a 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -13,8 +13,11 @@ #define DEV_TO_GPU(name) \ ((struct ggml_backend_remoting_device_context *) (name)->context)->gpu -#define BUFFER_TO_HANDLE(name) \ - ((struct ggml_backend_remoting_buffer_context *) (name)->context)->handle +#define BUFFER_TO_APIR_CONTEXT(name) \ + &((struct ggml_backend_remoting_buffer_context *) (name)->context)->apir_context + +#define BUFFER_TO_HOST_HANDLE(name) \ + ((struct ggml_backend_remoting_buffer_context *) (name)->context)->apir_context.host_handle #define GET_DEVICE_CONTEXT() \ (struct ggml_backend_remoting_device_context *) ggml_backend_remoting_get_device(0)->context \ @@ -76,20 +79,11 @@ struct ggml_backend_remoting_device_context { }; struct ggml_backend_remoting_buffer_context { - apir_buffer_handle_t handle; + apir_buffer_context_t apir_context; struct virtgpu *gpu; }; -static inline apir_buffer_handle_t ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) { - struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) buffer->context; - - if 
(!context) { - return 0; - } - return context->handle; -} - extern const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface; extern const struct ggml_backend_device_i ggml_backend_remoting_device_interface; extern const ggml_backend_buffer_type_i ggml_backend_remoting_host_buffer_type_interface; diff --git a/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp b/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp index d9b43f0222705..bc4b96b84f365 100644 --- a/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp +++ b/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp @@ -17,7 +17,7 @@ serialize_tensor(const ggml_tensor * tensor) { if (tensor->buffer) { ggml_backend_buffer_t buffer = tensor->buffer; - result.buffer = BUFFER_TO_HANDLE(buffer); + result.buffer = BUFFER_TO_HOST_HANDLE(buffer); if (result.buffer < 0x600000000000 || result.buffer > 0x700000000000) { INFO("pass buffer handle %p", result.buffer); BREAKPOINT; diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp index 645780715a133..f43c1851da797 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp @@ -8,7 +8,7 @@ apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME); apir_buffer_type_handle_t handle = (apir_buffer_type_handle_t) buft->context; - vn_encode_apir_buffer_handle_t(encoder, &handle); + vn_encode_apir_buffer_type_handle_t(encoder, &handle); REMOTE_CALL(gpu, encoder, decoder); @@ -36,7 +36,7 @@ apir_buffer_type_get_alignment(struct virtgpu *gpu, ggml_backend_buffer_type_t b REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT); apir_buffer_type_handle_t handle = (apir_buffer_type_handle_t) buft->context; - vn_encode_apir_buffer_handle_t(encoder, &handle); + vn_encode_apir_buffer_type_handle_t(encoder, &handle); REMOTE_CALL(gpu, encoder, decoder); @@ -58,7 +58,7 @@ apir_buffer_type_get_max_size(struct virtgpu *gpu, ggml_backend_buffer_type_t bu REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE); apir_buffer_type_handle_t handle = (apir_buffer_type_handle_t) buft->context; - vn_encode_apir_buffer_handle_t(encoder, &handle); + vn_encode_apir_buffer_type_handle_t(encoder, &handle); REMOTE_CALL(gpu, encoder, decoder); @@ -80,7 +80,7 @@ apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST); apir_buffer_type_handle_t handle = (apir_buffer_type_handle_t) buft->context; - vn_encode_apir_buffer_handle_t(encoder, &handle); + vn_encode_apir_buffer_type_handle_t(encoder, &handle); REMOTE_CALL(gpu, encoder, decoder); @@ -94,26 +94,37 @@ apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { return is_host; } -apir_buffer_handle_t +apir_buffer_context_t apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_backend_buffer_type_t buft, size_t size) { struct vn_cs_encoder *encoder; struct vn_cs_decoder *decoder; + apir_buffer_context_t buffer_context; INFO("%s: allocate device memory (%lu)", __func__, size); REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER); - apir_buffer_type_handle_t buft_handle = (apir_buffer_type_handle_t) buft->context; - vn_encode_apir_buffer_handle_t(encoder, &buft_handle); +#if 
APIR_ALLOC_FROM_HOST_PTR + UNUSED(buft); + + buffer_context.shmem = virtgpu_shmem_create(gpu, size); + //WARNING("%s: 0x%lx | %dkB | %dMB", __func__, size, (int)size/1024, (int)size/1024/1024); + if (!buffer_context.shmem) { + FATAL("Couldn't allocate the guest-host shared buffer :/"); + } + vn_encode_virtgpu_shmem_res_id(encoder, buffer_context.shmem->res_id); +#else + apir_buffer_type_handle_t buft_handle = (apir_buffer_type_handle_t) buft->context; + vn_encode_apir_buffer_type_handle_t(encoder, &buft_handle); +#endif vn_encode_size_t(encoder, &size); REMOTE_CALL(gpu, encoder, decoder); - apir_buffer_handle_t buffer_handle; - vn_decode_apir_buffer_handle_t(decoder, &buffer_handle); + vn_decode_apir_buffer_host_handle_t(decoder, &buffer_context.host_handle); REMOTE_CALL_FINISH(gpu, encoder, decoder); - return buffer_handle; + return buffer_context; } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp index 7452dd48ad4ea..18b010583fa6d 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp @@ -1,13 +1,13 @@ #include "virtgpu-forward-impl.h" void * -apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle) { +apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_context_t *buffer_context) { struct vn_cs_encoder *encoder; struct vn_cs_decoder *decoder; REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_GET_BASE); - vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); + vn_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); REMOTE_CALL(gpu, encoder, decoder); @@ -22,18 +22,18 @@ apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle) { } void -apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, +apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, ggml_tensor *tensor, const void *data, size_t offset, size_t size) { struct vn_cs_encoder *encoder; struct vn_cs_decoder *decoder; #if 0 INFO("Calling (%p)->set_tensor(tensor=%p, data=%p, offset=%lu, size=%lu", - buffer_handle, tensor, data, offset, size); + buffer_context->host_handle, tensor, data, offset, size); #endif REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_SET_TENSOR); - vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); + vn_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); vn_encode_ggml_tensor(encoder, tensor); struct vn_renderer_shmem *shmem; @@ -64,15 +64,26 @@ apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, return; } +#if APIR_ALLOC_FROM_HOST_PTR void -apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, +apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, + const ggml_tensor *tensor, void *data, size_t offset, size_t size) { + UNUSED(gpu); + UNUSED(tensor); + char *buffer_base_addr = (char *) buffer_context->shmem->mmap_ptr; + + memcpy(data, buffer_base_addr+offset, size); +} +#else +void +apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, const ggml_tensor *tensor, void *data, size_t offset, size_t size) { struct vn_cs_encoder *encoder; struct vn_cs_decoder *decoder; REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_GET_TENSOR); - vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); + vn_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); 
vn_encode_ggml_tensor(encoder, tensor); struct vn_renderer_shmem *shmem; @@ -100,16 +111,17 @@ apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, virtgpu_shmem_destroy(gpu, shmem->shmem); } } +#endif void -apir_buffer_clear(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, +apir_buffer_clear(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, uint8_t value) { struct vn_cs_encoder *encoder; struct vn_cs_decoder *decoder; REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_CLEAR); - vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); + vn_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); vn_encode_uint8_t(encoder, &value); REMOTE_CALL(gpu, encoder, decoder); @@ -119,15 +131,17 @@ apir_buffer_clear(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, void -apir_buffer_free_buffer(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle) { +apir_buffer_free_buffer(struct virtgpu *gpu, apir_buffer_context_t *buffer_context) { struct vn_cs_encoder *encoder; struct vn_cs_decoder *decoder; REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER); - vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); + vn_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); REMOTE_CALL(gpu, encoder, decoder); - +#if APIR_ALLOC_FROM_HOST_PTR + virtgpu_shmem_destroy(gpu, buffer_context->shmem->shmem); +#endif REMOTE_CALL_FINISH(gpu, encoder, decoder); } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index 15885dfc12304..0429c5b757a18 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -23,19 +23,19 @@ const char *apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_t size_t apir_buffer_type_get_alignment(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); size_t apir_buffer_type_get_max_size(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); bool apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); -apir_buffer_handle_t apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_backend_buffer_type_t buffer_buft, size_t size); +apir_buffer_context_t apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_backend_buffer_type_t buffer_buft, size_t size); /* buffer */ -void *apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle); -enum ggml_status apir_buffer_init_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, ggml_tensor *tensor); -void apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, +void *apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_context_t *buffer_context); +enum ggml_status apir_buffer_init_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, ggml_tensor *tensor); +void apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, ggml_tensor *tensor, const void *data, size_t offset, size_t size); -void apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, +void apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, const ggml_tensor *tensor, void *data, size_t offset, size_t size); -void apir_buffer_clear(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, +void apir_buffer_clear(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, uint8_t value); -void apir_buffer_free_buffer(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle); +void 
apir_buffer_free_buffer(struct virtgpu *gpu, apir_buffer_context_t *buffer_context); /* backend */ From c5d44f95c00614fab4a79cd5c4d016ee7cdcdfa5 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 26 May 2025 11:40:49 +0200 Subject: [PATCH 083/117] remoting: exchange more data --- .../ggml-remotingbackend/backend-convert.h | 6 ++++++ .../backend-dispatched-buffer-type.cpp | 15 +++++++------ .../backend-dispatched-device.cpp | 3 +-- .../shared/apir_backend.h | 5 +++-- .../ggml-remotingbackend/shared/venus_cs.h | 10 ++++----- .../shared/venus_cs_ggml.h | 18 ++++++++++++---- .../ggml-backend-device.cpp | 2 +- .../src/ggml-remotingfrontend/ggml-remoting.h | 6 ++++++ .../virtgpu-forward-buffer-type.cpp | 21 +++++++++---------- .../virtgpu-forward-device.cpp | 6 +++--- .../ggml-remotingfrontend/virtgpu-forward.h | 2 +- 11 files changed, 59 insertions(+), 35 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-convert.h b/ggml/src/ggml-remotingbackend/backend-convert.h index 4b56a222f02da..b45c2784160ac 100644 --- a/ggml/src/ggml-remotingbackend/backend-convert.h +++ b/ggml/src/ggml-remotingbackend/backend-convert.h @@ -7,3 +7,9 @@ ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) { // in the backend, the buffer handle is the buffer pointer return (apir_buffer_host_handle_t) buffer; } + +static inline apir_buffer_type_host_handle_t +ggml_buffer_type_to_apir_handle(ggml_backend_buffer_type_t buft) { + // in the backend, the buffer handle is the buffer pointer + return (apir_buffer_type_host_handle_t) buft; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp index 0f577da1f7711..9ff2e79831f87 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp @@ -10,7 +10,7 @@ uint32_t backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { UNUSED(ctx); ggml_backend_buffer_type_t buft; - buft = vn_decode_ggml_buft(dec); + buft = vn_decode_ggml_buffer_type(dec); const char *string = buft->iface.get_name(buft); @@ -25,7 +25,7 @@ uint32_t backend_buffer_type_get_alignment(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { UNUSED(ctx); ggml_backend_buffer_type_t buft; - buft = vn_decode_ggml_buft(dec); + buft = vn_decode_ggml_buffer_type(dec); size_t value = buft->iface.get_alignment(buft); vn_encode_size_t(enc, &value); @@ -37,7 +37,7 @@ uint32_t backend_buffer_type_get_max_size(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { UNUSED(ctx); ggml_backend_buffer_type_t buft; - buft = vn_decode_ggml_buft(dec); + buft = vn_decode_ggml_buffer_type(dec); size_t value = buft->iface.get_max_size(buft); vn_encode_size_t(enc, &value); @@ -49,7 +49,7 @@ uint32_t backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { UNUSED(ctx); ggml_backend_buffer_type_t buft; - buft = vn_decode_ggml_buft(dec); + buft = vn_decode_ggml_buffer_type(dec); bool is_host = buft->iface.is_host(buft); vn_encode_bool_t(enc, &is_host); @@ -70,7 +70,7 @@ backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder } #else ggml_backend_buffer_type_t buft; - buft = vn_decode_ggml_buft(dec); + buft = vn_decode_ggml_buffer_type(dec); #endif size_t size; vn_decode_size_t(dec, &size); @@ -78,7 +78,10 @@ 
backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder ggml_backend_buffer_t buffer; #if APIR_ALLOC_FROM_HOST_PTR WARNING("USING FROM_HOST_PTR\n\n"); - buffer = dev->iface.buffer_from_host_ptr(dev, shmem_data, size, size); + #define MAX_TENSOR_SIZE 323205120 + buffer = dev->iface.buffer_from_host_ptr(dev, shmem_data, size, MAX_TENSOR_SIZE); + + vn_encode_ggml_buffer_type(enc, buffer->buft); #else WARNING("USING ALLOC_BUFFER"); buffer = buft->iface.alloc_buffer(buft, size); diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp index 863c2698779e7..18f0e0a81b6a6 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp @@ -89,8 +89,7 @@ backend_device_get_buffer_type(struct vn_cs_encoder *enc, struct vn_cs_decoder * ggml_backend_buffer_type_t bufft = dev->iface.get_buffer_type(dev); - apir_buffer_type_handle_t buft_handle = (apir_buffer_type_handle_t) bufft; - vn_encode_apir_buffer_type_handle_t(enc, &buft_handle); + vn_encode_ggml_buffer_type(enc, bufft); return 0; } diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index c9d1b71af95b9..6449ccc109146 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -14,15 +14,16 @@ #define APIR_BACKEND_FORWARD_INDEX_INVALID 6 -#define APIR_ALLOC_FROM_HOST_PTR 0 +#define APIR_ALLOC_FROM_HOST_PTR 1 -typedef uintptr_t apir_buffer_type_handle_t; +typedef uintptr_t apir_buffer_type_host_handle_t; typedef uintptr_t apir_buffer_host_handle_t; typedef struct { apir_buffer_host_handle_t host_handle; #if APIR_ALLOC_FROM_HOST_PTR struct vn_renderer_shmem *shmem; + apir_buffer_type_host_handle_t buft_host_handle; #endif } apir_buffer_context_t; diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h index 81dd5b8fb17ca..e67c99a46b5b6 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -511,18 +511,18 @@ vn_decode_bool_t(struct vn_cs_decoder *dec, bool *val) vn_decode(dec, sizeof(int), val, sizeof(bool)); } -/* apir_buffer_type_handle_t */ +/* apir_buffer_type_host_handle_t */ static inline void -vn_encode_apir_buffer_type_handle_t(struct vn_cs_encoder *enc, const apir_buffer_type_handle_t *val) +vn_encode_apir_buffer_type_host_handle_t(struct vn_cs_encoder *enc, const apir_buffer_type_host_handle_t *val) { - vn_encode(enc, sizeof(apir_buffer_type_handle_t), val, sizeof(apir_buffer_type_handle_t)); + vn_encode(enc, sizeof(apir_buffer_type_host_handle_t), val, sizeof(apir_buffer_type_host_handle_t)); } static inline void -vn_decode_apir_buffer_type_handle_t(struct vn_cs_decoder *dec, apir_buffer_type_handle_t *val) +vn_decode_apir_buffer_type_host_handle_t(struct vn_cs_decoder *dec, apir_buffer_type_host_handle_t *val) { - vn_decode(dec, sizeof(apir_buffer_type_handle_t), val, sizeof(apir_buffer_type_handle_t)); + vn_decode(dec, sizeof(apir_buffer_type_host_handle_t), val, sizeof(apir_buffer_type_host_handle_t)); } /* apir_buffer_host_handle_t */ diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h index e0844113d9eb0..71e15f847e851 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h @@ 
-67,19 +67,29 @@ vn_decode_ggml_tensor(struct vn_cs_decoder *dec) { static inline void -vn_encode_apir_buffer_type_handle_t(struct vn_cs_encoder *enc, apir_buffer_type_handle_t *handle) { - vn_cs_encoder_write(enc, sizeof(*handle), handle, sizeof(*handle)); +vn_encode_ggml_buffer_type(struct vn_cs_encoder *enc, ggml_backend_buffer_type_t buft) { + apir_buffer_type_host_handle_t handle = ggml_buffer_type_to_apir_handle(buft); + vn_cs_encoder_write(enc, sizeof(handle), &handle, sizeof(handle)); } static inline ggml_backend_buffer_type_t -vn_decode_ggml_buft(struct vn_cs_decoder *dec) { - apir_buffer_type_handle_t handle; +vn_decode_ggml_buffer_type(struct vn_cs_decoder *dec) { + apir_buffer_type_host_handle_t handle; vn_cs_decoder_read(dec, sizeof(handle), &handle, sizeof(handle)); return (ggml_backend_buffer_type_t) handle; } +static inline apir_buffer_type_host_handle_t +vn_decode_apir_buffer_type_host_handle(struct vn_cs_decoder *dec) { + apir_buffer_type_host_handle_t handle; + + vn_cs_decoder_read(dec, sizeof(handle), &handle, sizeof(handle)); + + return handle; +} + /* *** ggml_backend_type_t *** */ // ggml_backend_buffer_t is a POINTER. diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index 67294fcfdd5de..092c05b9e43f3 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -118,7 +118,7 @@ ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { struct virtgpu *gpu = DEV_TO_GPU(dev); - apir_buffer_type_handle_t ctx = apir_device_get_buffer_type(gpu); + apir_buffer_type_host_handle_t ctx = apir_device_get_buffer_type(gpu); static struct ggml_backend_buffer_type buft { /* .iface = */ ggml_backend_remoting_buffer_type_interface, diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index 0ffee92f0ec8a..71708e75a5d4a 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -22,6 +22,12 @@ #define GET_DEVICE_CONTEXT() \ (struct ggml_backend_remoting_device_context *) ggml_backend_remoting_get_device(0)->context \ +static inline apir_buffer_type_host_handle_t +ggml_buffer_type_to_apir_handle(ggml_backend_buffer_type_t buft) { + // in the backend, the buffer handle is the buffer pointer + return (apir_buffer_type_host_handle_t) buft->context; +} + #define NOT_IMPLEMENTED \ do { \ static bool first = true; \ diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp index f43c1851da797..4f7aac1360124 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp @@ -7,8 +7,7 @@ apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME); - apir_buffer_type_handle_t handle = (apir_buffer_type_handle_t) buft->context; - vn_encode_apir_buffer_type_handle_t(encoder, &handle); + vn_encode_ggml_buffer_type(encoder, buft); REMOTE_CALL(gpu, encoder, decoder); @@ -35,8 +34,7 @@ apir_buffer_type_get_alignment(struct virtgpu *gpu, ggml_backend_buffer_type_t b REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT); - apir_buffer_type_handle_t handle = (apir_buffer_type_handle_t) buft->context; - vn_encode_apir_buffer_type_handle_t(encoder, &handle); + 
vn_encode_ggml_buffer_type(encoder, buft); REMOTE_CALL(gpu, encoder, decoder); @@ -57,8 +55,7 @@ apir_buffer_type_get_max_size(struct virtgpu *gpu, ggml_backend_buffer_type_t bu REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE); - apir_buffer_type_handle_t handle = (apir_buffer_type_handle_t) buft->context; - vn_encode_apir_buffer_type_handle_t(encoder, &handle); + vn_encode_ggml_buffer_type(encoder, buft); REMOTE_CALL(gpu, encoder, decoder); @@ -79,8 +76,7 @@ apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST); - apir_buffer_type_handle_t handle = (apir_buffer_type_handle_t) buft->context; - vn_encode_apir_buffer_type_handle_t(encoder, &handle); + vn_encode_ggml_buffer_type(encoder, buft); REMOTE_CALL(gpu, encoder, decoder); @@ -115,15 +111,18 @@ apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_backend_buffer_type_t bu vn_encode_virtgpu_shmem_res_id(encoder, buffer_context.shmem->res_id); #else - apir_buffer_type_handle_t buft_handle = (apir_buffer_type_handle_t) buft->context; - vn_encode_apir_buffer_type_handle_t(encoder, &buft_handle); + vn_encode_ggml_buffer_type(encoder, buft); #endif vn_encode_size_t(encoder, &size); REMOTE_CALL(gpu, encoder, decoder); - vn_decode_apir_buffer_host_handle_t(decoder, &buffer_context.host_handle); +#if APIR_ALLOC_FROM_HOST_PTR + buffer_context.buft_host_handle = vn_decode_apir_buffer_type_host_handle(decoder); +#endif + vn_decode_apir_buffer_host_handle_t(decoder, &buffer_context.host_handle); + REMOTE_CALL_FINISH(gpu, encoder, decoder); return buffer_context; diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp index 5ee2c01dd50ab..ffc6febf4cab0 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp @@ -160,7 +160,7 @@ apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op) { #endif } -apir_buffer_type_handle_t +apir_buffer_type_host_handle_t apir_device_get_buffer_type(struct virtgpu *gpu) { struct vn_cs_encoder *encoder; struct vn_cs_decoder *decoder; @@ -169,8 +169,8 @@ apir_device_get_buffer_type(struct virtgpu *gpu) { REMOTE_CALL(gpu, encoder, decoder); - apir_buffer_type_handle_t buft_handle; - vn_decode_apir_buffer_type_handle_t(decoder, &buft_handle); + apir_buffer_type_host_handle_t buft_handle; + vn_decode_apir_buffer_type_host_handle_t(decoder, &buft_handle); REMOTE_CALL_FINISH(gpu, encoder, decoder); diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index 0429c5b757a18..1f03f8bf31725 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -11,7 +11,7 @@ const char *apir_device_get_description(struct virtgpu *gpu); uint32_t apir_device_get_type(struct virtgpu *gpu); void apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total); bool apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op); -apir_buffer_type_handle_t apir_device_get_buffer_type(struct virtgpu *gpu); +apir_buffer_type_host_handle_t apir_device_get_buffer_type(struct virtgpu *gpu); void apir_device_get_props(struct virtgpu *gpu, bool *async, bool *host_buffer, From 83596a25899800d89152b244c462d52cca172936 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 26 May 2025 11:41:27 +0200 Subject: [PATCH 084/117] 
podman_compile: pass the PERF_MODE flag to the container --- podman_compile.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/podman_compile.sh b/podman_compile.sh index 4793b4ce20fa2..ec243f75ee89f 100755 --- a/podman_compile.sh +++ b/podman_compile.sh @@ -29,6 +29,7 @@ podman run \ --cgroupns host \ --security-opt label=disable \ --env HOME="$HOME" \ +--env PERF_MODE="${PERF_MODE:-}" \ -v "$HOME":"$HOME":Z \ -w "$PWD" \ -it --rm \ From 6b4bc18b295ee892a30228566392e33083a14644 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 27 May 2025 15:06:05 +0200 Subject: [PATCH 085/117] examples: run: run: measure the generation throughput --- examples/run/run.cpp | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/examples/run/run.cpp b/examples/run/run.cpp index 68e94b0b3c3f8..7c830255d0cee 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -965,6 +965,36 @@ static void print_word_and_concatenate_to_response(const std::string & piece, st response += piece; } +static long long timer_start = 0; +static long long timer_total = 0; +static long long timer_count = 0; + +static inline void start_timer(void) { + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); // Use CLOCK_MONOTONIC for elapsed time + timer_start = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec; +} + +static inline void stop_timer(void) { + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); // Use CLOCK_MONOTONIC for elapsed time + long long timer_end = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec; + + timer_total += (timer_end - timer_start); + timer_count += 1; +} + +static void show_timer(void) { + //printe("[%15lld] ns\n", timer_total); + long long ms = timer_total/1000000; + long long itl = ms/timer_count; + float speed = 1/((float)itl) * 1000; + printe("INFO: generate: [%7lld] ms for %lld invokations | ITL %lldms | throughput = %.2f t/s\n", timer_total/1000000, timer_count, itl, speed); + + printe("INFO: generate: [%7lld] s\n", timer_total/1000000/1000); +} + + // helper function to evaluate a prompt and generate a response static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) { const llama_vocab * vocab = llama_model_get_vocab(llama_data.model.get()); @@ -974,10 +1004,15 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str return 1; } + int cr = atexit(show_timer); + assert(cr == 0); + // prepare a batch for the prompt llama_batch batch = llama_batch_get_one(tokens.data(), tokens.size()); llama_token new_token_id; + while (true) { + start_timer(); check_context_size(llama_data.context, batch); if (llama_decode(llama_data.context.get(), batch)) { printe("failed to decode\n"); @@ -999,6 +1034,7 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str // prepare the next batch with the sampled token batch = llama_batch_get_one(&new_token_id, 1); + stop_timer(); } printf(LOG_COL_DEFAULT); From 9ab699dbd803af48ad3d54457737be3e9ecf6bd1 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 27 May 2025 15:07:02 +0200 Subject: [PATCH 086/117] examples: run: run: stop after 25 tokens --- examples/run/run.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/examples/run/run.cpp b/examples/run/run.cpp index 7c830255d0cee..42db6ef659980 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -1011,7 +1011,14 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str llama_batch batch = llama_batch_get_one(tokens.data(), 
tokens.size()); llama_token new_token_id; + int count = 0; while (true) { +#if 0 + if (count > 25) { + printe("WARNING: stopping after %d tokens", count); + break; + } +#endif start_timer(); check_context_size(llama_data.context, batch); if (llama_decode(llama_data.context.get(), batch)) { @@ -1035,6 +1042,7 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str // prepare the next batch with the sampled token batch = llama_batch_get_one(&new_token_id, 1); stop_timer(); + count += 1; } printf(LOG_COL_DEFAULT); From da8bdd487e4e054fe77fdd228f584c557efc7ee8 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 27 May 2025 15:08:50 +0200 Subject: [PATCH 087/117] remoting: add basic timing measurements --- .../backend-dispatched-backend.cpp | 6 ++++ .../backend-dispatched.cpp | 4 +++ ggml/src/ggml-remotingbackend/backend.cpp | 9 ++++++ .../shared/apir_backend.h | 31 ++++++++++++++++--- .../ggml-backend-reg.cpp | 12 +++++++ .../virtgpu-forward-backend.cpp | 15 +++++++-- .../ggml-remotingfrontend/virtgpu-forward.h | 2 ++ 7 files changed, 73 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp index f34a5b8c4d645..cf416156c483b 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp @@ -6,11 +6,15 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" +#include "shared/apir_backend.h" + uint32_t backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { UNUSED(ctx); UNUSED(enc); + start_timer(); + uint32_t shmem_res_id; vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); @@ -30,5 +34,7 @@ backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, stru vn_encode_ggml_status(enc, &status); + stop_timer(); + return 0; } diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp index 6781e108200c2..6038698fa9c05 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp @@ -12,6 +12,10 @@ ggml_backend_reg_t reg = NULL; ggml_backend_dev_t dev = NULL; ggml_backend_t bck = NULL; +long long timer_start = 0; +long long timer_total = 0; +long long timer_count = 0; + uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_backend_init_fct_p) { if (reg != NULL) { FATAL("%s: already initialized :/", __func__); diff --git a/ggml/src/ggml-remotingbackend/backend.cpp b/ggml/src/ggml-remotingbackend/backend.cpp index 22a60681d4447..6eab34acfccdc 100644 --- a/ggml/src/ggml-remotingbackend/backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend.cpp @@ -28,6 +28,10 @@ extern "C" { dev->iface.get_memory(dev, &free, &total); WARNING("%s: free memory: %ld MB\n", __func__, (size_t) free/1024/1024); + show_timer(); + + /* *** */ + if (backend_library_handle) { INFO("%s: The GGML backend library was loaded. 
Unloading it.", __func__); dlclose(backend_library_handle); @@ -91,6 +95,11 @@ extern "C" { return APIR_BACKEND_FORWARD_INDEX_INVALID; } +#if 0 + static long long count = 0; + INFO("[%lld] Calling %s", count, backend_dispatch_command_name((ApirBackendCommandType) cmd_type)); + count += 1; +#endif backend_dispatch_t forward_fct = apir_backend_dispatch_table[cmd_type]; uint32_t ret = forward_fct(enc, dec, ctx); diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 6449ccc109146..1e9c2e9356936 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -82,7 +82,30 @@ struct virgl_apir_context { struct virgl_apir_callbacks iface; }; -#define TENSOR_MAX_DEPTH_DEVICE_SUPPORTS_OP 2 -#define TENSOR_MAX_DEPTH_BUFFER_GET_TENSOR 2 -#define TENSOR_MAX_DEPTH_BUFFER_SET_TENSOR 2 -#define TENSOR_MAX_DEPTH_CGRAPH_DATA 10 +extern long long timer_start; +extern long long timer_total; +extern long long timer_count; + +static inline void start_timer(void) { + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); // Use CLOCK_MONOTONIC for elapsed time + timer_start = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec; +} + +static inline void stop_timer(void) { + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); // Use CLOCK_MONOTONIC for elapsed time + long long timer_end = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec; + + timer_total += (timer_end - timer_start); + timer_count += 1; +} + +static inline void show_timer(void) { + long long ms = timer_total/1000000; + long long itl = ms/timer_count; + float speed = 1/((float)itl) * 1000; + + INFO("compute_graph: [%9ld] ms for %ld invokations | ITL %lldms | throughput = %.2f t/s\n", timer_total/1000000, timer_count, itl, speed); + INFO("compute_graph: [%9ld] s", (ms)/1000); +} diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp index 055c9b0e10dbb..4b9888ca66386 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp @@ -111,6 +111,15 @@ static const struct ggml_backend_reg_i ggml_backend_remoting_reg_i = { /* .get_proc_address = */ NULL, }; +long long timer_start = 0; +long long timer_total = 0; +long long timer_count = 0; + +// needed because `show_timer` is inline +static void showTime() { + show_timer(); +} + ggml_backend_reg_t ggml_backend_remoting_frontend_reg() { struct virtgpu *gpu = apir_initialize(); if (!gpu) { @@ -128,5 +137,8 @@ ggml_backend_reg_t ggml_backend_remoting_frontend_reg() { ggml_backend_remoting_reg_init_devices(®); + int cr = atexit(showTime); + assert(cr == 0); + return ® } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp index 61c7fc7ac9839..e467bcd722d0a 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp @@ -1,9 +1,16 @@ #include "virtgpu-forward-impl.h" +static long long current_time_ms() { + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); // Use CLOCK_MONOTONIC for elapsed time + return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec; +} + ggml_status apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph) { - UNUSED(cgraph); - + + start_timer(); + struct vn_cs_encoder *encoder; struct vn_cs_decoder *decoder; @@ -44,5 +51,9 @@ apir_backend_graph_compute(struct 
virtgpu *gpu, ggml_cgraph *cgraph) { if (shmem != gpu->data_shmem) { virtgpu_shmem_destroy(gpu, shmem->shmem); } + + stop_timer(); + return status; } + diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index 1f03f8bf31725..239295aa3ac78 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -2,6 +2,8 @@ #include "ggml-impl.h" #include "ggml-alloc.h" +#include "virtgpu-utils.h" + #include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/apir_backend.h" /* device */ From 55ce372d2a167125ac4173ce1aaa31d97052b62a Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 27 May 2025 15:09:24 +0200 Subject: [PATCH 088/117] remoting: cleanup the logs --- .../backend-dispatched-buffer-type.cpp | 5 +---- .../ggml-remotingbackend/backend-dispatched-buffer.cpp | 2 -- ggml/src/ggml-remotingbackend/backend.cpp | 8 ++++++++ ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp | 8 ++++++++ ggml/src/ggml-remotingfrontend/ggml-backend.cpp | 3 --- 5 files changed, 17 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp index 9ff2e79831f87..405685b91527f 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp @@ -77,17 +77,14 @@ backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder ggml_backend_buffer_t buffer; #if APIR_ALLOC_FROM_HOST_PTR - WARNING("USING FROM_HOST_PTR\n\n"); #define MAX_TENSOR_SIZE 323205120 buffer = dev->iface.buffer_from_host_ptr(dev, shmem_data, size, MAX_TENSOR_SIZE); vn_encode_ggml_buffer_type(enc, buffer->buft); #else - WARNING("USING ALLOC_BUFFER"); buffer = buft->iface.alloc_buffer(buft, size); - WARNING("USING ALLOC_BUFFER--> %p", buffer); #endif - + vn_encode_ggml_buffer(enc, buffer); if (buffer) { diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp index ea9f31ad1a634..b755e9a946fa9 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp @@ -15,8 +15,6 @@ backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, st uintptr_t base = (uintptr_t) buffer->iface.get_base(buffer); vn_encode_uintptr_t(enc, &base); - //INFO("%s: send base %p\n", __func__, (void *) base); - return 0; } diff --git a/ggml/src/ggml-remotingbackend/backend.cpp b/ggml/src/ggml-remotingbackend/backend.cpp index 6eab34acfccdc..5ec77d96257d1 100644 --- a/ggml/src/ggml-remotingbackend/backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend.cpp @@ -69,6 +69,14 @@ extern "C" { return APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS; } + INFO("#"); +#if APIR_ALLOC_FROM_HOST_PTR + INFO("# USING ALLOC_FROM_HOST_PTR"); +#else + INFO("# USING ALLOC_BUFFER"); +#endif + INFO("#"); + return backend_dispatch_initialize(ggml_backend_reg_fct, ggml_backend_init_fct); } diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp index 4b9888ca66386..6dd8ad8919b94 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp @@ -135,6 +135,14 @@ ggml_backend_reg_t ggml_backend_remoting_frontend_reg() { 
INFO("ggml_backend_remoting_frontend_reg() hello :wave:"); + INFO("#"); +#if APIR_ALLOC_FROM_HOST_PTR + INFO("# USING ALLOC_FROM_HOST_PTR"); +#else + INFO("# USING ALLOC_BUFFER"); +#endif + INFO("#"); + ggml_backend_remoting_reg_init_devices(®); int cr = atexit(showTime); diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp index e4be758af84b3..00144d0ed166c 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp @@ -15,9 +15,6 @@ static void ggml_backend_remoting_free(ggml_backend_t backend) { } static ggml_status ggml_backend_remoting_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { - UNUSED(backend); - UNUSED(cgraph); - struct virtgpu *gpu = DEV_TO_GPU(backend->device); IMPLEMENTED_ONCE; From 559626523fe7ef7760ddeae75baa964837f1a84b Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 27 May 2025 15:09:40 +0200 Subject: [PATCH 089/117] ggml: src: ggml-remotingfrontend/ggml-backend-reg: call the initialization functions only once --- ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp index 6dd8ad8919b94..6d6896b063048 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp @@ -133,6 +133,12 @@ ggml_backend_reg_t ggml_backend_remoting_frontend_reg() { /* .context = */ gpu, }; + static bool initialized = false; + if (initialized) { + return ® + } + initialized = true; + INFO("ggml_backend_remoting_frontend_reg() hello :wave:"); INFO("#"); From 4f9a2d48a4f65f94c668341b18c98b419788bb54 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 27 May 2025 15:10:27 +0200 Subject: [PATCH 090/117] disable APIR_ALLOC_FROM_HOST_PTR --- ggml/src/ggml-remotingbackend/shared/apir_backend.h | 2 +- ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 1e9c2e9356936..0a627ea63b74d 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -14,7 +14,7 @@ #define APIR_BACKEND_FORWARD_INDEX_INVALID 6 -#define APIR_ALLOC_FROM_HOST_PTR 1 +#define APIR_ALLOC_FROM_HOST_PTR 0 typedef uintptr_t apir_buffer_type_host_handle_t; typedef uintptr_t apir_buffer_host_handle_t; diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp index 18b010583fa6d..dd3f7a5cc0bc5 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp @@ -64,7 +64,7 @@ apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_contex return; } -#if APIR_ALLOC_FROM_HOST_PTR +#if false void apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, const ggml_tensor *tensor, void *data, size_t offset, size_t size) { From 4fa0b0a1398f50acaf21bd98f0612c719d07d006 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 27 May 2025 16:19:49 +0200 Subject: [PATCH 091/117] remoting: cache the buffer_get_base result --- .../ggml-backend-buffer-type.cpp | 1 + .../ggml-remotingfrontend/ggml-backend-buffer.cpp | 12 +++++++++--- ggml/src/ggml-remotingfrontend/ggml-remoting.h 
| 2 ++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp index 775238d501374..880f982c6c961 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp @@ -15,6 +15,7 @@ ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, context->gpu = gpu; context->apir_context = apir_buffer_type_alloc_buffer(gpu, buft, size); + context->base = NULL; ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) context, size); diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index 99bdbdaca2275..f3f47b325f14a 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -4,11 +4,17 @@ ((struct ggml_backend_remoting_buffer_context *) (name)->context)->gpu static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) { - //IMPLEMENTED; + IMPLEMENTED_ONCE; - struct virtgpu *gpu = BUFFER_TO_GPU(buffer); + struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) buffer->context; + if (context->base) { + return context->base; + } + + context->base = apir_buffer_get_base(BUFFER_TO_GPU(buffer), + BUFFER_TO_APIR_CONTEXT(buffer)); - return apir_buffer_get_base(gpu, BUFFER_TO_APIR_CONTEXT(buffer)); + return context->base; } static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index 71708e75a5d4a..05797775cf081 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -88,6 +88,8 @@ struct ggml_backend_remoting_buffer_context { apir_buffer_context_t apir_context; struct virtgpu *gpu; + + void *base; }; extern const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface; From 609c74391068401aa10835cd4a85ec6be61b9e00 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 28 May 2025 14:05:44 +0200 Subject: [PATCH 092/117] examples: run: run: improve the timing measurement --- examples/run/run.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/examples/run/run.cpp b/examples/run/run.cpp index 42db6ef659980..4bd97da0f1d0d 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -985,13 +985,11 @@ static inline void stop_timer(void) { } static void show_timer(void) { - //printe("[%15lld] ns\n", timer_total); - long long ms = timer_total/1000000; - long long itl = ms/timer_count; - float speed = 1/((float)itl) * 1000; - printe("INFO: generate: [%7lld] ms for %lld invokations | ITL %lldms | throughput = %.2f t/s\n", timer_total/1000000, timer_count, itl, speed); + double ms = timer_total/1000000; + double itl = ms/timer_count; + double speed = 1/itl * 1000; - printe("INFO: generate: [%7lld] s\n", timer_total/1000000/1000); + printe("LLAMA generate [%9.0f] ms for %4lld invocations | ITL %2.2f ms | throughput = %4.2f t/s\n", ms, timer_count, itl, speed); } From 6ea6a298c676221a26b9a73df4572e944ad7cb4c Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 28 May 2025 14:06:07 +0200 Subject: [PATCH 093/117] examples: 
run: run: remove the stop after 25 tokens --- examples/run/run.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/examples/run/run.cpp b/examples/run/run.cpp index 4bd97da0f1d0d..2107b37af476f 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -1009,14 +1009,7 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str llama_batch batch = llama_batch_get_one(tokens.data(), tokens.size()); llama_token new_token_id; - int count = 0; while (true) { -#if 0 - if (count > 25) { - printe("WARNING: stopping after %d tokens", count); - break; - } -#endif start_timer(); check_context_size(llama_data.context, batch); if (llama_decode(llama_data.context.get(), batch)) { @@ -1040,7 +1033,6 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str // prepare the next batch with the sampled token batch = llama_batch_get_one(&new_token_id, 1); stop_timer(); - count += 1; } printf(LOG_COL_DEFAULT); From aac3ca898e94dcd50781398dfd84452c02f24b68 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 28 May 2025 14:06:53 +0200 Subject: [PATCH 094/117] remoting: improve the timing measurement --- ggml/src/ggml-metal/ggml-metal.m | 46 +++++++++++++++++++ .../backend-dispatched-backend.cpp | 6 ++- .../backend-dispatched-buffer.cpp | 11 +++++ .../backend-dispatched.cpp | 4 +- ggml/src/ggml-remotingbackend/backend.cpp | 7 +-- .../shared/apir_backend.h | 35 ++++++++------ .../ggml-backend-buffer.cpp | 12 +++++ .../ggml-backend-reg.cpp | 8 ++-- .../ggml-remotingfrontend/ggml-backend.cpp | 10 +++- .../virtgpu-forward-backend.cpp | 8 +--- 10 files changed, 113 insertions(+), 34 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 97f426cbd3e13..777c868e949aa 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -4485,9 +4485,53 @@ static void ggml_metal_encode_node( } } +long long timer_start; +long long timer_total; +long long timer_count; + +static inline void start_timer(void) { + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); // Use CLOCK_MONOTONIC for elapsed time + timer_start = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec; +} + +static inline void stop_timer(void) { + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); // Use CLOCK_MONOTONIC for elapsed time + long long timer_end = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec; + + timer_total += (timer_end - timer_start); + timer_count += 1; +} + +static void show_timer(void) { + double ms = timer_total/1000000; + double itl = ms/timer_count; + double speed = 1/itl * 1000; + + printf("METAL compute_graph: [%9.0f] ms for %lld invokations | ITL %.2f ms | throughput = %.2f t/s\n",ms, timer_count, itl, speed); + + timer_start = 0; + timer_total = 1; // to avoid re-registering + timer_count = 0; +} + +static void show_timer_signal(int sig) { + GGML_UNUSED(sig); + show_timer(); +} + static enum ggml_status ggml_metal_graph_compute( ggml_backend_t backend, struct ggml_cgraph * gf) { + + if (timer_total == 0) { + signal(SIGUSR1, show_timer_signal); // kill -USR1 $(cat /tmp/krunkit.pid) + atexit(show_timer); + } + + start_timer(); + struct ggml_backend_metal_context * ctx = backend->context; struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; @@ -4615,6 +4659,8 @@ static enum ggml_status ggml_metal_graph_compute( } } + stop_timer(); + return GGML_STATUS_SUCCESS; } diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp 
b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp index cf416156c483b..6e600843a48db 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp @@ -8,12 +8,14 @@ #include "shared/apir_backend.h" +struct timer_data graph_compute_timer = {0, 0, 0, "compute_timer"}; + uint32_t backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { UNUSED(ctx); UNUSED(enc); - start_timer(); + start_timer(&graph_compute_timer); uint32_t shmem_res_id; vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); @@ -34,7 +36,7 @@ backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, stru vn_encode_ggml_status(enc, &status); - stop_timer(); + stop_timer(&graph_compute_timer); return 0; } diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp index b755e9a946fa9..fc1ccaef6748d 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp @@ -6,6 +6,9 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" +struct timer_data get_tensor_timer = {0, 0, 0, "get_tensor"}; +struct timer_data set_tensor_timer = {0, 0, 0, "set_tensor"}; + uint32_t backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { UNUSED(ctx); @@ -23,6 +26,8 @@ backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, UNUSED(ctx); UNUSED(enc); + start_timer(&set_tensor_timer); + ggml_backend_buffer_t buffer; buffer = vn_decode_ggml_buffer(dec); @@ -60,6 +65,8 @@ backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, buffer->iface.set_tensor(buffer, tensor, shmem_data, offset, size); + stop_timer(&set_tensor_timer); + return 0; } @@ -68,6 +75,8 @@ backend_buffer_get_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, UNUSED(ctx); UNUSED(enc); + start_timer(&get_tensor_timer); + ggml_backend_buffer_t buffer; buffer = vn_decode_ggml_buffer(dec); @@ -94,6 +103,8 @@ backend_buffer_get_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, UNUSED(tensor); buffer->iface.get_tensor(buffer, tensor, shmem_data, offset, size); + stop_timer(&get_tensor_timer); + return 0; } diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp index 6038698fa9c05..d90424a3d714f 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp @@ -31,9 +31,9 @@ uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_ba dev = reg->iface.get_device(reg, 0); } - ggml_backend_t (* ggml_backend_fct)(void) = (ggml_backend_t (*)()) ggml_backend_init_fct_p; + ggml_backend_t (* ggml_backend_fct)(int) = (ggml_backend_t (*)(int)) ggml_backend_init_fct_p; - bck = ggml_backend_fct(); + bck = ggml_backend_fct(0); if (!bck) { ERROR("%s: backend initialization failed :/", __func__); return APIR_BACKEND_INITIALIZE_BACKEND_FAILED; diff --git a/ggml/src/ggml-remotingbackend/backend.cpp b/ggml/src/ggml-remotingbackend/backend.cpp index 5ec77d96257d1..5bc6c923f405a 100644 --- a/ggml/src/ggml-remotingbackend/backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend.cpp @@ -28,8 +28,9 @@ extern "C" { dev->iface.get_memory(dev, &free, &total); WARNING("%s: free memory: %ld MB\n", __func__, (size_t) free/1024/1024); - 
show_timer(); - + show_timer(&graph_compute_timer); + show_timer(&set_tensor_timer); + show_timer(&get_tensor_timer); /* *** */ if (backend_library_handle) { @@ -43,7 +44,7 @@ extern "C" { uint32_t apir_backend_initialize() { const char* dlsym_error; - INFO("%s: hello :wave: \\o/", __func__); + INFO("%s: hello " GGML_BACKEND_REG_FCT_NAME " :wave: \\o/", __func__); backend_library_handle = dlopen(GGML_BACKEND_LIBRARY_PATH, RTLD_LAZY); diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 0a627ea63b74d..ad1747b17d182 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -82,30 +82,37 @@ struct virgl_apir_context { struct virgl_apir_callbacks iface; }; -extern long long timer_start; -extern long long timer_total; -extern long long timer_count; +struct timer_data { + long long start; + long long total; + long long count; + const char *name; +}; + +extern struct timer_data graph_compute_timer; +extern struct timer_data get_tensor_timer; +extern struct timer_data set_tensor_timer; -static inline void start_timer(void) { +static inline void start_timer(struct timer_data *timer) { struct timespec ts; clock_gettime(CLOCK_REALTIME, &ts); // Use CLOCK_MONOTONIC for elapsed time - timer_start = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec; + timer->start = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec; } -static inline void stop_timer(void) { +static inline void stop_timer(struct timer_data *timer) { struct timespec ts; clock_gettime(CLOCK_REALTIME, &ts); // Use CLOCK_MONOTONIC for elapsed time long long timer_end = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec; - timer_total += (timer_end - timer_start); - timer_count += 1; + timer->total += (timer_end - timer->start); + timer->count += 1; } -static inline void show_timer(void) { - long long ms = timer_total/1000000; - long long itl = ms/timer_count; - float speed = 1/((float)itl) * 1000; +static inline void show_timer(struct timer_data *timer) { + double ms = timer->total/1000000; + double itl = ms/timer->count; + double speed = 1/itl * 1000; - INFO("compute_graph: [%9ld] ms for %ld invokations | ITL %lldms | throughput = %.2f t/s\n", timer_total/1000000, timer_count, itl, speed); - INFO("compute_graph: [%9ld] s", (ms)/1000); + INFO("%14s [%9.0f] ms for %4ld invocations | ITL %2.2f ms | throughput = %4.2f t/s", + timer->name, ms, timer->count, itl, speed); } diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index f3f47b325f14a..d056249bdf681 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -3,6 +3,9 @@ #define BUFFER_TO_GPU(name) \ ((struct ggml_backend_remoting_buffer_context *) (name)->context)->gpu +struct timer_data get_tensor_timer = {0, 0, 0, "get_tensor"}; +struct timer_data set_tensor_timer = {0, 0, 0, "set_tensor"}; + static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) { IMPLEMENTED_ONCE; @@ -32,6 +35,8 @@ static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buf static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { IMPLEMENTED_ONCE; + start_timer(&set_tensor_timer); + struct virtgpu *gpu = BUFFER_TO_GPU(buffer); #if 0 INFO("%s: data=%p, offset=%lu, size=%lu\n", __func__, data, offset, 
size); @@ -46,14 +51,21 @@ static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer #endif apir_buffer_set_tensor(gpu, BUFFER_TO_APIR_CONTEXT(buffer), tensor, data, offset, size); + stop_timer(&set_tensor_timer); + return; } static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { IMPLEMENTED_ONCE; + + start_timer(&get_tensor_timer); + struct virtgpu *gpu = BUFFER_TO_GPU(buffer); apir_buffer_get_tensor(gpu, BUFFER_TO_APIR_CONTEXT(buffer), tensor, data, offset, size); + + stop_timer(&get_tensor_timer); } diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp index 6d6896b063048..e9b22071af224 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp @@ -111,13 +111,11 @@ static const struct ggml_backend_reg_i ggml_backend_remoting_reg_i = { /* .get_proc_address = */ NULL, }; -long long timer_start = 0; -long long timer_total = 0; -long long timer_count = 0; -// needed because `show_timer` is inline static void showTime() { - show_timer(); + show_timer(&graph_compute_timer); + show_timer(&get_tensor_timer); + show_timer(&set_tensor_timer); } ggml_backend_reg_t ggml_backend_remoting_frontend_reg() { diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp index 00144d0ed166c..14f95ec88ff02 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp @@ -14,12 +14,20 @@ static void ggml_backend_remoting_free(ggml_backend_t backend) { delete backend; } +struct timer_data graph_compute_timer = {0, 0, 0, "compute_timer"}; + static ggml_status ggml_backend_remoting_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { struct virtgpu *gpu = DEV_TO_GPU(backend->device); IMPLEMENTED_ONCE; - return apir_backend_graph_compute(gpu, cgraph); + start_timer(&graph_compute_timer); + + ggml_status status = apir_backend_graph_compute(gpu, cgraph); + + stop_timer(&graph_compute_timer); + + return status; } static ggml_backend_i ggml_backend_remoting_interface = { diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp index e467bcd722d0a..82b51838997c6 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp @@ -8,9 +8,6 @@ static long long current_time_ms() { ggml_status apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph) { - - start_timer(); - struct vn_cs_encoder *encoder; struct vn_cs_decoder *decoder; @@ -51,9 +48,6 @@ apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph) { if (shmem != gpu->data_shmem) { virtgpu_shmem_destroy(gpu, shmem->shmem); } - - stop_timer(); - + return status; } - From b4837da71785859eb1390da7a5bde235cb7325c1 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 28 May 2025 14:07:08 +0200 Subject: [PATCH 095/117] remoting: allow compiling to Vulkan --- ggml/src/ggml-remotingbackend/backend.cpp | 8 ++++++++ prepare.backend.sh | 11 ++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-remotingbackend/backend.cpp b/ggml/src/ggml-remotingbackend/backend.cpp index 5bc6c923f405a..fa2344ea8f676 100644 --- a/ggml/src/ggml-remotingbackend/backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend.cpp @@ -10,9 
+10,17 @@
 #include "shared/apir_backend.h"
 #include "shared/venus_cs.h"
 
+#define USE_METAL 1
+
+#if USE_METAL
 #define GGML_BACKEND_LIBRARY_PATH "/Users/kevinpouget/remoting/llama_cpp/build.remoting-backend/bin/libggml-metal.dylib"
 #define GGML_BACKEND_REG_FCT_NAME "ggml_backend_metal_reg"
 #define GGML_BACKEND_INIT_FCT_NAME "ggml_backend_metal_init"
+#else
+#define GGML_BACKEND_LIBRARY_PATH "/Users/kevinpouget/remoting/llama_cpp/build.remoting-backend/bin/libggml-vulkan.dylib"
+#define GGML_BACKEND_REG_FCT_NAME "ggml_backend_vk_reg"
+#define GGML_BACKEND_INIT_FCT_NAME "ggml_backend_vk_init"
+#endif
 
 static void *backend_library_handle = NULL;
 
diff --git a/prepare.backend.sh b/prepare.backend.sh
index 76e30fe31cfa4..caed8223382e9 100755
--- a/prepare.backend.sh
+++ b/prepare.backend.sh
@@ -1,6 +1,15 @@
-cmake -S . -B ../build.remoting-backend-prod \
+if [[ "${PERF_MODE:-}" ]]; then
+    FLAVOR="-prod"
+else
+    FLAVOR=""
+fi
+
+cmake -S . -B ../build.remoting-backend$FLAVOR \
     -DGGML_REMOTINGBACKEND=ON \
     -DGGML_NATIVE=OFF \
+    -DGGML_METAL=ON \
+    -DGGML_VULKAN=OFF -DVulkan_INCLUDE_DIR=/opt/homebrew/include/ -DVulkan_LIBRARY=/opt/homebrew/lib/libMoltenVK.dylib \
     "$@"
 
 # -DCMAKE_BUILD_TYPE=Debug \
+#

From ecb7a235736c878a5f0a3642139ba1d9ff532cc9 Mon Sep 17 00:00:00 2001
From: Kevin Pouget
Date: Wed, 28 May 2025 14:07:23 +0200
Subject: [PATCH 096/117] ggml: src: ggml-remotingfrontend/virtgpu: reduce the response time wait delay

---
 ggml/src/ggml-remotingfrontend/virtgpu.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp
index 39ed7b3a99f95..66bbf17ac6d63 100644
--- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp
+++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp
@@ -481,7 +481,7 @@ remote_call(
    */
 
   while (std::atomic_load_explicit(atomic_reply_notif, std::memory_order_acquire) == 0) {
-    int64_t base_sleep_us = 160;
+    int64_t base_sleep_us = 15;
     os_time_sleep(base_sleep_us);
   }
 
From 3f3624411ac1c50172bf87ddc08afd27664b7e0c Mon Sep 17 00:00:00 2001
From: Kevin Pouget
Date: Mon, 2 Jun 2025 14:03:23 +0200
Subject: [PATCH 097/117] remoting: experiment with buffer_from_ptr

---
 .../backend-dispatched-buffer-type.cpp        | 16 +----
 .../backend-dispatched-device.cpp             | 31 +++++++++
 .../ggml-remotingbackend/backend-dispatched.h |  3 +
 ggml/src/ggml-remotingbackend/backend.cpp     |  8 ---
 .../shared/apir_backend.h                     | 30 ++++-----
 .../venus_cs_ggml-rpc-back.cpp                | 11 +++-
 .../ggml-backend-buffer-type.cpp              | 14 ++++
 .../ggml-backend-buffer.cpp                   | 44 +++++++++++++
 .../ggml-backend-device.cpp                   | 66 ++++++++++++++-----
 .../ggml-backend-reg.cpp                      |  8 ---
 .../src/ggml-remotingfrontend/ggml-remoting.h |  7 ++
 .../venus_cs_ggml-rpc-front.cpp               |  2 +
 .../virtgpu-forward-buffer-type.cpp           | 19 +-----
 .../virtgpu-forward-buffer.cpp                |  4 +-
 .../virtgpu-forward-device.cpp                | 37 +++++++++++
 .../ggml-remotingfrontend/virtgpu-forward.h   |  4 +-
 16 files changed, 215 insertions(+), 89 deletions(-)

diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp
index 405685b91527f..f925d1e066fc0 100644
--- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp
+++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp
@@ -60,30 +60,16 @@ backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec
 uint32_t
 backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { 
UNUSED(ctx); -#if APIR_ALLOC_FROM_HOST_PTR - uint32_t shmem_res_id; - vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); - void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id); - if (!shmem_data) { - FATAL("Couldn't get the shmem addr from virgl :/"); - } -#else ggml_backend_buffer_type_t buft; buft = vn_decode_ggml_buffer_type(dec); -#endif + size_t size; vn_decode_size_t(dec, &size); ggml_backend_buffer_t buffer; -#if APIR_ALLOC_FROM_HOST_PTR - #define MAX_TENSOR_SIZE 323205120 - buffer = dev->iface.buffer_from_host_ptr(dev, shmem_data, size, MAX_TENSOR_SIZE); - vn_encode_ggml_buffer_type(enc, buffer->buft); -#else buffer = buft->iface.alloc_buffer(buft, size); -#endif vn_encode_ggml_buffer(enc, buffer); diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp index 18f0e0a81b6a6..5bf0788ccf864 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp @@ -109,3 +109,34 @@ backend_device_get_props(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, s return 0; } + +uint32_t +backend_device_buffer_from_ptr(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(dec); + + uint32_t shmem_res_id; + vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); + + void *shmem_ptr = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id); + if (!shmem_ptr) { + FATAL("Couldn't get the shmem addr from virgl :/"); + } + + size_t size; + vn_decode_size_t(dec, &size); + size_t max_tensor_size; + vn_decode_size_t(dec, &max_tensor_size); + + ggml_backend_buffer_t buffer; + buffer = dev->iface.buffer_from_host_ptr(dev, shmem_ptr, size, max_tensor_size); + + vn_encode_ggml_buffer(enc, buffer); + vn_encode_ggml_buffer_type(enc, buffer->buft); + + if (buffer) { + track_backend_buffer(buffer); + } + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h index d8d86fc3f67f5..3c164b532ac95 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.h +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -27,6 +27,7 @@ uint32_t backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decod uint32_t backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); uint32_t backend_device_get_buffer_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); uint32_t backend_device_get_props(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_buffer_from_ptr(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); /* buffer-type */ uint32_t backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); @@ -57,6 +58,7 @@ static inline const char *backend_dispatch_command_name(ApirBackendCommandType t case APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP: return "backend_device_supports_op"; case APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE: return "backend_get_buffer_type"; case APIR_COMMAND_TYPE_DEVICE_GET_PROPS: return "backend_get_props"; + case APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR: return "backend_buffer_from_ptr"; /* buffer-type */ case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME: return "backend_buffer_type_get_name"; @@ -88,6 +90,7 @@ static const backend_dispatch_t 
apir_backend_dispatch_table[APIR_BACKEND_DISPATC [APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP] = backend_device_supports_op, [APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE] = backend_device_get_buffer_type, [APIR_COMMAND_TYPE_DEVICE_GET_PROPS] = backend_device_get_props, + [APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR] = backend_device_buffer_from_ptr, /* buffer-type */ [APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME] = backend_buffer_type_get_name, diff --git a/ggml/src/ggml-remotingbackend/backend.cpp b/ggml/src/ggml-remotingbackend/backend.cpp index fa2344ea8f676..f5a10c234644a 100644 --- a/ggml/src/ggml-remotingbackend/backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend.cpp @@ -78,14 +78,6 @@ extern "C" { return APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS; } - INFO("#"); -#if APIR_ALLOC_FROM_HOST_PTR - INFO("# USING ALLOC_FROM_HOST_PTR"); -#else - INFO("# USING ALLOC_BUFFER"); -#endif - INFO("#"); - return backend_dispatch_initialize(ggml_backend_reg_fct, ggml_backend_init_fct); } diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index ad1747b17d182..efd0803a929d5 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -14,17 +14,14 @@ #define APIR_BACKEND_FORWARD_INDEX_INVALID 6 -#define APIR_ALLOC_FROM_HOST_PTR 0 - typedef uintptr_t apir_buffer_type_host_handle_t; typedef uintptr_t apir_buffer_host_handle_t; typedef struct { apir_buffer_host_handle_t host_handle; -#if APIR_ALLOC_FROM_HOST_PTR + struct vn_renderer_shmem *shmem; apir_buffer_type_host_handle_t buft_host_handle; -#endif } apir_buffer_context_t; typedef uint32_t (*apir_backend_initialize_t)(void); @@ -49,26 +46,27 @@ typedef enum ApirBackendCommandType { APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP = 5, APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE = 6, APIR_COMMAND_TYPE_DEVICE_GET_PROPS = 7, + APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR = 8, /* buffer-type */ - APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME = 8, - APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT = 9, - APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE = 10, - APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST = 11, - APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER = 12, + APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME = 9, + APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT = 10, + APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE = 11, + APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST = 12, + APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER = 13, /* buffer */ - APIR_COMMAND_TYPE_BUFFER_GET_BASE = 13, - APIR_COMMAND_TYPE_BUFFER_SET_TENSOR = 14, - APIR_COMMAND_TYPE_BUFFER_GET_TENSOR = 15, - APIR_COMMAND_TYPE_BUFFER_CLEAR = 16, - APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER = 17, + APIR_COMMAND_TYPE_BUFFER_GET_BASE = 14, + APIR_COMMAND_TYPE_BUFFER_SET_TENSOR = 15, + APIR_COMMAND_TYPE_BUFFER_GET_TENSOR = 16, + APIR_COMMAND_TYPE_BUFFER_CLEAR = 17, + APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER = 18, /* backend */ - APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE = 18, + APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE = 19, // last command_type index + 1 - APIR_BACKEND_DISPATCH_TABLE_COUNT = 19, + APIR_BACKEND_DISPATCH_TABLE_COUNT = 20, } ApirBackendCommandType; diff --git a/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp b/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp index 58a142ae93d5b..30ae511aa95e8 100644 --- a/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp +++ b/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp @@ -43,13 +43,18 @@ deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor) { result->buffer 
= nullptr; } + uint64_t tensor_data = tensor->data; if (result->buffer) { // require that the tensor data does not go beyond the buffer end uint64_t tensor_size = (uint64_t) ggml_nbytes(result); uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer); uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer); - GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow - GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size); + + // tensor->data is serialized as an offset to the buffer base address + tensor_data += buffer_start; + + GGML_ASSERT(tensor_data + tensor_size >= tensor_data); // check for overflow + GGML_ASSERT(tensor_data >= buffer_start && tensor_data + tensor_size <= buffer_start + buffer_size); } result->op = (ggml_op) tensor->op; @@ -57,7 +62,7 @@ deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor) { result->op_params[i] = tensor->op_params[i]; } result->flags = tensor->flags; - result->data = reinterpret_cast(tensor->data); + result->data = reinterpret_cast(tensor_data); ggml_set_name(result, tensor->name); return result; } diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp index 880f982c6c961..5e67e82874e4d 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp @@ -16,8 +16,13 @@ ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, context->gpu = gpu; context->apir_context = apir_buffer_type_alloc_buffer(gpu, buft, size); context->base = NULL; + context->is_from_ptr = false; + ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) context, size); + INFO("##"); + INFO("## %s(%llx) --> %p", __func__, size, buffer); + INFO("##\n"); return buffer; } @@ -65,4 +70,13 @@ const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = { /* .is_host = */ NULL, }; +const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_from_ptr_type_interface = { + /* .get_name = */ ggml_backend_remoting_buffer_type_get_name, + /* .alloc_buffer = */ NULL, + /* .get_alignment = */ ggml_backend_remoting_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_remoting_buffer_type_get_max_size, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .is_host = */ NULL, +}; + /****************************************************************************************/ diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index d056249bdf681..67dd06843495d 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -6,6 +6,9 @@ struct timer_data get_tensor_timer = {0, 0, 0, "get_tensor"}; struct timer_data set_tensor_timer = {0, 0, 0, "set_tensor"}; +struct timer_data get_tensor_from_ptr_timer = {0, 0, 0, "get_tensor_from_ptr"}; +struct timer_data set_tensor_from_ptr_timer = {0, 0, 0, "set_tensor_from_ptr"}; + static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) { IMPLEMENTED_ONCE; @@ -68,6 +71,31 @@ static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer stop_timer(&get_tensor_timer); } +static void ggml_backend_remoting_buffer_set_tensor_from_ptr(ggml_backend_buffer_t buffer, ggml_tensor * tensor, 
const void * data, size_t offset, size_t size) { + IMPLEMENTED_ONCE; + + start_timer(&set_tensor_from_ptr_timer); + + UNUSED(buffer); + + memcpy((char *)tensor->data + offset, data, size); + + stop_timer(&set_tensor_from_ptr_timer); + + return; +} + +static void ggml_backend_remoting_buffer_get_tensor_from_ptr(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + IMPLEMENTED_ONCE; + + UNUSED(buffer); + + start_timer(&get_tensor_from_ptr_timer); + + memcpy(data, (const char *)tensor->data + offset, size); + + stop_timer(&get_tensor_from_ptr_timer); +} static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { NOT_IMPLEMENTED; @@ -99,6 +127,10 @@ static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffe struct virtgpu *gpu = BUFFER_TO_GPU(buffer); apir_buffer_free_buffer(gpu, BUFFER_TO_APIR_CONTEXT(buffer)); + + struct ggml_backend_remoting_buffer_context *context = BUFFER_TO_GGML_CONTEXT(buffer); + free(context); + buffer->context = NULL; } const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { @@ -112,3 +144,15 @@ const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { /* .clear = */ ggml_backend_remoting_buffer_clear, /* .reset = */ NULL, }; + +const ggml_backend_buffer_i ggml_backend_remoting_buffer_from_ptr_interface = { + /* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer, + /* .get_base = */ ggml_backend_remoting_buffer_get_base, + /* .init_tensor = */ NULL, + /* .memset_tensor = */ ggml_backend_remoting_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor_from_ptr, + /* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor_from_ptr, + /* .cpy_tensor = */ ggml_backend_remoting_buffer_cpy_tensor, + /* .clear = */ ggml_backend_remoting_buffer_clear, + /* .reset = */ NULL, +}; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index 092c05b9e43f3..bc40f9dbb2238 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -103,7 +103,7 @@ ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, struct ggml_backe // the API Remoting frontend props->caps.async = false; props->caps.host_buffer = false; - props->caps.buffer_from_host_ptr = false; + props->caps.buffer_from_host_ptr = true; props->caps.events = false; #endif @@ -129,29 +129,59 @@ ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { return &buft; } -static ggml_backend_buffer_t ggml_backend_remoting_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { - UNUSED(dev); - UNUSED(ptr); - UNUSED(size); - UNUSED(max_tensor_size); +static ggml_backend_buffer_type_t +ggml_backend_remoting_device_get_buffer_from_ptr_type(ggml_backend_dev_t dev) { + IMPLEMENTED_ONCE; - NOT_IMPLEMENTED; - STOP_HERE; + struct virtgpu *gpu = DEV_TO_GPU(dev); - return nullptr; + apir_buffer_type_host_handle_t ctx = apir_device_get_buffer_type(gpu); + + static struct ggml_backend_buffer_type buft { + /* .iface = */ ggml_backend_remoting_buffer_from_ptr_type_interface, + /* .device = */ dev, + /* .context = */ (void *) ctx, + }; + + return &buft; } -static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_host_buffer_type(ggml_backend_dev_t dev) { +static ggml_backend_buffer_t 
+ggml_backend_remoting_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { - static struct ggml_backend_buffer_type host_bufft = { - /* .iface = */ ggml_backend_remoting_host_buffer_type_interface, - /* .device = */ dev, - /* .context = */ nullptr, - }; + struct virtgpu *gpu = DEV_TO_GPU(dev); - //IMPLEMENTED; + struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) malloc(sizeof(*context)); + if (!context) { + FATAL("Couldn't allocate the buffer context ..."); + } + + UNUSED(ptr); + context->gpu = gpu; + context->apir_context = apir_device_buffer_from_ptr(gpu, size, max_tensor_size); + context->base = ptr; + context->is_from_ptr = true; + + ggml_backend_buffer_t buffer = ggml_backend_buffer_init(ggml_backend_remoting_device_get_buffer_from_ptr_type(dev), ggml_backend_remoting_buffer_from_ptr_interface, (void *) context, size); + + INFO("#"); + INFO("# %s(%p, %llx) --> %p", __func__, ptr, size, buffer); + INFO("#\n"); + + return buffer; +} + +static ggml_backend_buffer_type_t +ggml_backend_remoting_device_get_host_buffer_type(ggml_backend_dev_t dev) { + IMPLEMENTED_ONCE; + + static struct ggml_backend_buffer_type host_bufft = { + /* .iface = */ ggml_backend_remoting_host_buffer_type_interface, + /* .device = */ dev, + /* .context = */ nullptr, + }; - return &host_bufft; + return &host_bufft; } const struct ggml_backend_device_i ggml_backend_remoting_device_interface = { @@ -163,7 +193,7 @@ const struct ggml_backend_device_i ggml_backend_remoting_device_interface = { /* .init_backend = */ ggml_backend_remoting_device_init, /* .get_buffer_type = */ ggml_backend_remoting_device_get_buffer_type, /* .get_host_buffer_type = */ NULL, - /* .buffer_from_host_ptr = */ NULL, + /* .buffer_from_host_ptr = */ ggml_backend_remoting_device_buffer_from_ptr, /* .supports_op = */ ggml_backend_remoting_device_supports_op, /* .supports_buft = */ ggml_backend_remoting_device_supports_buft, /* .offload_op = */ ggml_backend_remoting_device_offload_op, diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp index e9b22071af224..d0132370d9f91 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp @@ -139,14 +139,6 @@ ggml_backend_reg_t ggml_backend_remoting_frontend_reg() { INFO("ggml_backend_remoting_frontend_reg() hello :wave:"); - INFO("#"); -#if APIR_ALLOC_FROM_HOST_PTR - INFO("# USING ALLOC_FROM_HOST_PTR"); -#else - INFO("# USING ALLOC_BUFFER"); -#endif - INFO("#"); - ggml_backend_remoting_reg_init_devices(®); int cr = atexit(showTime); diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index 05797775cf081..4da3b9432f1f8 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -13,6 +13,9 @@ #define DEV_TO_GPU(name) \ ((struct ggml_backend_remoting_device_context *) (name)->context)->gpu +#define BUFFER_TO_GGML_CONTEXT(name) \ + ((struct ggml_backend_remoting_buffer_context *) (name)->context) + #define BUFFER_TO_APIR_CONTEXT(name) \ &((struct ggml_backend_remoting_buffer_context *) (name)->context)->apir_context @@ -90,12 +93,16 @@ struct ggml_backend_remoting_buffer_context { struct virtgpu *gpu; void *base; + + bool is_from_ptr; }; extern const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface; extern const struct ggml_backend_device_i 
ggml_backend_remoting_device_interface; extern const ggml_backend_buffer_type_i ggml_backend_remoting_host_buffer_type_interface; extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface; +extern const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_from_ptr_type_interface; +extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_from_ptr_interface; ggml_backend_dev_t ggml_backend_remoting_get_device(size_t device); ggml_backend_buffer_type_t ggml_backend_remoting_host_buffer_type(); diff --git a/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp b/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp index bc4b96b84f365..67b8c37748aa8 100644 --- a/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp +++ b/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp @@ -40,6 +40,8 @@ serialize_tensor(const ggml_tensor * tensor) { result.view_src = reinterpret_cast(tensor->view_src); result.view_offs = tensor->view_offs; result.data = reinterpret_cast(tensor->data); + // tensor->data is serialized as an offset to the buffer base address + result.data -= reinterpret_cast(BUFFER_TO_GGML_CONTEXT(tensor->buffer)->base); snprintf(result.name, GGML_MAX_NAME, "%s", tensor->name); return result; } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp index 4f7aac1360124..e991c0bef324d 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp @@ -100,29 +100,14 @@ apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_backend_buffer_type_t bu REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER); -#if APIR_ALLOC_FROM_HOST_PTR - UNUSED(buft); - - buffer_context.shmem = virtgpu_shmem_create(gpu, size); - //WARNING("%s: 0x%lx | %dkB | %dMB", __func__, size, (int)size/1024, (int)size/1024/1024); - if (!buffer_context.shmem) { - FATAL("Couldn't allocate the guest-host shared buffer :/"); - } - - vn_encode_virtgpu_shmem_res_id(encoder, buffer_context.shmem->res_id); -#else vn_encode_ggml_buffer_type(encoder, buft); -#endif + vn_encode_size_t(encoder, &size); REMOTE_CALL(gpu, encoder, decoder); -#if APIR_ALLOC_FROM_HOST_PTR - buffer_context.buft_host_handle = vn_decode_apir_buffer_type_host_handle(decoder); -#endif - vn_decode_apir_buffer_host_handle_t(decoder, &buffer_context.host_handle); - + REMOTE_CALL_FINISH(gpu, encoder, decoder); return buffer_context; diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp index dd3f7a5cc0bc5..04041ab5feb37 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp @@ -140,8 +140,6 @@ apir_buffer_free_buffer(struct virtgpu *gpu, apir_buffer_context_t *buffer_conte vn_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); REMOTE_CALL(gpu, encoder, decoder); -#if APIR_ALLOC_FROM_HOST_PTR - virtgpu_shmem_destroy(gpu, buffer_context->shmem->shmem); -#endif + REMOTE_CALL_FINISH(gpu, encoder, decoder); } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp index ffc6febf4cab0..9a2b6d7c501b4 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp @@ -200,3 +200,40 @@ apir_device_get_props(struct virtgpu 
*gpu, return; } + +apir_buffer_context_t +apir_device_buffer_from_ptr(struct virtgpu *gpu, + size_t size, + size_t max_tensor_size) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + apir_buffer_context_t buffer_context; + + BEING_IMPLEMENTED; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR); + + /* *** */ + + buffer_context.shmem = virtgpu_shmem_create(gpu, size); + if (!buffer_context.shmem) { + FATAL("Couldn't allocate the guest-host shared buffer :/"); + } + + vn_encode_virtgpu_shmem_res_id(encoder, buffer_context.shmem->res_id); + + vn_encode_size_t(encoder, &size); + vn_encode_size_t(encoder, &max_tensor_size); + + REMOTE_CALL(gpu, encoder, decoder); + + vn_decode_apir_buffer_host_handle_t(decoder, &buffer_context.host_handle); + + buffer_context.buft_host_handle = vn_decode_apir_buffer_type_host_handle(decoder); + + /* *** */ + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return buffer_context; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index 239295aa3ac78..bbe94f14300ef 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -19,7 +19,9 @@ void apir_device_get_props(struct virtgpu *gpu, bool *host_buffer, bool *buffer_from_host_ptr, bool *events); - +apir_buffer_context_t apir_device_buffer_from_ptr(struct virtgpu *gpu, + size_t size, + size_t max_tensor_size); /* buffer-type */ const char *apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); size_t apir_buffer_type_get_alignment(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); From b20672bfd6d59168e0d0935e87b01868adee626d Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 2 Jun 2025 14:53:25 +0200 Subject: [PATCH 098/117] remoting: remove from_ptr code --- .../ggml-backend-buffer-type.cpp | 11 ---- .../ggml-backend-buffer.cpp | 41 --------------- .../ggml-backend-device.cpp | 52 ++----------------- 3 files changed, 5 insertions(+), 99 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp index 5e67e82874e4d..1cef55b620811 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp @@ -16,8 +16,6 @@ ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, context->gpu = gpu; context->apir_context = apir_buffer_type_alloc_buffer(gpu, buft, size); context->base = NULL; - context->is_from_ptr = false; - ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) context, size); INFO("##"); @@ -70,13 +68,4 @@ const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = { /* .is_host = */ NULL, }; -const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_from_ptr_type_interface = { - /* .get_name = */ ggml_backend_remoting_buffer_type_get_name, - /* .alloc_buffer = */ NULL, - /* .get_alignment = */ ggml_backend_remoting_buffer_type_get_alignment, - /* .get_max_size = */ ggml_backend_remoting_buffer_type_get_max_size, - /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .is_host = */ NULL, -}; - /****************************************************************************************/ diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index 
67dd06843495d..d35d5c9b66cd7 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -6,9 +6,6 @@ struct timer_data get_tensor_timer = {0, 0, 0, "get_tensor"}; struct timer_data set_tensor_timer = {0, 0, 0, "set_tensor"}; -struct timer_data get_tensor_from_ptr_timer = {0, 0, 0, "get_tensor_from_ptr"}; -struct timer_data set_tensor_from_ptr_timer = {0, 0, 0, "set_tensor_from_ptr"}; - static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) { IMPLEMENTED_ONCE; @@ -71,32 +68,6 @@ static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer stop_timer(&get_tensor_timer); } -static void ggml_backend_remoting_buffer_set_tensor_from_ptr(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - IMPLEMENTED_ONCE; - - start_timer(&set_tensor_from_ptr_timer); - - UNUSED(buffer); - - memcpy((char *)tensor->data + offset, data, size); - - stop_timer(&set_tensor_from_ptr_timer); - - return; -} - -static void ggml_backend_remoting_buffer_get_tensor_from_ptr(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { - IMPLEMENTED_ONCE; - - UNUSED(buffer); - - start_timer(&get_tensor_from_ptr_timer); - - memcpy(data, (const char *)tensor->data + offset, size); - - stop_timer(&get_tensor_from_ptr_timer); -} - static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { NOT_IMPLEMENTED; @@ -144,15 +115,3 @@ const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { /* .clear = */ ggml_backend_remoting_buffer_clear, /* .reset = */ NULL, }; - -const ggml_backend_buffer_i ggml_backend_remoting_buffer_from_ptr_interface = { - /* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer, - /* .get_base = */ ggml_backend_remoting_buffer_get_base, - /* .init_tensor = */ NULL, - /* .memset_tensor = */ ggml_backend_remoting_buffer_memset_tensor, - /* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor_from_ptr, - /* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor_from_ptr, - /* .cpy_tensor = */ ggml_backend_remoting_buffer_cpy_tensor, - /* .clear = */ ggml_backend_remoting_buffer_clear, - /* .reset = */ NULL, -}; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index bc40f9dbb2238..190d0d77d6551 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -102,12 +102,12 @@ ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, struct ggml_backe // ignore the actual backend answers and set it as we provide it in // the API Remoting frontend props->caps.async = false; - props->caps.host_buffer = false; - props->caps.buffer_from_host_ptr = true; + props->caps.host_buffer = true; + props->caps.buffer_from_host_ptr = false; props->caps.events = false; #endif - INFO("%s: async=%d, host_buffer=%d!, buffer_from_host_ptr=%d!, events=%d", + INFO("%s: async=%d, host_buffer=%d, buffer_from_host_ptr=%d, events=%d", __func__, props->caps.async, props->caps.host_buffer, props->caps.buffer_from_host_ptr, props->caps.events); } @@ -129,48 +129,6 @@ ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { return &buft; } -static ggml_backend_buffer_type_t -ggml_backend_remoting_device_get_buffer_from_ptr_type(ggml_backend_dev_t dev) { - IMPLEMENTED_ONCE; - - struct 
virtgpu *gpu = DEV_TO_GPU(dev); - - apir_buffer_type_host_handle_t ctx = apir_device_get_buffer_type(gpu); - - static struct ggml_backend_buffer_type buft { - /* .iface = */ ggml_backend_remoting_buffer_from_ptr_type_interface, - /* .device = */ dev, - /* .context = */ (void *) ctx, - }; - - return &buft; -} - -static ggml_backend_buffer_t -ggml_backend_remoting_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { - - struct virtgpu *gpu = DEV_TO_GPU(dev); - - struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) malloc(sizeof(*context)); - if (!context) { - FATAL("Couldn't allocate the buffer context ..."); - } - - UNUSED(ptr); - context->gpu = gpu; - context->apir_context = apir_device_buffer_from_ptr(gpu, size, max_tensor_size); - context->base = ptr; - context->is_from_ptr = true; - - ggml_backend_buffer_t buffer = ggml_backend_buffer_init(ggml_backend_remoting_device_get_buffer_from_ptr_type(dev), ggml_backend_remoting_buffer_from_ptr_interface, (void *) context, size); - - INFO("#"); - INFO("# %s(%p, %llx) --> %p", __func__, ptr, size, buffer); - INFO("#\n"); - - return buffer; -} - static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_host_buffer_type(ggml_backend_dev_t dev) { IMPLEMENTED_ONCE; @@ -192,8 +150,8 @@ const struct ggml_backend_device_i ggml_backend_remoting_device_interface = { /* .get_props = */ ggml_backend_remoting_device_get_props, /* .init_backend = */ ggml_backend_remoting_device_init, /* .get_buffer_type = */ ggml_backend_remoting_device_get_buffer_type, - /* .get_host_buffer_type = */ NULL, - /* .buffer_from_host_ptr = */ ggml_backend_remoting_device_buffer_from_ptr, + /* .get_host_buffer_type = */ ggml_backend_remoting_device_get_host_buffer_type, + /* .buffer_from_host_ptr = */ NULL, /* .supports_op = */ ggml_backend_remoting_device_supports_op, /* .supports_buft = */ ggml_backend_remoting_device_supports_buft, /* .offload_op = */ ggml_backend_remoting_device_offload_op, From f0127dc942e23368e8d746fc6fcb2931a89c7263 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 2 Jun 2025 15:59:21 +0200 Subject: [PATCH 099/117] remoting: try host_pointer --- .../backend-dispatched-device.cpp | 1 + .../ggml-backend-buffer-type.cpp | 4 ++- .../ggml-backend-device.cpp | 4 +-- .../ggml-backend-host-buffer-type.cpp | 26 +++++++++---------- .../src/ggml-remotingfrontend/ggml-remoting.h | 1 + .../virtgpu-forward-device.cpp | 2 +- 6 files changed, 20 insertions(+), 18 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp index 5bf0788ccf864..13d32194b1668 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp @@ -131,6 +131,7 @@ backend_device_buffer_from_ptr(struct vn_cs_encoder *enc, struct vn_cs_decoder * ggml_backend_buffer_t buffer; buffer = dev->iface.buffer_from_host_ptr(dev, shmem_ptr, size, max_tensor_size); + INFO("HOST HANDLE is %p (size=%llx)", (void*)buffer, size); vn_encode_ggml_buffer(enc, buffer); vn_encode_ggml_buffer_type(enc, buffer->buft); diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp index 1cef55b620811..d462b23a0ad85 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp @@ -16,10 +16,12 @@ 
ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, context->gpu = gpu; context->apir_context = apir_buffer_type_alloc_buffer(gpu, buft, size); context->base = NULL; + context->is_host_buffer = false; + context->is_from_ptr = false; ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) context, size); INFO("##"); - INFO("## %s(%llx) --> %p", __func__, size, buffer); + INFO("## %s(%llx) --> %p <---------------", __func__, size, buffer); INFO("##\n"); return buffer; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index 190d0d77d6551..07c65276146f1 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -102,8 +102,8 @@ ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, struct ggml_backe // ignore the actual backend answers and set it as we provide it in // the API Remoting frontend props->caps.async = false; - props->caps.host_buffer = true; - props->caps.buffer_from_host_ptr = false; + props->caps.host_buffer = false; + props->caps.buffer_from_host_ptr = true; props->caps.events = false; #endif diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp index 20159faf3cae9..c09c80d6472f5 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp @@ -42,25 +42,23 @@ ggml_backend_remoting_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { static ggml_backend_buffer_t ggml_backend_remoting_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { IMPLEMENTED; - struct virtgpu *gpu = BUFT_TO_GPU(buft); - - struct ggml_backend_remoting_device_context *device_ctx = GET_DEVICE_CONTEXT(); - size += 32; // Behave like the CPU buffer type (dixit ggml-vulkan) - - struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, size); + struct virtgpu *gpu = BUFT_TO_GPU(buft); - if (!shmem) { - FATAL("Couldn't allocate the guest-host shared host buffer :/"); + struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) malloc(sizeof(*context)); + if (!context) { + FATAL("Couldn't allocate the buffer context ..."); } - void *ptr = shmem->mmap_ptr; - - device_ctx->shared_memory.push_back(std::make_tuple(ptr, size, shmem)); + context->gpu = gpu; + context->apir_context = apir_device_buffer_from_ptr(gpu, size, size); + context->base = context->apir_context.shmem->mmap_ptr; + context->is_host_buffer = true; - ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); - buffer->buft = buft; - buffer->iface.free_buffer = ggml_backend_remoting_host_buffer_free_buffer; + ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) context, size); + INFO("##"); + INFO("## %s(%llx) --> %p <======================", __func__, size, buffer); + INFO("##\n"); return buffer; } diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index 4da3b9432f1f8..18b880c740564 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -94,6 +94,7 @@ struct ggml_backend_remoting_buffer_context { void *base; + bool is_host_buffer; bool is_from_ptr; }; diff --git 
a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp index 9a2b6d7c501b4..0d74b55c2083c 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp @@ -228,7 +228,7 @@ apir_device_buffer_from_ptr(struct virtgpu *gpu, REMOTE_CALL(gpu, encoder, decoder); vn_decode_apir_buffer_host_handle_t(decoder, &buffer_context.host_handle); - + INFO("HOST HANDLE is %p (size=%llx)", (void*)buffer_context.host_handle, size); buffer_context.buft_host_handle = vn_decode_apir_buffer_type_host_handle(decoder); /* *** */ From 11e2ebaf641b7870633af1ee246e3f6dbb5ec5a0 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 2 Jun 2025 16:07:11 +0200 Subject: [PATCH 100/117] remoting: try from_host_ptr --- .../ggml-backend-buffer-type.cpp | 9 ++++ .../ggml-backend-buffer.cpp | 41 ++++++++++++++++ .../ggml-backend-device.cpp | 49 +++++++++++++++++-- 3 files changed, 95 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp index d462b23a0ad85..86ee8a8bf0f3b 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp @@ -70,4 +70,13 @@ const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = { /* .is_host = */ NULL, }; +const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_from_ptr_type_interface = { + /* .get_name = */ ggml_backend_remoting_buffer_type_get_name, + /* .alloc_buffer = */ NULL, + /* .get_alignment = */ ggml_backend_remoting_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_remoting_buffer_type_get_max_size, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .is_host = */ NULL, +}; + /****************************************************************************************/ diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index d35d5c9b66cd7..67dd06843495d 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -6,6 +6,9 @@ struct timer_data get_tensor_timer = {0, 0, 0, "get_tensor"}; struct timer_data set_tensor_timer = {0, 0, 0, "set_tensor"}; +struct timer_data get_tensor_from_ptr_timer = {0, 0, 0, "get_tensor_from_ptr"}; +struct timer_data set_tensor_from_ptr_timer = {0, 0, 0, "set_tensor_from_ptr"}; + static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) { IMPLEMENTED_ONCE; @@ -68,6 +71,32 @@ static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer stop_timer(&get_tensor_timer); } +static void ggml_backend_remoting_buffer_set_tensor_from_ptr(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + IMPLEMENTED_ONCE; + + start_timer(&set_tensor_from_ptr_timer); + + UNUSED(buffer); + + memcpy((char *)tensor->data + offset, data, size); + + stop_timer(&set_tensor_from_ptr_timer); + + return; +} + +static void ggml_backend_remoting_buffer_get_tensor_from_ptr(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + IMPLEMENTED_ONCE; + + UNUSED(buffer); + + start_timer(&get_tensor_from_ptr_timer); + + memcpy(data, (const char *)tensor->data + offset, size); + + stop_timer(&get_tensor_from_ptr_timer); +} + static bool 
ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { NOT_IMPLEMENTED; @@ -115,3 +144,15 @@ const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { /* .clear = */ ggml_backend_remoting_buffer_clear, /* .reset = */ NULL, }; + +const ggml_backend_buffer_i ggml_backend_remoting_buffer_from_ptr_interface = { + /* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer, + /* .get_base = */ ggml_backend_remoting_buffer_get_base, + /* .init_tensor = */ NULL, + /* .memset_tensor = */ ggml_backend_remoting_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor_from_ptr, + /* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor_from_ptr, + /* .cpy_tensor = */ ggml_backend_remoting_buffer_cpy_tensor, + /* .clear = */ ggml_backend_remoting_buffer_clear, + /* .reset = */ NULL, +}; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index 07c65276146f1..dfe1e992c9dac 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -103,11 +103,11 @@ ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, struct ggml_backe // the API Remoting frontend props->caps.async = false; props->caps.host_buffer = false; - props->caps.buffer_from_host_ptr = true; + props->caps.buffer_from_host_ptr = false; props->caps.events = false; #endif - INFO("%s: async=%d, host_buffer=%d, buffer_from_host_ptr=%d, events=%d", + INFO("%s: async=%d, host_buffer=%d!, buffer_from_host_ptr=%d!, events=%d", __func__, props->caps.async, props->caps.host_buffer, props->caps.buffer_from_host_ptr, props->caps.events); } @@ -129,6 +129,47 @@ ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { return &buft; } +static ggml_backend_buffer_type_t +ggml_backend_remoting_device_get_buffer_from_ptr_type(ggml_backend_dev_t dev) { + IMPLEMENTED_ONCE; + + struct virtgpu *gpu = DEV_TO_GPU(dev); + + apir_buffer_type_host_handle_t ctx = apir_device_get_buffer_type(gpu); + + static struct ggml_backend_buffer_type buft { + /* .iface = */ ggml_backend_remoting_buffer_from_ptr_type_interface, + /* .device = */ dev, + /* .context = */ (void *) ctx, + }; + + return &buft; +} + +static ggml_backend_buffer_t +ggml_backend_remoting_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { + + struct virtgpu *gpu = DEV_TO_GPU(dev); + + struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) malloc(sizeof(*context)); + if (!context) { + FATAL("Couldn't allocate the buffer context ..."); + } + + context->gpu = gpu; + context->apir_context = apir_device_buffer_from_ptr(gpu, size, max_tensor_size); + context->base = ptr; + context->is_from_ptr = true; + + ggml_backend_buffer_t buffer = ggml_backend_buffer_init(ggml_backend_remoting_device_get_buffer_from_ptr_type(dev), ggml_backend_remoting_buffer_from_ptr_interface, (void *) context, size); + + INFO("#"); + INFO("# %s(%p, %llx) --> %p", __func__, ptr, size, buffer); + INFO("#\n"); + + return buffer; +} + static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_host_buffer_type(ggml_backend_dev_t dev) { IMPLEMENTED_ONCE; @@ -150,8 +191,8 @@ const struct ggml_backend_device_i ggml_backend_remoting_device_interface = { /* .get_props = */ ggml_backend_remoting_device_get_props, /* .init_backend = */ ggml_backend_remoting_device_init, /* 
.get_buffer_type = */ ggml_backend_remoting_device_get_buffer_type, - /* .get_host_buffer_type = */ ggml_backend_remoting_device_get_host_buffer_type, - /* .buffer_from_host_ptr = */ NULL, + /* .get_host_buffer_type = */ NULL, + /* .buffer_from_host_ptr = */ ggml_backend_remoting_device_buffer_from_ptr, /* .supports_op = */ ggml_backend_remoting_device_supports_op, /* .supports_buft = */ ggml_backend_remoting_device_supports_buft, /* .offload_op = */ ggml_backend_remoting_device_offload_op, From efe68cace1c9b3671ca74aa9dcd4e78c028452d6 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 2 Jun 2025 16:29:47 +0200 Subject: [PATCH 101/117] remoting: make alloc_memory + alloc_from_host_ptr work :) --- .../backend-dispatched-device.cpp | 1 - .../ggml-backend-buffer-type.cpp | 15 ++++++++++++--- .../ggml-remotingfrontend/ggml-backend-buffer.cpp | 15 ++++++++++++--- .../virtgpu-forward-device.cpp | 3 --- 4 files changed, 24 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp index 13d32194b1668..5bf0788ccf864 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp @@ -131,7 +131,6 @@ backend_device_buffer_from_ptr(struct vn_cs_encoder *enc, struct vn_cs_decoder * ggml_backend_buffer_t buffer; buffer = dev->iface.buffer_from_host_ptr(dev, shmem_ptr, size, max_tensor_size); - INFO("HOST HANDLE is %p (size=%llx)", (void*)buffer, size); vn_encode_ggml_buffer(enc, buffer); vn_encode_ggml_buffer_type(enc, buffer->buft); diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp index 86ee8a8bf0f3b..70fc829c24fa4 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp @@ -14,10 +14,19 @@ ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, } context->gpu = gpu; - context->apir_context = apir_buffer_type_alloc_buffer(gpu, buft, size); - context->base = NULL; + + const int USE_FROM_PTR = true; + + if (USE_FROM_PTR) { + context->apir_context = apir_device_buffer_from_ptr(gpu, size, size); + context->base = context->apir_context.shmem->mmap_ptr; + context->is_from_ptr = true; + } else { + context->apir_context = apir_buffer_type_alloc_buffer(gpu, buft, size); + context->is_from_ptr = false; + context->base = NULL; + } context->is_host_buffer = false; - context->is_from_ptr = false; ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) context, size); INFO("##"); diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index 67dd06843495d..e720efcf47c69 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -52,7 +52,12 @@ static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer } INFO("\n"); #endif - apir_buffer_set_tensor(gpu, BUFFER_TO_APIR_CONTEXT(buffer), tensor, data, offset, size); + struct ggml_backend_remoting_buffer_context *context = BUFFER_TO_GGML_CONTEXT(buffer); + if (context->is_from_ptr) { + memcpy((char *)tensor->data + offset, data, size); + } else { + apir_buffer_set_tensor(gpu, BUFFER_TO_APIR_CONTEXT(buffer), tensor, data, offset, size); + } stop_timer(&set_tensor_timer); @@ -65,8 +70,12 
@@ static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer start_timer(&get_tensor_timer); struct virtgpu *gpu = BUFFER_TO_GPU(buffer); - - apir_buffer_get_tensor(gpu, BUFFER_TO_APIR_CONTEXT(buffer), tensor, data, offset, size); + struct ggml_backend_remoting_buffer_context *context = BUFFER_TO_GGML_CONTEXT(buffer); + if (context->is_from_ptr) { + memcpy(data, (const char *)tensor->data + offset, size); + } else { + apir_buffer_get_tensor(gpu, BUFFER_TO_APIR_CONTEXT(buffer), tensor, data, offset, size); + } stop_timer(&get_tensor_timer); } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp index 0d74b55c2083c..06ad6d445de4c 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp @@ -209,8 +209,6 @@ apir_device_buffer_from_ptr(struct virtgpu *gpu, struct vn_cs_decoder *decoder; apir_buffer_context_t buffer_context; - BEING_IMPLEMENTED; - REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR); /* *** */ @@ -228,7 +226,6 @@ apir_device_buffer_from_ptr(struct virtgpu *gpu, REMOTE_CALL(gpu, encoder, decoder); vn_decode_apir_buffer_host_handle_t(decoder, &buffer_context.host_handle); - INFO("HOST HANDLE is %p (size=%llx)", (void*)buffer_context.host_handle, size); buffer_context.buft_host_handle = vn_decode_apir_buffer_type_host_handle(decoder); /* *** */ From 3769bb4171fa53ecba56121b54e8266c0cc879fb Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 11 Jun 2025 09:51:05 +0200 Subject: [PATCH 102/117] build.backend: export SDKROOT to please apple compiler ... --- build.backend.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/build.backend.sh b/build.backend.sh index 863f98e3524a3..dc0b6007e3123 100755 --- a/build.backend.sh +++ b/build.backend.sh @@ -10,6 +10,8 @@ else FLAVOR="" fi +export SDKROOT=$(xcrun --sdk macosx --show-sdk-path) + if [[ "$FLAVOR" == "-prod" ]]; then cat < Date: Wed, 11 Jun 2025 09:52:53 +0200 Subject: [PATCH 103/117] prepare.backend.sh: more flags --- prepare.backend.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/prepare.backend.sh b/prepare.backend.sh index caed8223382e9..8bc5be19e9343 100755 --- a/prepare.backend.sh +++ b/prepare.backend.sh @@ -8,6 +8,8 @@ cmake -S . -B ../build.remoting-backend$FLAVOR \ -DGGML_REMOTINGBACKEND=ON \ -DGGML_NATIVE=OFF \ -DGGML_METAL=ON \ + -DGGML_BACKEND_DL=OFF \ + -DLLAMA_CURL=OFF \ -DGGML_VULKAN=OFF -DVulkan_INCLUDE_DIR=/opt/homebrew/include/ -DVulkan_LIBRARY=/opt/homebrew/lib/libMoltenVK.dylib \ "$@" From 7ef077e303ca3dbc44411067703d38a15dd235e1 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 11 Jun 2025 09:53:32 +0200 Subject: [PATCH 104/117] run.vulkan.sh: more flexible --- run.vulkan.sh | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/run.vulkan.sh b/run.vulkan.sh index 1cd38ea58ef52..a84d4831d478f 100755 --- a/run.vulkan.sh +++ b/run.vulkan.sh @@ -1,10 +1,23 @@ #! 
/bin/bash - -if [[ ${1:-} == "gdb" ]]; then +if [[ ${1:-} == "strace" ]]; then + prefix="strace" +elif [[ ${1:-} == "gdb" ]]; then prefix="gdb --args" else prefix="" fi -export VN_DEBUG=init -$prefix ../build.vulkan/bin/llama-run --ngl 99 --verbose ~/models/llama3.2 "say nothing" +rm -f /usr/lib64/libvulkan_virtio.so + +ICD_DIR=/Users/kevinpouget/.local/share/vulkan/icd.d + +USE_WORK_MESA=1 +if [[ "$USE_WORK_MESA" == 1 ]]; then + export VK_ICD_FILENAMES=$ICD_DIR/virtio_icd.aarch64.json +else + export VK_ICD_FILENAMES=$ICD_DIR/virtio_icd.good.aarch64.json +fi + +# init result vtest wsi no_abort log_ctx_info cache no_sparse no_gpl +export VN_DEBUG=vtest +$prefix ../build.vulkan/bin/llama-run --verbose ~/models/llama3.2 "say nothing" From 6d98572ea3b3b2a2824270b44634e49fd2159cf6 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 11 Jun 2025 09:53:44 +0200 Subject: [PATCH 105/117] run.remoting.sh: more flexible --- run.remoting.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/run.remoting.sh b/run.remoting.sh index 9a2a77f054210..9a8ce4d34c74a 100755 --- a/run.remoting.sh +++ b/run.remoting.sh @@ -1,6 +1,8 @@ #! /bin/bash #clear -if [[ ${1:-} == "gdb" ]]; then +if [[ ${1:-} == "strace" ]]; then + prefix="strace" +elif [[ ${1:-} == "gdb" ]]; then prefix="gdb --args" else prefix="" @@ -41,7 +43,7 @@ if [[ "$bench" == yes ]]; then --n-gpu-layers 99 else PROMPT="say nothing" - PROMPT="tell what's Apple metal API" + #PROMPT="tell what's Apple metal API" $prefix \ $LLAMA_BUILD_DIR/bin/llama-run \ --ngl 99 \ From 50326201f9fba299d6c11d5ea3cad026b9110239 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 11 Jun 2025 09:54:00 +0200 Subject: [PATCH 106/117] prepare.vulkan.sh: more details --- prepare.vulkan.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/prepare.vulkan.sh b/prepare.vulkan.sh index 29d0794ebe4e3..7bacf9b21a9ca 100644 --- a/prepare.vulkan.sh +++ b/prepare.vulkan.sh @@ -1 +1,6 @@ -cmake -S . -B ../build.vulkan -DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DGGML_METAL=OFF +cmake -S . 
\ + -B ../build.vulkan \ + -DGGML_VULKAN=ON \ + -DGGML_NATIVE=OFF \ + -DGGML_METAL=OFF \ + -DCMAKE_BUILD_TYPE=Debug From eeba619c63bfce4093fc66dac093f06e99ed8664 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 11 Jun 2025 09:55:45 +0200 Subject: [PATCH 107/117] ggml: src: ggml-remotingfrontend/virtgpu: don't include virglrenderer_hw.h --- ggml/src/ggml-remotingfrontend/virtgpu.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.h b/ggml/src/ggml-remotingfrontend/virtgpu.h index 26933c8a6eda4..32ad51237037c 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu.h @@ -18,9 +18,18 @@ #define VIRGL_RENDERER_UNSTABLE_APIS 1 #include "drm-uapi/virtgpu_drm.h" -#include "virglrenderer_hw.h" #include "venus_hw.h" +// must match https://gitlab.freedesktop.org/kpouget/virglrenderer/-/blob/main/src/virglrenderer_hw.h?ref_type=heads +enum virgl_renderer_capset { + VIRGL_RENDERER_CAPSET_VIRGL = 1, + VIRGL_RENDERER_CAPSET_VIRGL2 = 2, + /* 3 is reserved for gfxstream */ + VIRGL_RENDERER_CAPSET_VENUS = 4, + /* 5 is reserved for cross-domain */ + VIRGL_RENDERER_CAPSET_DRM = 6, +}; + /* from src/virtio/vulkan/vn_renderer_virtgpu.c */ #define VIRTGPU_PCI_VENDOR_ID 0x1af4 #define VIRTGPU_PCI_DEVICE_ID 0x1050 From 66b34d685eea20ada00ad6f2a61d36fd5cea1939 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 11 Jun 2025 09:58:42 +0200 Subject: [PATCH 108/117] ggml: src: ggml-remotingfrontend/virtgpu: don't use absolute paths in include --- ggml/src/ggml-remotingfrontend/virtgpu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.h b/ggml/src/ggml-remotingfrontend/virtgpu.h index 32ad51237037c..9d8668c3d070e 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu.h @@ -11,8 +11,8 @@ #include "virtgpu-forward.h" #include "virtgpu-utils.h" -#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/api_remoting.h" -#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/venus_cs.h" +#include "../ggml-remotingbackend/shared/api_remoting.h" +#include "../ggml-remotingbackend/shared/venus_cs.h" #include "virtgpu-shm.h" From 5b5ffec30bcd2592a8621aa67db82472b573b19e Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 11 Jun 2025 14:13:02 +0200 Subject: [PATCH 109/117] remoting: rewrite to avoid hard-coded paths --- ggml/src/ggml-remotingbackend/backend.cpp | 45 +- .../shared/api_remoting.h | 1 - .../shared/apir_backend.h | 16 +- ggml/src/ggml-remotingfrontend/CMakeLists.txt | 10 +- .../include/drm-uapi/drm.h | 1408 +++++++++++++++++ .../include/drm-uapi/virtgpu_drm.h | 276 ++++ .../ggml-remotingfrontend/include/venus_hw.h | 74 + .../virtgpu-forward-impl.h | 4 +- .../ggml-remotingfrontend/virtgpu-forward.h | 2 +- 9 files changed, 1794 insertions(+), 42 deletions(-) create mode 100644 ggml/src/ggml-remotingfrontend/include/drm-uapi/drm.h create mode 100644 ggml/src/ggml-remotingfrontend/include/drm-uapi/virtgpu_drm.h create mode 100644 ggml/src/ggml-remotingfrontend/include/venus_hw.h diff --git a/ggml/src/ggml-remotingbackend/backend.cpp b/ggml/src/ggml-remotingbackend/backend.cpp index f5a10c234644a..95dee556cff3f 100644 --- a/ggml/src/ggml-remotingbackend/backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend.cpp @@ -10,17 +10,10 @@ #include "shared/apir_backend.h" #include "shared/venus_cs.h" -#define USE_METAL 1 - -#if USE_METAL 
-#define GGML_BACKEND_LIBRARY_PATH "/Users/kevinpouget/remoting/llama_cpp/build.remoting-backend/bin/libggml-metal.dylib" -#define GGML_BACKEND_REG_FCT_NAME "ggml_backend_metal_reg" -#define GGML_BACKEND_INIT_FCT_NAME "ggml_backend_metal_init" -#else -#define GGML_BACKEND_LIBRARY_PATH "/Users/kevinpouget/remoting/llama_cpp/build.remoting-backend/bin/libggml-vulkan.dylib" -#define GGML_BACKEND_REG_FCT_NAME "ggml_backend_vk_reg" -#define GGML_BACKEND_INIT_FCT_NAME "ggml_backend_vk_init" -#endif +#define GGML_BACKEND_LIBRARY_PATH_ENV "APIR_LLAMA_CPP_GGML_LIBRARY_PATH" +#define GGML_BACKEND_LIBRARY_REG_ENV "APIR_LLAMA_CPP_GGML_LIBRARY_REG" +#define GGML_BACKEND_LIBRARY_INIT_ENV "APIR_LLAMA_CPP_GGML_LIBRARY_INIT" + static void *backend_library_handle = NULL; @@ -52,9 +45,19 @@ extern "C" { uint32_t apir_backend_initialize() { const char* dlsym_error; - INFO("%s: hello " GGML_BACKEND_REG_FCT_NAME " :wave: \\o/", __func__); + const char* library_name = getenv(GGML_BACKEND_LIBRARY_PATH_ENV); + const char* library_reg = getenv(GGML_BACKEND_LIBRARY_REG_ENV); + const char* library_init = getenv(GGML_BACKEND_LIBRARY_INIT_ENV); + + INFO("%s: loading %s (%s|%s)", __func__, library_name, library_reg, library_init); + + if (!library_name) { + ERROR("Cannot open library: env var '%s' not defined\n", GGML_BACKEND_LIBRARY_PATH_ENV); - backend_library_handle = dlopen(GGML_BACKEND_LIBRARY_PATH, RTLD_LAZY); + return APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY; + } + + backend_library_handle = dlopen(library_name, RTLD_LAZY); if (!backend_library_handle) { ERROR("Cannot open library: %s\n", dlerror()); @@ -62,7 +65,13 @@ extern "C" { return APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY; } - void *ggml_backend_reg_fct = dlsym(backend_library_handle, GGML_BACKEND_REG_FCT_NAME); + if (!library_reg) { + ERROR("Cannot register library: env var '%s' not defined\n", GGML_BACKEND_LIBRARY_REG_ENV); + + return APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY; + } + + void *ggml_backend_reg_fct = dlsym(backend_library_handle, library_reg); dlsym_error = dlerror(); if (dlsym_error) { ERROR("Cannot load symbol: %s\n", dlsym_error); @@ -70,7 +79,13 @@ extern "C" { return APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS; } - void *ggml_backend_init_fct = dlsym(backend_library_handle, GGML_BACKEND_INIT_FCT_NAME); + if (!library_init) { + ERROR("Cannot initialize library: env var '%s' not defined\n", library_init); + + return APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY; + } + + void *ggml_backend_init_fct = dlsym(backend_library_handle, library_init); dlsym_error = dlerror(); if (dlsym_error) { ERROR("Cannot load symbol: %s\n", dlsym_error); diff --git a/ggml/src/ggml-remotingbackend/shared/api_remoting.h b/ggml/src/ggml-remotingbackend/shared/api_remoting.h index 1df5498c29c03..6e594a8ae4ab8 100644 --- a/ggml/src/ggml-remotingbackend/shared/api_remoting.h +++ b/ggml/src/ggml-remotingbackend/shared/api_remoting.h @@ -1,4 +1,3 @@ - #define VIRGL_APIR_COMMAND_TYPE_LoadLibrary 255 #define VIRGL_APIR_COMMAND_TYPE_Forward 256 diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index efd0803a929d5..4146908813c6d 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -1,10 +1,5 @@ #pragma once -#define APIR_LIBRARY_PATH "/Users/kevinpouget/remoting/llama_cpp/build.remoting-backend-prod/bin/libggml-remotingbackend.dylib" -#define APIR_INITIALIZE_FCT_NAME "apir_backend_initialize" -#define 
APIR_DEINIT_FCT_NAME "apir_backend_deinit" -#define APIR_DISPATCH_FCT_NAME "apir_backend_dispatcher" - #define APIR_BACKEND_INITIALIZE_SUCCESSS 0 #define APIR_BACKEND_INITIALIZE_CANNOT_OPEN_BACKEND_LIBRARY 1 #define APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY 2 @@ -24,18 +19,9 @@ typedef struct { apir_buffer_type_host_handle_t buft_host_handle; } apir_buffer_context_t; -typedef uint32_t (*apir_backend_initialize_t)(void); -typedef void (*apir_backend_deinit_t)(void); - struct vn_dispatch_context; struct virgl_apir_context; -typedef uint32_t (*apir_backend_dispatch_t)(uint32_t cmd_type, struct virgl_apir_context *ctx, - char *dec_cur, const char *dec_end, - char *enc_cur, const char *enc_end, - char **enc_cur_after - ); - typedef enum ApirBackendCommandType { /* device */ APIR_COMMAND_TYPE_DEVICE_GET_COUNT = 0, @@ -72,7 +58,7 @@ typedef enum ApirBackendCommandType { struct virgl_apir_callbacks { void *(*get_shmem_ptr)(struct vn_dispatch_context *ctx, uint32_t res_id); -} ; +}; struct virgl_apir_context { struct vn_dispatch_context *virgl_ctx; diff --git a/ggml/src/ggml-remotingfrontend/CMakeLists.txt b/ggml/src/ggml-remotingfrontend/CMakeLists.txt index 15b338f730176..f3f3dea652cf9 100644 --- a/ggml/src/ggml-remotingfrontend/CMakeLists.txt +++ b/ggml/src/ggml-remotingfrontend/CMakeLists.txt @@ -22,17 +22,11 @@ ggml_add_backend_library(ggml-remotingfrontend venus_cs_ggml-rpc-front.cpp ) +# dnf install -y libdrm-devel target_link_libraries(ggml-remotingfrontend PUBLIC drm) target_include_directories(ggml-remotingfrontend PUBLIC /usr/include/libdrm/) +target_include_directories(ggml-remotingfrontend PUBLIC ./include) -set(REMOTING_PROJECT /Users/kevinpouget/remoting) -set(MESA_PROJECT_HOME ${REMOTING_PROJECT}/mesa) -set(MESA_PROJECT_SRC ${MESA_PROJECT_HOME}/src) - -target_include_directories(ggml-remotingfrontend PUBLIC ${MESA_PROJECT_SRC}/virtio/virtio-gpu/) -target_include_directories(ggml-remotingfrontend PUBLIC ${MESA_PROJECT_HOME}/include) target_include_directories(ggml-remotingfrontend PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) target_compile_options(ggml-remotingfrontend PRIVATE -std=c++20) - -# dnf install -y libdrm-devel diff --git a/ggml/src/ggml-remotingfrontend/include/drm-uapi/drm.h b/ggml/src/ggml-remotingfrontend/include/drm-uapi/drm.h new file mode 100644 index 0000000000000..4e4f7c2c39e4f --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/include/drm-uapi/drm.h @@ -0,0 +1,1408 @@ +/* + * Header for the Direct Rendering Manager + * + * Author: Rickard E. (Rik) Faith + * + * Acknowledgments: + * Dec 1999, Richard Henderson , move to generic cmpxchg. + */ + +/* + * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas. + * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California. + * All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _DRM_H_ +#define _DRM_H_ + +#if defined(__linux__) + +#include +#include +typedef unsigned int drm_handle_t; + +#else /* One of the BSDs */ + +#include +#include +#include +typedef int8_t __s8; +typedef uint8_t __u8; +typedef int16_t __s16; +typedef uint16_t __u16; +typedef int32_t __s32; +typedef uint32_t __u32; +typedef int64_t __s64; +typedef uint64_t __u64; +typedef size_t __kernel_size_t; +typedef unsigned long drm_handle_t; + +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +#define DRM_NAME "drm" /**< Name in kernel, /dev, and /proc */ +#define DRM_MIN_ORDER 5 /**< At least 2^5 bytes = 32 bytes */ +#define DRM_MAX_ORDER 22 /**< Up to 2^22 bytes = 4MB */ +#define DRM_RAM_PERCENT 10 /**< How much system ram can we lock? */ + +#define _DRM_LOCK_HELD 0x80000000U /**< Hardware lock is held */ +#define _DRM_LOCK_CONT 0x40000000U /**< Hardware lock is contended */ +#define _DRM_LOCK_IS_HELD(lock) ((lock) & _DRM_LOCK_HELD) +#define _DRM_LOCK_IS_CONT(lock) ((lock) & _DRM_LOCK_CONT) +#define _DRM_LOCKING_CONTEXT(lock) ((lock) & ~(_DRM_LOCK_HELD|_DRM_LOCK_CONT)) + +typedef unsigned int drm_context_t; +typedef unsigned int drm_drawable_t; +typedef unsigned int drm_magic_t; + +/* + * Cliprect. + * + * \warning: If you change this structure, make sure you change + * XF86DRIClipRectRec in the server as well + * + * \note KW: Actually it's illegal to change either for + * backwards-compatibility reasons. + */ +struct drm_clip_rect { + unsigned short x1; + unsigned short y1; + unsigned short x2; + unsigned short y2; +}; + +/* + * Drawable information. + */ +struct drm_drawable_info { + unsigned int num_rects; + struct drm_clip_rect *rects; +}; + +/* + * Texture region, + */ +struct drm_tex_region { + unsigned char next; + unsigned char prev; + unsigned char in_use; + unsigned char padding; + unsigned int age; +}; + +/* + * Hardware lock. + * + * The lock structure is a simple cache-line aligned integer. To avoid + * processor bus contention on a multiprocessor system, there should not be any + * other data stored in the same cache line. + */ +struct drm_hw_lock { + __volatile__ unsigned int lock; /**< lock variable */ + char padding[60]; /**< Pad to cache line */ +}; + +/* + * DRM_IOCTL_VERSION ioctl argument type. + * + * \sa drmGetVersion(). + */ +struct drm_version { + int version_major; /**< Major version */ + int version_minor; /**< Minor version */ + int version_patchlevel; /**< Patch level */ + __kernel_size_t name_len; /**< Length of name buffer */ + char *name; /**< Name of driver */ + __kernel_size_t date_len; /**< Length of date buffer */ + char *date; /**< User-space buffer to hold date */ + __kernel_size_t desc_len; /**< Length of desc buffer */ + char *desc; /**< User-space buffer to hold desc */ +}; + +/* + * DRM_IOCTL_GET_UNIQUE ioctl argument type. + * + * \sa drmGetBusid() and drmSetBusId(). 
+ */ +struct drm_unique { + __kernel_size_t unique_len; /**< Length of unique */ + char *unique; /**< Unique name for driver instantiation */ +}; + +struct drm_list { + int count; /**< Length of user-space structures */ + struct drm_version *version; +}; + +struct drm_block { + int unused; +}; + +/* + * DRM_IOCTL_CONTROL ioctl argument type. + * + * \sa drmCtlInstHandler() and drmCtlUninstHandler(). + */ +struct drm_control { + enum { + DRM_ADD_COMMAND, + DRM_RM_COMMAND, + DRM_INST_HANDLER, + DRM_UNINST_HANDLER + } func; + int irq; +}; + +/* + * Type of memory to map. + */ +enum drm_map_type { + _DRM_FRAME_BUFFER = 0, /**< WC (no caching), no core dump */ + _DRM_REGISTERS = 1, /**< no caching, no core dump */ + _DRM_SHM = 2, /**< shared, cached */ + _DRM_AGP = 3, /**< AGP/GART */ + _DRM_SCATTER_GATHER = 4, /**< Scatter/gather memory for PCI DMA */ + _DRM_CONSISTENT = 5 /**< Consistent memory for PCI DMA */ +}; + +/* + * Memory mapping flags. + */ +enum drm_map_flags { + _DRM_RESTRICTED = 0x01, /**< Cannot be mapped to user-virtual */ + _DRM_READ_ONLY = 0x02, + _DRM_LOCKED = 0x04, /**< shared, cached, locked */ + _DRM_KERNEL = 0x08, /**< kernel requires access */ + _DRM_WRITE_COMBINING = 0x10, /**< use write-combining if available */ + _DRM_CONTAINS_LOCK = 0x20, /**< SHM page that contains lock */ + _DRM_REMOVABLE = 0x40, /**< Removable mapping */ + _DRM_DRIVER = 0x80 /**< Managed by driver */ +}; + +struct drm_ctx_priv_map { + unsigned int ctx_id; /**< Context requesting private mapping */ + void *handle; /**< Handle of map */ +}; + +/* + * DRM_IOCTL_GET_MAP, DRM_IOCTL_ADD_MAP and DRM_IOCTL_RM_MAP ioctls + * argument type. + * + * \sa drmAddMap(). + */ +struct drm_map { + unsigned long offset; /**< Requested physical address (0 for SAREA)*/ + unsigned long size; /**< Requested physical size (bytes) */ + enum drm_map_type type; /**< Type of memory to map */ + enum drm_map_flags flags; /**< Flags */ + void *handle; /**< User-space: "Handle" to pass to mmap() */ + /**< Kernel-space: kernel-virtual address */ + int mtrr; /**< MTRR slot used */ + /* Private data */ +}; + +/* + * DRM_IOCTL_GET_CLIENT ioctl argument type. + */ +struct drm_client { + int idx; /**< Which client desired? */ + int auth; /**< Is client authenticated? */ + unsigned long pid; /**< Process ID */ + unsigned long uid; /**< User ID */ + unsigned long magic; /**< Magic */ + unsigned long iocs; /**< Ioctl count */ +}; + +enum drm_stat_type { + _DRM_STAT_LOCK, + _DRM_STAT_OPENS, + _DRM_STAT_CLOSES, + _DRM_STAT_IOCTLS, + _DRM_STAT_LOCKS, + _DRM_STAT_UNLOCKS, + _DRM_STAT_VALUE, /**< Generic value */ + _DRM_STAT_BYTE, /**< Generic byte counter (1024bytes/K) */ + _DRM_STAT_COUNT, /**< Generic non-byte counter (1000/k) */ + + _DRM_STAT_IRQ, /**< IRQ */ + _DRM_STAT_PRIMARY, /**< Primary DMA bytes */ + _DRM_STAT_SECONDARY, /**< Secondary DMA bytes */ + _DRM_STAT_DMA, /**< DMA */ + _DRM_STAT_SPECIAL, /**< Special DMA (e.g., priority or polled) */ + _DRM_STAT_MISSED /**< Missed DMA opportunity */ + /* Add to the *END* of the list */ +}; + +/* + * DRM_IOCTL_GET_STATS ioctl argument type. + */ +struct drm_stats { + unsigned long count; + struct { + unsigned long value; + enum drm_stat_type type; + } data[15]; +}; + +/* + * Hardware locking flags. 
+ */ +enum drm_lock_flags { + _DRM_LOCK_READY = 0x01, /**< Wait until hardware is ready for DMA */ + _DRM_LOCK_QUIESCENT = 0x02, /**< Wait until hardware quiescent */ + _DRM_LOCK_FLUSH = 0x04, /**< Flush this context's DMA queue first */ + _DRM_LOCK_FLUSH_ALL = 0x08, /**< Flush all DMA queues first */ + /* These *HALT* flags aren't supported yet + -- they will be used to support the + full-screen DGA-like mode. */ + _DRM_HALT_ALL_QUEUES = 0x10, /**< Halt all current and future queues */ + _DRM_HALT_CUR_QUEUES = 0x20 /**< Halt all current queues */ +}; + +/* + * DRM_IOCTL_LOCK, DRM_IOCTL_UNLOCK and DRM_IOCTL_FINISH ioctl argument type. + * + * \sa drmGetLock() and drmUnlock(). + */ +struct drm_lock { + int context; + enum drm_lock_flags flags; +}; + +/* + * DMA flags + * + * \warning + * These values \e must match xf86drm.h. + * + * \sa drm_dma. + */ +enum drm_dma_flags { + /* Flags for DMA buffer dispatch */ + _DRM_DMA_BLOCK = 0x01, /**< + * Block until buffer dispatched. + * + * \note The buffer may not yet have + * been processed by the hardware -- + * getting a hardware lock with the + * hardware quiescent will ensure + * that the buffer has been + * processed. + */ + _DRM_DMA_WHILE_LOCKED = 0x02, /**< Dispatch while lock held */ + _DRM_DMA_PRIORITY = 0x04, /**< High priority dispatch */ + + /* Flags for DMA buffer request */ + _DRM_DMA_WAIT = 0x10, /**< Wait for free buffers */ + _DRM_DMA_SMALLER_OK = 0x20, /**< Smaller-than-requested buffers OK */ + _DRM_DMA_LARGER_OK = 0x40 /**< Larger-than-requested buffers OK */ +}; + +/* + * DRM_IOCTL_ADD_BUFS and DRM_IOCTL_MARK_BUFS ioctl argument type. + * + * \sa drmAddBufs(). + */ +struct drm_buf_desc { + int count; /**< Number of buffers of this size */ + int size; /**< Size in bytes */ + int low_mark; /**< Low water mark */ + int high_mark; /**< High water mark */ + enum { + _DRM_PAGE_ALIGN = 0x01, /**< Align on page boundaries for DMA */ + _DRM_AGP_BUFFER = 0x02, /**< Buffer is in AGP space */ + _DRM_SG_BUFFER = 0x04, /**< Scatter/gather memory buffer */ + _DRM_FB_BUFFER = 0x08, /**< Buffer is in frame buffer */ + _DRM_PCI_BUFFER_RO = 0x10 /**< Map PCI DMA buffer read-only */ + } flags; + unsigned long agp_start; /**< + * Start address of where the AGP buffers are + * in the AGP aperture + */ +}; + +/* + * DRM_IOCTL_INFO_BUFS ioctl argument type. + */ +struct drm_buf_info { + int count; /**< Entries in list */ + struct drm_buf_desc *list; +}; + +/* + * DRM_IOCTL_FREE_BUFS ioctl argument type. + */ +struct drm_buf_free { + int count; + int *list; +}; + +/* + * Buffer information + * + * \sa drm_buf_map. + */ +struct drm_buf_pub { + int idx; /**< Index into the master buffer list */ + int total; /**< Buffer size */ + int used; /**< Amount of buffer in use (for DMA) */ + void *address; /**< Address of buffer */ +}; + +/* + * DRM_IOCTL_MAP_BUFS ioctl argument type. + */ +struct drm_buf_map { + int count; /**< Length of the buffer list */ +#ifdef __cplusplus + void *virt; +#else + void *virtual; /**< Mmap'd area in user-virtual */ +#endif + struct drm_buf_pub *list; /**< Buffer information */ +}; + +/* + * DRM_IOCTL_DMA ioctl argument type. + * + * Indices here refer to the offset into the buffer list in drm_buf_get. + * + * \sa drmDMA(). 
+ */ +struct drm_dma { + int context; /**< Context handle */ + int send_count; /**< Number of buffers to send */ + int *send_indices; /**< List of handles to buffers */ + int *send_sizes; /**< Lengths of data to send */ + enum drm_dma_flags flags; /**< Flags */ + int request_count; /**< Number of buffers requested */ + int request_size; /**< Desired size for buffers */ + int *request_indices; /**< Buffer information */ + int *request_sizes; + int granted_count; /**< Number of buffers granted */ +}; + +enum drm_ctx_flags { + _DRM_CONTEXT_PRESERVED = 0x01, + _DRM_CONTEXT_2DONLY = 0x02 +}; + +/* + * DRM_IOCTL_ADD_CTX ioctl argument type. + * + * \sa drmCreateContext() and drmDestroyContext(). + */ +struct drm_ctx { + drm_context_t handle; + enum drm_ctx_flags flags; +}; + +/* + * DRM_IOCTL_RES_CTX ioctl argument type. + */ +struct drm_ctx_res { + int count; + struct drm_ctx *contexts; +}; + +/* + * DRM_IOCTL_ADD_DRAW and DRM_IOCTL_RM_DRAW ioctl argument type. + */ +struct drm_draw { + drm_drawable_t handle; +}; + +/* + * DRM_IOCTL_UPDATE_DRAW ioctl argument type. + */ +typedef enum { + DRM_DRAWABLE_CLIPRECTS +} drm_drawable_info_type_t; + +struct drm_update_draw { + drm_drawable_t handle; + unsigned int type; + unsigned int num; + unsigned long long data; +}; + +/* + * DRM_IOCTL_GET_MAGIC and DRM_IOCTL_AUTH_MAGIC ioctl argument type. + */ +struct drm_auth { + drm_magic_t magic; +}; + +/* + * DRM_IOCTL_IRQ_BUSID ioctl argument type. + * + * \sa drmGetInterruptFromBusID(). + */ +struct drm_irq_busid { + int irq; /**< IRQ number */ + int busnum; /**< bus number */ + int devnum; /**< device number */ + int funcnum; /**< function number */ +}; + +enum drm_vblank_seq_type { + _DRM_VBLANK_ABSOLUTE = 0x0, /**< Wait for specific vblank sequence number */ + _DRM_VBLANK_RELATIVE = 0x1, /**< Wait for given number of vblanks */ + /* bits 1-6 are reserved for high crtcs */ + _DRM_VBLANK_HIGH_CRTC_MASK = 0x0000003e, + _DRM_VBLANK_EVENT = 0x4000000, /**< Send event instead of blocking */ + _DRM_VBLANK_FLIP = 0x8000000, /**< Scheduled buffer swap should flip */ + _DRM_VBLANK_NEXTONMISS = 0x10000000, /**< If missed, wait for next vblank */ + _DRM_VBLANK_SECONDARY = 0x20000000, /**< Secondary display controller */ + _DRM_VBLANK_SIGNAL = 0x40000000 /**< Send signal instead of blocking, unsupported */ +}; +#define _DRM_VBLANK_HIGH_CRTC_SHIFT 1 + +#define _DRM_VBLANK_TYPES_MASK (_DRM_VBLANK_ABSOLUTE | _DRM_VBLANK_RELATIVE) +#define _DRM_VBLANK_FLAGS_MASK (_DRM_VBLANK_EVENT | _DRM_VBLANK_SIGNAL | \ + _DRM_VBLANK_SECONDARY | _DRM_VBLANK_NEXTONMISS) + +struct drm_wait_vblank_request { + enum drm_vblank_seq_type type; + unsigned int sequence; + unsigned long signal; +}; + +struct drm_wait_vblank_reply { + enum drm_vblank_seq_type type; + unsigned int sequence; + long tval_sec; + long tval_usec; +}; + +/* + * DRM_IOCTL_WAIT_VBLANK ioctl argument type. + * + * \sa drmWaitVBlank(). + */ +union drm_wait_vblank { + struct drm_wait_vblank_request request; + struct drm_wait_vblank_reply reply; +}; + +#define _DRM_PRE_MODESET 1 +#define _DRM_POST_MODESET 2 + +/* + * DRM_IOCTL_MODESET_CTL ioctl argument type + * + * \sa drmModesetCtl(). + */ +struct drm_modeset_ctl { + __u32 crtc; + __u32 cmd; +}; + +/* + * DRM_IOCTL_AGP_ENABLE ioctl argument type. + * + * \sa drmAgpEnable(). + */ +struct drm_agp_mode { + unsigned long mode; /**< AGP mode */ +}; + +/* + * DRM_IOCTL_AGP_ALLOC and DRM_IOCTL_AGP_FREE ioctls argument type. + * + * \sa drmAgpAlloc() and drmAgpFree(). 
+ */ +struct drm_agp_buffer { + unsigned long size; /**< In bytes -- will round to page boundary */ + unsigned long handle; /**< Used for binding / unbinding */ + unsigned long type; /**< Type of memory to allocate */ + unsigned long physical; /**< Physical used by i810 */ +}; + +/* + * DRM_IOCTL_AGP_BIND and DRM_IOCTL_AGP_UNBIND ioctls argument type. + * + * \sa drmAgpBind() and drmAgpUnbind(). + */ +struct drm_agp_binding { + unsigned long handle; /**< From drm_agp_buffer */ + unsigned long offset; /**< In bytes -- will round to page boundary */ +}; + +/* + * DRM_IOCTL_AGP_INFO ioctl argument type. + * + * \sa drmAgpVersionMajor(), drmAgpVersionMinor(), drmAgpGetMode(), + * drmAgpBase(), drmAgpSize(), drmAgpMemoryUsed(), drmAgpMemoryAvail(), + * drmAgpVendorId() and drmAgpDeviceId(). + */ +struct drm_agp_info { + int agp_version_major; + int agp_version_minor; + unsigned long mode; + unsigned long aperture_base; /* physical address */ + unsigned long aperture_size; /* bytes */ + unsigned long memory_allowed; /* bytes */ + unsigned long memory_used; + + /* PCI information */ + unsigned short id_vendor; + unsigned short id_device; +}; + +/* + * DRM_IOCTL_SG_ALLOC ioctl argument type. + */ +struct drm_scatter_gather { + unsigned long size; /**< In bytes -- will round to page boundary */ + unsigned long handle; /**< Used for mapping / unmapping */ +}; + +/* + * DRM_IOCTL_SET_VERSION ioctl argument type. + */ +struct drm_set_version { + int drm_di_major; + int drm_di_minor; + int drm_dd_major; + int drm_dd_minor; +}; + +/* DRM_IOCTL_GEM_CLOSE ioctl argument type */ +struct drm_gem_close { + /** Handle of the object to be closed. */ + __u32 handle; + __u32 pad; +}; + +/* DRM_IOCTL_GEM_FLINK ioctl argument type */ +struct drm_gem_flink { + /** Handle for the object being named */ + __u32 handle; + + /** Returned global name */ + __u32 name; +}; + +/* DRM_IOCTL_GEM_OPEN ioctl argument type */ +struct drm_gem_open { + /** Name of object being opened */ + __u32 name; + + /** Returned handle for the object */ + __u32 handle; + + /** Returned size of the object */ + __u64 size; +}; + +/** + * DRM_CAP_DUMB_BUFFER + * + * If set to 1, the driver supports creating dumb buffers via the + * &DRM_IOCTL_MODE_CREATE_DUMB ioctl. + */ +#define DRM_CAP_DUMB_BUFFER 0x1 +/** + * DRM_CAP_VBLANK_HIGH_CRTC + * + * If set to 1, the kernel supports specifying a :ref:`CRTC index` + * in the high bits of &drm_wait_vblank_request.type. + * + * Starting kernel version 2.6.39, this capability is always set to 1. + */ +#define DRM_CAP_VBLANK_HIGH_CRTC 0x2 +/** + * DRM_CAP_DUMB_PREFERRED_DEPTH + * + * The preferred bit depth for dumb buffers. + * + * The bit depth is the number of bits used to indicate the color of a single + * pixel excluding any padding. This is different from the number of bits per + * pixel. For instance, XRGB8888 has a bit depth of 24 but has 32 bits per + * pixel. + * + * Note that this preference only applies to dumb buffers, it's irrelevant for + * other types of buffers. + */ +#define DRM_CAP_DUMB_PREFERRED_DEPTH 0x3 +/** + * DRM_CAP_DUMB_PREFER_SHADOW + * + * If set to 1, the driver prefers userspace to render to a shadow buffer + * instead of directly rendering to a dumb buffer. For best speed, userspace + * should do streaming ordered memory copies into the dumb buffer and never + * read from it. + * + * Note that this preference only applies to dumb buffers, it's irrelevant for + * other types of buffers. 
+ */ +#define DRM_CAP_DUMB_PREFER_SHADOW 0x4 +/** + * DRM_CAP_PRIME + * + * Bitfield of supported PRIME sharing capabilities. See &DRM_PRIME_CAP_IMPORT + * and &DRM_PRIME_CAP_EXPORT. + * + * Starting from kernel version 6.6, both &DRM_PRIME_CAP_IMPORT and + * &DRM_PRIME_CAP_EXPORT are always advertised. + * + * PRIME buffers are exposed as dma-buf file descriptors. + * See :ref:`prime_buffer_sharing`. + */ +#define DRM_CAP_PRIME 0x5 +/** + * DRM_PRIME_CAP_IMPORT + * + * If this bit is set in &DRM_CAP_PRIME, the driver supports importing PRIME + * buffers via the &DRM_IOCTL_PRIME_FD_TO_HANDLE ioctl. + * + * Starting from kernel version 6.6, this bit is always set in &DRM_CAP_PRIME. + */ +#define DRM_PRIME_CAP_IMPORT 0x1 +/** + * DRM_PRIME_CAP_EXPORT + * + * If this bit is set in &DRM_CAP_PRIME, the driver supports exporting PRIME + * buffers via the &DRM_IOCTL_PRIME_HANDLE_TO_FD ioctl. + * + * Starting from kernel version 6.6, this bit is always set in &DRM_CAP_PRIME. + */ +#define DRM_PRIME_CAP_EXPORT 0x2 +/** + * DRM_CAP_TIMESTAMP_MONOTONIC + * + * If set to 0, the kernel will report timestamps with ``CLOCK_REALTIME`` in + * struct drm_event_vblank. If set to 1, the kernel will report timestamps with + * ``CLOCK_MONOTONIC``. See ``clock_gettime(2)`` for the definition of these + * clocks. + * + * Starting from kernel version 2.6.39, the default value for this capability + * is 1. Starting kernel version 4.15, this capability is always set to 1. + */ +#define DRM_CAP_TIMESTAMP_MONOTONIC 0x6 +/** + * DRM_CAP_ASYNC_PAGE_FLIP + * + * If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC for legacy + * page-flips. + */ +#define DRM_CAP_ASYNC_PAGE_FLIP 0x7 +/** + * DRM_CAP_CURSOR_WIDTH + * + * The ``CURSOR_WIDTH`` and ``CURSOR_HEIGHT`` capabilities return a valid + * width x height combination for the hardware cursor. The intention is that a + * hardware agnostic userspace can query a cursor plane size to use. + * + * Note that the cross-driver contract is to merely return a valid size; + * drivers are free to attach another meaning on top, eg. i915 returns the + * maximum plane size. + */ +#define DRM_CAP_CURSOR_WIDTH 0x8 +/** + * DRM_CAP_CURSOR_HEIGHT + * + * See &DRM_CAP_CURSOR_WIDTH. + */ +#define DRM_CAP_CURSOR_HEIGHT 0x9 +/** + * DRM_CAP_ADDFB2_MODIFIERS + * + * If set to 1, the driver supports supplying modifiers in the + * &DRM_IOCTL_MODE_ADDFB2 ioctl. + */ +#define DRM_CAP_ADDFB2_MODIFIERS 0x10 +/** + * DRM_CAP_PAGE_FLIP_TARGET + * + * If set to 1, the driver supports the &DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE and + * &DRM_MODE_PAGE_FLIP_TARGET_RELATIVE flags in + * &drm_mode_crtc_page_flip_target.flags for the &DRM_IOCTL_MODE_PAGE_FLIP + * ioctl. + */ +#define DRM_CAP_PAGE_FLIP_TARGET 0x11 +/** + * DRM_CAP_CRTC_IN_VBLANK_EVENT + * + * If set to 1, the kernel supports reporting the CRTC ID in + * &drm_event_vblank.crtc_id for the &DRM_EVENT_VBLANK and + * &DRM_EVENT_FLIP_COMPLETE events. + * + * Starting kernel version 4.12, this capability is always set to 1. + */ +#define DRM_CAP_CRTC_IN_VBLANK_EVENT 0x12 +/** + * DRM_CAP_SYNCOBJ + * + * If set to 1, the driver supports sync objects. See :ref:`drm_sync_objects`. + */ +#define DRM_CAP_SYNCOBJ 0x13 +/** + * DRM_CAP_SYNCOBJ_TIMELINE + * + * If set to 1, the driver supports timeline operations on sync objects. See + * :ref:`drm_sync_objects`. + */ +#define DRM_CAP_SYNCOBJ_TIMELINE 0x14 +/** + * DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP + * + * If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC for atomic + * commits. 
+ */ +#define DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP 0x15 + +/* DRM_IOCTL_GET_CAP ioctl argument type */ +struct drm_get_cap { + __u64 capability; + __u64 value; +}; + +/** + * DRM_CLIENT_CAP_STEREO_3D + * + * If set to 1, the DRM core will expose the stereo 3D capabilities of the + * monitor by advertising the supported 3D layouts in the flags of struct + * drm_mode_modeinfo. See ``DRM_MODE_FLAG_3D_*``. + * + * This capability is always supported for all drivers starting from kernel + * version 3.13. + */ +#define DRM_CLIENT_CAP_STEREO_3D 1 + +/** + * DRM_CLIENT_CAP_UNIVERSAL_PLANES + * + * If set to 1, the DRM core will expose all planes (overlay, primary, and + * cursor) to userspace. + * + * This capability has been introduced in kernel version 3.15. Starting from + * kernel version 3.17, this capability is always supported for all drivers. + */ +#define DRM_CLIENT_CAP_UNIVERSAL_PLANES 2 + +/** + * DRM_CLIENT_CAP_ATOMIC + * + * If set to 1, the DRM core will expose atomic properties to userspace. This + * implicitly enables &DRM_CLIENT_CAP_UNIVERSAL_PLANES and + * &DRM_CLIENT_CAP_ASPECT_RATIO. + * + * If the driver doesn't support atomic mode-setting, enabling this capability + * will fail with -EOPNOTSUPP. + * + * This capability has been introduced in kernel version 4.0. Starting from + * kernel version 4.2, this capability is always supported for atomic-capable + * drivers. + */ +#define DRM_CLIENT_CAP_ATOMIC 3 + +/** + * DRM_CLIENT_CAP_ASPECT_RATIO + * + * If set to 1, the DRM core will provide aspect ratio information in modes. + * See ``DRM_MODE_FLAG_PIC_AR_*``. + * + * This capability is always supported for all drivers starting from kernel + * version 4.18. + */ +#define DRM_CLIENT_CAP_ASPECT_RATIO 4 + +/** + * DRM_CLIENT_CAP_WRITEBACK_CONNECTORS + * + * If set to 1, the DRM core will expose special connectors to be used for + * writing back to memory the scene setup in the commit. The client must enable + * &DRM_CLIENT_CAP_ATOMIC first. + * + * This capability is always supported for atomic-capable drivers starting from + * kernel version 4.19. + */ +#define DRM_CLIENT_CAP_WRITEBACK_CONNECTORS 5 + +/** + * DRM_CLIENT_CAP_CURSOR_PLANE_HOTSPOT + * + * Drivers for para-virtualized hardware (e.g. vmwgfx, qxl, virtio and + * virtualbox) have additional restrictions for cursor planes (thus + * making cursor planes on those drivers not truly universal,) e.g. + * they need cursor planes to act like one would expect from a mouse + * cursor and have correctly set hotspot properties. + * If this client cap is not set the DRM core will hide cursor plane on + * those virtualized drivers because not setting it implies that the + * client is not capable of dealing with those extra restictions. + * Clients which do set cursor hotspot and treat the cursor plane + * like a mouse cursor should set this property. + * The client must enable &DRM_CLIENT_CAP_ATOMIC first. + * + * Setting this property on drivers which do not special case + * cursor planes (i.e. non-virtualized drivers) will return + * EOPNOTSUPP, which can be used by userspace to gauge + * requirements of the hardware/drivers they're running on. + * + * This capability is always supported for atomic-capable virtualized + * drivers starting from kernel version 6.6. 
+ */ +#define DRM_CLIENT_CAP_CURSOR_PLANE_HOTSPOT 6 + +/* DRM_IOCTL_SET_CLIENT_CAP ioctl argument type */ +struct drm_set_client_cap { + __u64 capability; + __u64 value; +}; + +#define DRM_RDWR O_RDWR +#define DRM_CLOEXEC O_CLOEXEC +struct drm_prime_handle { + __u32 handle; + + /** Flags.. only applicable for handle->fd */ + __u32 flags; + + /** Returned dmabuf file descriptor */ + __s32 fd; +}; + +struct drm_syncobj_create { + __u32 handle; +#define DRM_SYNCOBJ_CREATE_SIGNALED (1 << 0) + __u32 flags; +}; + +struct drm_syncobj_destroy { + __u32 handle; + __u32 pad; +}; + +#define DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_IMPORT_SYNC_FILE (1 << 0) +#define DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE (1 << 0) +struct drm_syncobj_handle { + __u32 handle; + __u32 flags; + + __s32 fd; + __u32 pad; +}; + +struct drm_syncobj_transfer { + __u32 src_handle; + __u32 dst_handle; + __u64 src_point; + __u64 dst_point; + __u32 flags; + __u32 pad; +}; + +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL (1 << 0) +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT (1 << 1) +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE (1 << 2) /* wait for time point to become available */ +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE (1 << 3) /* set fence deadline to deadline_nsec */ +struct drm_syncobj_wait { + __u64 handles; + /* absolute timeout */ + __s64 timeout_nsec; + __u32 count_handles; + __u32 flags; + __u32 first_signaled; /* only valid when not waiting all */ + __u32 pad; + /** + * @deadline_nsec - fence deadline hint + * + * Deadline hint, in absolute CLOCK_MONOTONIC, to set on backing + * fence(s) if the DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE flag is + * set. + */ + __u64 deadline_nsec; +}; + +struct drm_syncobj_timeline_wait { + __u64 handles; + /* wait on specific timeline point for every handles*/ + __u64 points; + /* absolute timeout */ + __s64 timeout_nsec; + __u32 count_handles; + __u32 flags; + __u32 first_signaled; /* only valid when not waiting all */ + __u32 pad; + /** + * @deadline_nsec - fence deadline hint + * + * Deadline hint, in absolute CLOCK_MONOTONIC, to set on backing + * fence(s) if the DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE flag is + * set. + */ + __u64 deadline_nsec; +}; + +/** + * struct drm_syncobj_eventfd + * @handle: syncobj handle. + * @flags: Zero to wait for the point to be signalled, or + * &DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE to wait for a fence to be + * available for the point. + * @point: syncobj timeline point (set to zero for binary syncobjs). + * @fd: Existing eventfd to sent events to. + * @pad: Must be zero. + * + * Register an eventfd to be signalled by a syncobj. The eventfd counter will + * be incremented by one. + */ +struct drm_syncobj_eventfd { + __u32 handle; + __u32 flags; + __u64 point; + __s32 fd; + __u32 pad; +}; + + +struct drm_syncobj_array { + __u64 handles; + __u32 count_handles; + __u32 pad; +}; + +#define DRM_SYNCOBJ_QUERY_FLAGS_LAST_SUBMITTED (1 << 0) /* last available point on timeline syncobj */ +struct drm_syncobj_timeline_array { + __u64 handles; + __u64 points; + __u32 count_handles; + __u32 flags; +}; + + +/* Query current scanout sequence number */ +struct drm_crtc_get_sequence { + __u32 crtc_id; /* requested crtc_id */ + __u32 active; /* return: crtc output is active */ + __u64 sequence; /* return: most recent vblank sequence */ + __s64 sequence_ns; /* return: most recent time of first pixel out */ +}; + +/* Queue event to be delivered at specified sequence. 
Time stamp marks + * when the first pixel of the refresh cycle leaves the display engine + * for the display + */ +#define DRM_CRTC_SEQUENCE_RELATIVE 0x00000001 /* sequence is relative to current */ +#define DRM_CRTC_SEQUENCE_NEXT_ON_MISS 0x00000002 /* Use next sequence if we've missed */ + +struct drm_crtc_queue_sequence { + __u32 crtc_id; + __u32 flags; + __u64 sequence; /* on input, target sequence. on output, actual sequence */ + __u64 user_data; /* user data passed to event */ +}; + +#if defined(__cplusplus) +} +#endif + +#include "drm_mode.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +#define DRM_IOCTL_BASE 'd' +#define DRM_IO(nr) _IO(DRM_IOCTL_BASE,nr) +#define DRM_IOR(nr,type) _IOR(DRM_IOCTL_BASE,nr,type) +#define DRM_IOW(nr,type) _IOW(DRM_IOCTL_BASE,nr,type) +#define DRM_IOWR(nr,type) _IOWR(DRM_IOCTL_BASE,nr,type) + +#define DRM_IOCTL_VERSION DRM_IOWR(0x00, struct drm_version) +#define DRM_IOCTL_GET_UNIQUE DRM_IOWR(0x01, struct drm_unique) +#define DRM_IOCTL_GET_MAGIC DRM_IOR( 0x02, struct drm_auth) +#define DRM_IOCTL_IRQ_BUSID DRM_IOWR(0x03, struct drm_irq_busid) +#define DRM_IOCTL_GET_MAP DRM_IOWR(0x04, struct drm_map) +#define DRM_IOCTL_GET_CLIENT DRM_IOWR(0x05, struct drm_client) +#define DRM_IOCTL_GET_STATS DRM_IOR( 0x06, struct drm_stats) +#define DRM_IOCTL_SET_VERSION DRM_IOWR(0x07, struct drm_set_version) +#define DRM_IOCTL_MODESET_CTL DRM_IOW(0x08, struct drm_modeset_ctl) +/** + * DRM_IOCTL_GEM_CLOSE - Close a GEM handle. + * + * GEM handles are not reference-counted by the kernel. User-space is + * responsible for managing their lifetime. For example, if user-space imports + * the same memory object twice on the same DRM file description, the same GEM + * handle is returned by both imports, and user-space needs to ensure + * &DRM_IOCTL_GEM_CLOSE is performed once only. The same situation can happen + * when a memory object is allocated, then exported and imported again on the + * same DRM file description. The &DRM_IOCTL_MODE_GETFB2 IOCTL is an exception + * and always returns fresh new GEM handles even if an existing GEM handle + * already refers to the same memory object before the IOCTL is performed. 
+ */ +#define DRM_IOCTL_GEM_CLOSE DRM_IOW (0x09, struct drm_gem_close) +#define DRM_IOCTL_GEM_FLINK DRM_IOWR(0x0a, struct drm_gem_flink) +#define DRM_IOCTL_GEM_OPEN DRM_IOWR(0x0b, struct drm_gem_open) +#define DRM_IOCTL_GET_CAP DRM_IOWR(0x0c, struct drm_get_cap) +#define DRM_IOCTL_SET_CLIENT_CAP DRM_IOW( 0x0d, struct drm_set_client_cap) + +#define DRM_IOCTL_SET_UNIQUE DRM_IOW( 0x10, struct drm_unique) +#define DRM_IOCTL_AUTH_MAGIC DRM_IOW( 0x11, struct drm_auth) +#define DRM_IOCTL_BLOCK DRM_IOWR(0x12, struct drm_block) +#define DRM_IOCTL_UNBLOCK DRM_IOWR(0x13, struct drm_block) +#define DRM_IOCTL_CONTROL DRM_IOW( 0x14, struct drm_control) +#define DRM_IOCTL_ADD_MAP DRM_IOWR(0x15, struct drm_map) +#define DRM_IOCTL_ADD_BUFS DRM_IOWR(0x16, struct drm_buf_desc) +#define DRM_IOCTL_MARK_BUFS DRM_IOW( 0x17, struct drm_buf_desc) +#define DRM_IOCTL_INFO_BUFS DRM_IOWR(0x18, struct drm_buf_info) +#define DRM_IOCTL_MAP_BUFS DRM_IOWR(0x19, struct drm_buf_map) +#define DRM_IOCTL_FREE_BUFS DRM_IOW( 0x1a, struct drm_buf_free) + +#define DRM_IOCTL_RM_MAP DRM_IOW( 0x1b, struct drm_map) + +#define DRM_IOCTL_SET_SAREA_CTX DRM_IOW( 0x1c, struct drm_ctx_priv_map) +#define DRM_IOCTL_GET_SAREA_CTX DRM_IOWR(0x1d, struct drm_ctx_priv_map) + +#define DRM_IOCTL_SET_MASTER DRM_IO(0x1e) +#define DRM_IOCTL_DROP_MASTER DRM_IO(0x1f) + +#define DRM_IOCTL_ADD_CTX DRM_IOWR(0x20, struct drm_ctx) +#define DRM_IOCTL_RM_CTX DRM_IOWR(0x21, struct drm_ctx) +#define DRM_IOCTL_MOD_CTX DRM_IOW( 0x22, struct drm_ctx) +#define DRM_IOCTL_GET_CTX DRM_IOWR(0x23, struct drm_ctx) +#define DRM_IOCTL_SWITCH_CTX DRM_IOW( 0x24, struct drm_ctx) +#define DRM_IOCTL_NEW_CTX DRM_IOW( 0x25, struct drm_ctx) +#define DRM_IOCTL_RES_CTX DRM_IOWR(0x26, struct drm_ctx_res) +#define DRM_IOCTL_ADD_DRAW DRM_IOWR(0x27, struct drm_draw) +#define DRM_IOCTL_RM_DRAW DRM_IOWR(0x28, struct drm_draw) +#define DRM_IOCTL_DMA DRM_IOWR(0x29, struct drm_dma) +#define DRM_IOCTL_LOCK DRM_IOW( 0x2a, struct drm_lock) +#define DRM_IOCTL_UNLOCK DRM_IOW( 0x2b, struct drm_lock) +#define DRM_IOCTL_FINISH DRM_IOW( 0x2c, struct drm_lock) + +/** + * DRM_IOCTL_PRIME_HANDLE_TO_FD - Convert a GEM handle to a DMA-BUF FD. + * + * User-space sets &drm_prime_handle.handle with the GEM handle to export and + * &drm_prime_handle.flags, and gets back a DMA-BUF file descriptor in + * &drm_prime_handle.fd. + * + * The export can fail for any driver-specific reason, e.g. because export is + * not supported for this specific GEM handle (but might be for others). + * + * Support for exporting DMA-BUFs is advertised via &DRM_PRIME_CAP_EXPORT. + */ +#define DRM_IOCTL_PRIME_HANDLE_TO_FD DRM_IOWR(0x2d, struct drm_prime_handle) +/** + * DRM_IOCTL_PRIME_FD_TO_HANDLE - Convert a DMA-BUF FD to a GEM handle. + * + * User-space sets &drm_prime_handle.fd with a DMA-BUF file descriptor to + * import, and gets back a GEM handle in &drm_prime_handle.handle. + * &drm_prime_handle.flags is unused. + * + * If an existing GEM handle refers to the memory object backing the DMA-BUF, + * that GEM handle is returned. Therefore user-space which needs to handle + * arbitrary DMA-BUFs must have a user-space lookup data structure to manually + * reference-count duplicated GEM handles. For more information see + * &DRM_IOCTL_GEM_CLOSE. + * + * The import can fail for any driver-specific reason, e.g. because import is + * only supported for DMA-BUFs allocated on this DRM device. + * + * Support for importing DMA-BUFs is advertised via &DRM_PRIME_CAP_IMPORT. 
+ */ +#define DRM_IOCTL_PRIME_FD_TO_HANDLE DRM_IOWR(0x2e, struct drm_prime_handle) + +#define DRM_IOCTL_AGP_ACQUIRE DRM_IO( 0x30) +#define DRM_IOCTL_AGP_RELEASE DRM_IO( 0x31) +#define DRM_IOCTL_AGP_ENABLE DRM_IOW( 0x32, struct drm_agp_mode) +#define DRM_IOCTL_AGP_INFO DRM_IOR( 0x33, struct drm_agp_info) +#define DRM_IOCTL_AGP_ALLOC DRM_IOWR(0x34, struct drm_agp_buffer) +#define DRM_IOCTL_AGP_FREE DRM_IOW( 0x35, struct drm_agp_buffer) +#define DRM_IOCTL_AGP_BIND DRM_IOW( 0x36, struct drm_agp_binding) +#define DRM_IOCTL_AGP_UNBIND DRM_IOW( 0x37, struct drm_agp_binding) + +#define DRM_IOCTL_SG_ALLOC DRM_IOWR(0x38, struct drm_scatter_gather) +#define DRM_IOCTL_SG_FREE DRM_IOW( 0x39, struct drm_scatter_gather) + +#define DRM_IOCTL_WAIT_VBLANK DRM_IOWR(0x3a, union drm_wait_vblank) + +#define DRM_IOCTL_CRTC_GET_SEQUENCE DRM_IOWR(0x3b, struct drm_crtc_get_sequence) +#define DRM_IOCTL_CRTC_QUEUE_SEQUENCE DRM_IOWR(0x3c, struct drm_crtc_queue_sequence) + +#define DRM_IOCTL_UPDATE_DRAW DRM_IOW(0x3f, struct drm_update_draw) + +#define DRM_IOCTL_MODE_GETRESOURCES DRM_IOWR(0xA0, struct drm_mode_card_res) +#define DRM_IOCTL_MODE_GETCRTC DRM_IOWR(0xA1, struct drm_mode_crtc) +#define DRM_IOCTL_MODE_SETCRTC DRM_IOWR(0xA2, struct drm_mode_crtc) +#define DRM_IOCTL_MODE_CURSOR DRM_IOWR(0xA3, struct drm_mode_cursor) +#define DRM_IOCTL_MODE_GETGAMMA DRM_IOWR(0xA4, struct drm_mode_crtc_lut) +#define DRM_IOCTL_MODE_SETGAMMA DRM_IOWR(0xA5, struct drm_mode_crtc_lut) +#define DRM_IOCTL_MODE_GETENCODER DRM_IOWR(0xA6, struct drm_mode_get_encoder) +#define DRM_IOCTL_MODE_GETCONNECTOR DRM_IOWR(0xA7, struct drm_mode_get_connector) +#define DRM_IOCTL_MODE_ATTACHMODE DRM_IOWR(0xA8, struct drm_mode_mode_cmd) /* deprecated (never worked) */ +#define DRM_IOCTL_MODE_DETACHMODE DRM_IOWR(0xA9, struct drm_mode_mode_cmd) /* deprecated (never worked) */ + +#define DRM_IOCTL_MODE_GETPROPERTY DRM_IOWR(0xAA, struct drm_mode_get_property) +#define DRM_IOCTL_MODE_SETPROPERTY DRM_IOWR(0xAB, struct drm_mode_connector_set_property) +#define DRM_IOCTL_MODE_GETPROPBLOB DRM_IOWR(0xAC, struct drm_mode_get_blob) +#define DRM_IOCTL_MODE_GETFB DRM_IOWR(0xAD, struct drm_mode_fb_cmd) +#define DRM_IOCTL_MODE_ADDFB DRM_IOWR(0xAE, struct drm_mode_fb_cmd) +/** + * DRM_IOCTL_MODE_RMFB - Remove a framebuffer. + * + * This removes a framebuffer previously added via ADDFB/ADDFB2. The IOCTL + * argument is a framebuffer object ID. + * + * Warning: removing a framebuffer currently in-use on an enabled plane will + * disable that plane. The CRTC the plane is linked to may also be disabled + * (depending on driver capabilities). + */ +#define DRM_IOCTL_MODE_RMFB DRM_IOWR(0xAF, unsigned int) +#define DRM_IOCTL_MODE_PAGE_FLIP DRM_IOWR(0xB0, struct drm_mode_crtc_page_flip) +#define DRM_IOCTL_MODE_DIRTYFB DRM_IOWR(0xB1, struct drm_mode_fb_dirty_cmd) + +/** + * DRM_IOCTL_MODE_CREATE_DUMB - Create a new dumb buffer object. + * + * KMS dumb buffers provide a very primitive way to allocate a buffer object + * suitable for scanout and map it for software rendering. KMS dumb buffers are + * not suitable for hardware-accelerated rendering nor video decoding. KMS dumb + * buffers are not suitable to be displayed on any other device than the KMS + * device where they were allocated from. Also see + * :ref:`kms_dumb_buffer_objects`. + * + * The IOCTL argument is a struct drm_mode_create_dumb. + * + * User-space is expected to create a KMS dumb buffer via this IOCTL, then add + * it as a KMS framebuffer via &DRM_IOCTL_MODE_ADDFB and map it via + * &DRM_IOCTL_MODE_MAP_DUMB. 
+ * + * &DRM_CAP_DUMB_BUFFER indicates whether this IOCTL is supported. + * &DRM_CAP_DUMB_PREFERRED_DEPTH and &DRM_CAP_DUMB_PREFER_SHADOW indicate + * driver preferences for dumb buffers. + */ +#define DRM_IOCTL_MODE_CREATE_DUMB DRM_IOWR(0xB2, struct drm_mode_create_dumb) +#define DRM_IOCTL_MODE_MAP_DUMB DRM_IOWR(0xB3, struct drm_mode_map_dumb) +#define DRM_IOCTL_MODE_DESTROY_DUMB DRM_IOWR(0xB4, struct drm_mode_destroy_dumb) +#define DRM_IOCTL_MODE_GETPLANERESOURCES DRM_IOWR(0xB5, struct drm_mode_get_plane_res) +#define DRM_IOCTL_MODE_GETPLANE DRM_IOWR(0xB6, struct drm_mode_get_plane) +#define DRM_IOCTL_MODE_SETPLANE DRM_IOWR(0xB7, struct drm_mode_set_plane) +#define DRM_IOCTL_MODE_ADDFB2 DRM_IOWR(0xB8, struct drm_mode_fb_cmd2) +#define DRM_IOCTL_MODE_OBJ_GETPROPERTIES DRM_IOWR(0xB9, struct drm_mode_obj_get_properties) +#define DRM_IOCTL_MODE_OBJ_SETPROPERTY DRM_IOWR(0xBA, struct drm_mode_obj_set_property) +#define DRM_IOCTL_MODE_CURSOR2 DRM_IOWR(0xBB, struct drm_mode_cursor2) +#define DRM_IOCTL_MODE_ATOMIC DRM_IOWR(0xBC, struct drm_mode_atomic) +#define DRM_IOCTL_MODE_CREATEPROPBLOB DRM_IOWR(0xBD, struct drm_mode_create_blob) +#define DRM_IOCTL_MODE_DESTROYPROPBLOB DRM_IOWR(0xBE, struct drm_mode_destroy_blob) + +#define DRM_IOCTL_SYNCOBJ_CREATE DRM_IOWR(0xBF, struct drm_syncobj_create) +#define DRM_IOCTL_SYNCOBJ_DESTROY DRM_IOWR(0xC0, struct drm_syncobj_destroy) +#define DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD DRM_IOWR(0xC1, struct drm_syncobj_handle) +#define DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE DRM_IOWR(0xC2, struct drm_syncobj_handle) +#define DRM_IOCTL_SYNCOBJ_WAIT DRM_IOWR(0xC3, struct drm_syncobj_wait) +#define DRM_IOCTL_SYNCOBJ_RESET DRM_IOWR(0xC4, struct drm_syncobj_array) +#define DRM_IOCTL_SYNCOBJ_SIGNAL DRM_IOWR(0xC5, struct drm_syncobj_array) + +#define DRM_IOCTL_MODE_CREATE_LEASE DRM_IOWR(0xC6, struct drm_mode_create_lease) +#define DRM_IOCTL_MODE_LIST_LESSEES DRM_IOWR(0xC7, struct drm_mode_list_lessees) +#define DRM_IOCTL_MODE_GET_LEASE DRM_IOWR(0xC8, struct drm_mode_get_lease) +#define DRM_IOCTL_MODE_REVOKE_LEASE DRM_IOWR(0xC9, struct drm_mode_revoke_lease) + +#define DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT DRM_IOWR(0xCA, struct drm_syncobj_timeline_wait) +#define DRM_IOCTL_SYNCOBJ_QUERY DRM_IOWR(0xCB, struct drm_syncobj_timeline_array) +#define DRM_IOCTL_SYNCOBJ_TRANSFER DRM_IOWR(0xCC, struct drm_syncobj_transfer) +#define DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL DRM_IOWR(0xCD, struct drm_syncobj_timeline_array) + +/** + * DRM_IOCTL_MODE_GETFB2 - Get framebuffer metadata. + * + * This queries metadata about a framebuffer. User-space fills + * &drm_mode_fb_cmd2.fb_id as the input, and the kernels fills the rest of the + * struct as the output. + * + * If the client is DRM master or has &CAP_SYS_ADMIN, &drm_mode_fb_cmd2.handles + * will be filled with GEM buffer handles. Fresh new GEM handles are always + * returned, even if another GEM handle referring to the same memory object + * already exists on the DRM file description. The caller is responsible for + * removing the new handles, e.g. via the &DRM_IOCTL_GEM_CLOSE IOCTL. The same + * new handle will be returned for multiple planes in case they use the same + * memory object. Planes are valid until one has a zero handle -- this can be + * used to compute the number of planes. + * + * Otherwise, &drm_mode_fb_cmd2.handles will be zeroed and planes are valid + * until one has a zero &drm_mode_fb_cmd2.pitches. 
+ * + * If the framebuffer has a format modifier, &DRM_MODE_FB_MODIFIERS will be set + * in &drm_mode_fb_cmd2.flags and &drm_mode_fb_cmd2.modifier will contain the + * modifier. Otherwise, user-space must ignore &drm_mode_fb_cmd2.modifier. + * + * To obtain DMA-BUF FDs for each plane without leaking GEM handles, user-space + * can export each handle via &DRM_IOCTL_PRIME_HANDLE_TO_FD, then immediately + * close each unique handle via &DRM_IOCTL_GEM_CLOSE, making sure to not + * double-close handles which are specified multiple times in the array. + */ +#define DRM_IOCTL_MODE_GETFB2 DRM_IOWR(0xCE, struct drm_mode_fb_cmd2) + +#define DRM_IOCTL_SYNCOBJ_EVENTFD DRM_IOWR(0xCF, struct drm_syncobj_eventfd) + +/** + * DRM_IOCTL_MODE_CLOSEFB - Close a framebuffer. + * + * This closes a framebuffer previously added via ADDFB/ADDFB2. The IOCTL + * argument is a framebuffer object ID. + * + * This IOCTL is similar to &DRM_IOCTL_MODE_RMFB, except it doesn't disable + * planes and CRTCs. As long as the framebuffer is used by a plane, it's kept + * alive. When the plane no longer uses the framebuffer (because the + * framebuffer is replaced with another one, or the plane is disabled), the + * framebuffer is cleaned up. + * + * This is useful to implement flicker-free transitions between two processes. + * + * Depending on the threat model, user-space may want to ensure that the + * framebuffer doesn't expose any sensitive user information: closed + * framebuffers attached to a plane can be read back by the next DRM master. + */ +#define DRM_IOCTL_MODE_CLOSEFB DRM_IOWR(0xD0, struct drm_mode_closefb) + +/* + * Device specific ioctls should only be in their respective headers + * The device specific ioctl range is from 0x40 to 0x9f. + * Generic IOCTLS restart at 0xA0. + * + * \sa drmCommandNone(), drmCommandRead(), drmCommandWrite(), and + * drmCommandReadWrite(). + */ +#define DRM_COMMAND_BASE 0x40 +#define DRM_COMMAND_END 0xA0 + +/** + * struct drm_event - Header for DRM events + * @type: event type. + * @length: total number of payload bytes (including header). + * + * This struct is a header for events written back to user-space on the DRM FD. + * A read on the DRM FD will always only return complete events: e.g. if the + * read buffer is 100 bytes large and there are two 64 byte events pending, + * only one will be returned. + * + * Event types 0 - 0x7fffffff are generic DRM events, 0x80000000 and + * up are chipset specific. Generic DRM events include &DRM_EVENT_VBLANK, + * &DRM_EVENT_FLIP_COMPLETE and &DRM_EVENT_CRTC_SEQUENCE. + */ +struct drm_event { + __u32 type; + __u32 length; +}; + +/** + * DRM_EVENT_VBLANK - vertical blanking event + * + * This event is sent in response to &DRM_IOCTL_WAIT_VBLANK with the + * &_DRM_VBLANK_EVENT flag set. + * + * The event payload is a struct drm_event_vblank. + */ +#define DRM_EVENT_VBLANK 0x01 +/** + * DRM_EVENT_FLIP_COMPLETE - page-flip completion event + * + * This event is sent in response to an atomic commit or legacy page-flip with + * the &DRM_MODE_PAGE_FLIP_EVENT flag set. + * + * The event payload is a struct drm_event_vblank. + */ +#define DRM_EVENT_FLIP_COMPLETE 0x02 +/** + * DRM_EVENT_CRTC_SEQUENCE - CRTC sequence event + * + * This event is sent in response to &DRM_IOCTL_CRTC_QUEUE_SEQUENCE. + * + * The event payload is a struct drm_event_crtc_sequence. 
+ */ +#define DRM_EVENT_CRTC_SEQUENCE 0x03 + +struct drm_event_vblank { + struct drm_event base; + __u64 user_data; + __u32 tv_sec; + __u32 tv_usec; + __u32 sequence; + __u32 crtc_id; /* 0 on older kernels that do not support this */ +}; + +/* Event delivered at sequence. Time stamp marks when the first pixel + * of the refresh cycle leaves the display engine for the display + */ +struct drm_event_crtc_sequence { + struct drm_event base; + __u64 user_data; + __s64 time_ns; + __u64 sequence; +}; + +/* typedef area */ +typedef struct drm_clip_rect drm_clip_rect_t; +typedef struct drm_drawable_info drm_drawable_info_t; +typedef struct drm_tex_region drm_tex_region_t; +typedef struct drm_hw_lock drm_hw_lock_t; +typedef struct drm_version drm_version_t; +typedef struct drm_unique drm_unique_t; +typedef struct drm_list drm_list_t; +typedef struct drm_block drm_block_t; +typedef struct drm_control drm_control_t; +typedef enum drm_map_type drm_map_type_t; +typedef enum drm_map_flags drm_map_flags_t; +typedef struct drm_ctx_priv_map drm_ctx_priv_map_t; +typedef struct drm_map drm_map_t; +typedef struct drm_client drm_client_t; +typedef enum drm_stat_type drm_stat_type_t; +typedef struct drm_stats drm_stats_t; +typedef enum drm_lock_flags drm_lock_flags_t; +typedef struct drm_lock drm_lock_t; +typedef enum drm_dma_flags drm_dma_flags_t; +typedef struct drm_buf_desc drm_buf_desc_t; +typedef struct drm_buf_info drm_buf_info_t; +typedef struct drm_buf_free drm_buf_free_t; +typedef struct drm_buf_pub drm_buf_pub_t; +typedef struct drm_buf_map drm_buf_map_t; +typedef struct drm_dma drm_dma_t; +typedef union drm_wait_vblank drm_wait_vblank_t; +typedef struct drm_agp_mode drm_agp_mode_t; +typedef enum drm_ctx_flags drm_ctx_flags_t; +typedef struct drm_ctx drm_ctx_t; +typedef struct drm_ctx_res drm_ctx_res_t; +typedef struct drm_draw drm_draw_t; +typedef struct drm_update_draw drm_update_draw_t; +typedef struct drm_auth drm_auth_t; +typedef struct drm_irq_busid drm_irq_busid_t; +typedef enum drm_vblank_seq_type drm_vblank_seq_type_t; + +typedef struct drm_agp_buffer drm_agp_buffer_t; +typedef struct drm_agp_binding drm_agp_binding_t; +typedef struct drm_agp_info drm_agp_info_t; +typedef struct drm_scatter_gather drm_scatter_gather_t; +typedef struct drm_set_version drm_set_version_t; + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/ggml/src/ggml-remotingfrontend/include/drm-uapi/virtgpu_drm.h b/ggml/src/ggml-remotingfrontend/include/drm-uapi/virtgpu_drm.h new file mode 100644 index 0000000000000..9debb320c34be --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/include/drm-uapi/virtgpu_drm.h @@ -0,0 +1,276 @@ +/* + * Copyright 2013 Red Hat + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef VIRTGPU_DRM_H +#define VIRTGPU_DRM_H + +#include "drm.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +/* Please note that modifications to all structs defined here are + * subject to backwards-compatibility constraints. + * + * Do not use pointers, use __u64 instead for 32 bit / 64 bit user/kernel + * compatibility Keep fields aligned to their size + */ + +#define DRM_VIRTGPU_MAP 0x01 +#define DRM_VIRTGPU_EXECBUFFER 0x02 +#define DRM_VIRTGPU_GETPARAM 0x03 +#define DRM_VIRTGPU_RESOURCE_CREATE 0x04 +#define DRM_VIRTGPU_RESOURCE_INFO 0x05 +#define DRM_VIRTGPU_TRANSFER_FROM_HOST 0x06 +#define DRM_VIRTGPU_TRANSFER_TO_HOST 0x07 +#define DRM_VIRTGPU_WAIT 0x08 +#define DRM_VIRTGPU_GET_CAPS 0x09 +#define DRM_VIRTGPU_RESOURCE_CREATE_BLOB 0x0a +#define DRM_VIRTGPU_CONTEXT_INIT 0x0b + +#define VIRTGPU_EXECBUF_FENCE_FD_IN 0x01 +#define VIRTGPU_EXECBUF_FENCE_FD_OUT 0x02 +#define VIRTGPU_EXECBUF_RING_IDX 0x04 +#define VIRTGPU_EXECBUF_FLAGS (\ + VIRTGPU_EXECBUF_FENCE_FD_IN |\ + VIRTGPU_EXECBUF_FENCE_FD_OUT |\ + VIRTGPU_EXECBUF_RING_IDX |\ + 0) + +struct drm_virtgpu_map { + __u64 offset; /* use for mmap system call */ + __u32 handle; + __u32 pad; +}; + +#define VIRTGPU_EXECBUF_SYNCOBJ_RESET 0x01 +#define VIRTGPU_EXECBUF_SYNCOBJ_FLAGS ( \ + VIRTGPU_EXECBUF_SYNCOBJ_RESET | \ + 0) +struct drm_virtgpu_execbuffer_syncobj { + __u32 handle; + __u32 flags; + __u64 point; +}; + +/* fence_fd is modified on success if VIRTGPU_EXECBUF_FENCE_FD_OUT flag is set. */ +struct drm_virtgpu_execbuffer { + __u32 flags; + __u32 size; + __u64 command; /* void* */ + __u64 bo_handles; + __u32 num_bo_handles; + __s32 fence_fd; /* in/out fence fd (see VIRTGPU_EXECBUF_FENCE_FD_IN/OUT) */ + __u32 ring_idx; /* command ring index (see VIRTGPU_EXECBUF_RING_IDX) */ + __u32 syncobj_stride; /* size of @drm_virtgpu_execbuffer_syncobj */ + __u32 num_in_syncobjs; + __u32 num_out_syncobjs; + __u64 in_syncobjs; + __u64 out_syncobjs; +}; + +#define VIRTGPU_PARAM_3D_FEATURES 1 /* do we have 3D features in the hw */ +#define VIRTGPU_PARAM_CAPSET_QUERY_FIX 2 /* do we have the capset fix */ +#define VIRTGPU_PARAM_RESOURCE_BLOB 3 /* DRM_VIRTGPU_RESOURCE_CREATE_BLOB */ +#define VIRTGPU_PARAM_HOST_VISIBLE 4 /* Host blob resources are mappable */ +#define VIRTGPU_PARAM_CROSS_DEVICE 5 /* Cross virtio-device resource sharing */ +#define VIRTGPU_PARAM_CONTEXT_INIT 6 /* DRM_VIRTGPU_CONTEXT_INIT */ +#define VIRTGPU_PARAM_SUPPORTED_CAPSET_IDs 7 /* Bitmask of supported capability set ids */ +#define VIRTGPU_PARAM_EXPLICIT_DEBUG_NAME 8 /* Ability to set debug name from userspace */ + +struct drm_virtgpu_getparam { + __u64 param; + __u64 value; +}; + +/* NO_BO flags? NO resource flag? */ +/* resource flag for y_0_top */ +struct drm_virtgpu_resource_create { + __u32 target; + __u32 format; + __u32 bind; + __u32 width; + __u32 height; + __u32 depth; + __u32 array_size; + __u32 last_level; + __u32 nr_samples; + __u32 flags; + __u32 bo_handle; /* if this is set - recreate a new resource attached to this bo ? 
*/ + __u32 res_handle; /* returned by kernel */ + __u32 size; /* validate transfer in the host */ + __u32 stride; /* validate transfer in the host */ +}; + +struct drm_virtgpu_resource_info { + __u32 bo_handle; + __u32 res_handle; + __u32 size; + __u32 blob_mem; +}; + +struct drm_virtgpu_3d_box { + __u32 x; + __u32 y; + __u32 z; + __u32 w; + __u32 h; + __u32 d; +}; + +struct drm_virtgpu_3d_transfer_to_host { + __u32 bo_handle; + struct drm_virtgpu_3d_box box; + __u32 level; + __u32 offset; + __u32 stride; + __u32 layer_stride; +}; + +struct drm_virtgpu_3d_transfer_from_host { + __u32 bo_handle; + struct drm_virtgpu_3d_box box; + __u32 level; + __u32 offset; + __u32 stride; + __u32 layer_stride; +}; + +#define VIRTGPU_WAIT_NOWAIT 1 /* like it */ +struct drm_virtgpu_3d_wait { + __u32 handle; /* 0 is an invalid handle */ + __u32 flags; +}; + +#define VIRTGPU_DRM_CAPSET_VIRGL 1 +#define VIRTGPU_DRM_CAPSET_VIRGL2 2 +#define VIRTGPU_DRM_CAPSET_GFXSTREAM_VULKAN 3 +#define VIRTGPU_DRM_CAPSET_VENUS 4 +#define VIRTGPU_DRM_CAPSET_CROSS_DOMAIN 5 +#define VIRTGPU_DRM_CAPSET_DRM 6 +struct drm_virtgpu_get_caps { + __u32 cap_set_id; + __u32 cap_set_ver; + __u64 addr; + __u32 size; + __u32 pad; +}; + +struct drm_virtgpu_resource_create_blob { +#define VIRTGPU_BLOB_MEM_GUEST 0x0001 +#define VIRTGPU_BLOB_MEM_HOST3D 0x0002 +#define VIRTGPU_BLOB_MEM_HOST3D_GUEST 0x0003 + +#define VIRTGPU_BLOB_FLAG_USE_MAPPABLE 0x0001 +#define VIRTGPU_BLOB_FLAG_USE_SHAREABLE 0x0002 +#define VIRTGPU_BLOB_FLAG_USE_CROSS_DEVICE 0x0004 + /* zero is invalid blob_mem */ + __u32 blob_mem; + __u32 blob_flags; + __u32 bo_handle; + __u32 res_handle; + __u64 size; + + /* + * for 3D contexts with VIRTGPU_BLOB_MEM_HOST3D_GUEST and + * VIRTGPU_BLOB_MEM_HOST3D otherwise, must be zero. + */ + __u32 pad; + __u32 cmd_size; + __u64 cmd; + __u64 blob_id; +}; + +#define VIRTGPU_CONTEXT_PARAM_CAPSET_ID 0x0001 +#define VIRTGPU_CONTEXT_PARAM_NUM_RINGS 0x0002 +#define VIRTGPU_CONTEXT_PARAM_POLL_RINGS_MASK 0x0003 +#define VIRTGPU_CONTEXT_PARAM_DEBUG_NAME 0x0004 +struct drm_virtgpu_context_set_param { + __u64 param; + __u64 value; +}; + +struct drm_virtgpu_context_init { + __u32 num_params; + __u32 pad; + + /* pointer to drm_virtgpu_context_set_param array */ + __u64 ctx_set_params; +}; + +/* + * Event code that's given when VIRTGPU_CONTEXT_PARAM_POLL_RINGS_MASK is in + * effect. The event size is sizeof(drm_event), since there is no additional + * payload. 
+ */
+#define VIRTGPU_EVENT_FENCE_SIGNALED 0x90000000
+
+#define DRM_IOCTL_VIRTGPU_MAP \
+ DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_MAP, struct drm_virtgpu_map)
+
+#define DRM_IOCTL_VIRTGPU_EXECBUFFER \
+ DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_EXECBUFFER,\
+ struct drm_virtgpu_execbuffer)
+
+#define DRM_IOCTL_VIRTGPU_GETPARAM \
+ DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_GETPARAM,\
+ struct drm_virtgpu_getparam)
+
+#define DRM_IOCTL_VIRTGPU_RESOURCE_CREATE \
+ DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_RESOURCE_CREATE, \
+ struct drm_virtgpu_resource_create)
+
+#define DRM_IOCTL_VIRTGPU_RESOURCE_INFO \
+ DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_RESOURCE_INFO, \
+ struct drm_virtgpu_resource_info)
+
+#define DRM_IOCTL_VIRTGPU_TRANSFER_FROM_HOST \
+ DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_TRANSFER_FROM_HOST, \
+ struct drm_virtgpu_3d_transfer_from_host)
+
+#define DRM_IOCTL_VIRTGPU_TRANSFER_TO_HOST \
+ DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_TRANSFER_TO_HOST, \
+ struct drm_virtgpu_3d_transfer_to_host)
+
+#define DRM_IOCTL_VIRTGPU_WAIT \
+ DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_WAIT, \
+ struct drm_virtgpu_3d_wait)
+
+#define DRM_IOCTL_VIRTGPU_GET_CAPS \
+ DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_GET_CAPS, \
+ struct drm_virtgpu_get_caps)
+
+#define DRM_IOCTL_VIRTGPU_RESOURCE_CREATE_BLOB \
+ DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_RESOURCE_CREATE_BLOB, \
+ struct drm_virtgpu_resource_create_blob)
+
+#define DRM_IOCTL_VIRTGPU_CONTEXT_INIT \
+ DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_CONTEXT_INIT, \
+ struct drm_virtgpu_context_init)
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/ggml/src/ggml-remotingfrontend/include/venus_hw.h b/ggml/src/ggml-remotingfrontend/include/venus_hw.h
new file mode 100644
index 0000000000000..3ef774b8259d3
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/include/venus_hw.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2020 Chromium
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef VENUS_HW_H
+#define VENUS_HW_H
+
+#include <stdint.h>
+
+struct virgl_renderer_capset_venus {
+ uint32_t wire_format_version;
+ uint32_t vk_xml_version;
+ uint32_t vk_ext_command_serialization_spec_version;
+ uint32_t vk_mesa_venus_protocol_spec_version;
+
+ /* This flag indicates render server config, and will be needed until drm
+ * virtio-gpu blob mem gets fixed to attach_resource before resource_map.
+ */ + uint32_t supports_blob_id_0; + + /* Extension number N, where N is defined by the Vulkan spec, corresponds + * to bit [N / 32] & (1 << N % 32). The below mask1 covers the first 1023 + * Vulkan extensions (numbered from 1 to 1023). + * + * Bit (mask1[0] & 0x1) is used for backward compatibility purpose. When + * that bit is set, the extension mask(s) are valid. Otherwise, all the + * extensions are assumed to be supported by the renderer side protocol. + */ + uint32_t vk_extension_mask1[32]; + + /* The single-threaded renderer cannot afford potential blocking calls. It + * also leads to GPU lost if the wait depends on a following command. This + * capset allows such blocking calls to passthrough from the clients, and + * shifts the responsibilities to the client drivers. + */ + uint32_t allow_vk_wait_syncs; + + /* This flag indicates that the renderer supports multiple fencing + * timelines. The client driver is expected to associate each VkQueue with + * one of these timelines at queue creation by binding it with an unused + * ring_idx. Queues created without a ring_idx binding are associated to a + * shared legacy timeline. The special ring_idx==0 is reserved for CPU + * fences that are signaled by the renderer immediately upon consumption of + * the associated renderer submission. + */ + uint32_t supports_multiple_timelines; + + /* This flag indicates to the guest that hypervisor does not support memory + * pages injections and blob allocations must be done by guest from the + * dedicated heap (Host visible memory). + */ + uint32_t use_guest_vram; +}; + +#endif /* VENUS_HW_H */ diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h index a7ed708851d8f..26510b20bc479 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h @@ -1,8 +1,8 @@ #include "ggml-backend-impl.h" #include "ggml-remoting.h" #include "virtgpu.h" -#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/apir_backend.h" -#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h" +#include "../ggml-remotingbackend/shared/apir_backend.h" +#include "../ggml-remotingbackend/shared/venus_cs_ggml.h" #define CACHED // printf("INFO: ### found response in the cache %s\n", __func__)o diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index bbe94f14300ef..cc159e071e218 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -4,7 +4,7 @@ #include "virtgpu-utils.h" -#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/apir_backend.h" +#include "../ggml-remotingbackend/shared/apir_backend.h" /* device */ int apir_device_get_count(struct virtgpu *gpu); From 38b13110e7c7319f296aa03b66cedceefc62e4b9 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 11 Jun 2025 14:30:28 +0200 Subject: [PATCH 110/117] update the custom scripts --- build-xcframework.sh | 526 ------------------------------------------- podman_compile.sh | 5 +- prepare.remoting.sh | 2 + run.remoting.sh | 39 ++-- 4 files changed, 22 insertions(+), 550 deletions(-) delete mode 100755 build-xcframework.sh diff --git a/build-xcframework.sh b/build-xcframework.sh deleted file mode 100755 index 1b9091d288cc8..0000000000000 --- a/build-xcframework.sh +++ /dev/null @@ -1,526 +0,0 @@ -#!/bin/bash -# 
-# Options -IOS_MIN_OS_VERSION=16.4 -MACOS_MIN_OS_VERSION=13.3 -VISIONOS_MIN_OS_VERSION=1.0 -TVOS_MIN_OS_VERSION=16.4 - -BUILD_SHARED_LIBS=OFF -LLAMA_BUILD_EXAMPLES=OFF -LLAMA_BUILD_TESTS=OFF -LLAMA_BUILD_SERVER=OFF -GGML_METAL=ON -GGML_METAL_EMBED_LIBRARY=ON -GGML_BLAS_DEFAULT=ON -GGML_METAL_USE_BF16=ON -GGML_OPENMP=OFF - -COMMON_C_FLAGS="-Wno-macro-redefined -Wno-shorten-64-to-32 -Wno-unused-command-line-argument -g" -COMMON_CXX_FLAGS="-Wno-macro-redefined -Wno-shorten-64-to-32 -Wno-unused-command-line-argument -g" - -# Common options for all builds -COMMON_CMAKE_ARGS=( - -DCMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_REQUIRED=NO - -DCMAKE_XCODE_ATTRIBUTE_CODE_SIGN_IDENTITY="" - -DCMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_ALLOWED=NO - -DCMAKE_XCODE_ATTRIBUTE_DEBUG_INFORMATION_FORMAT="dwarf-with-dsym" - -DCMAKE_XCODE_ATTRIBUTE_GCC_GENERATE_DEBUGGING_SYMBOLS=YES - -DCMAKE_XCODE_ATTRIBUTE_COPY_PHASE_STRIP=NO - -DCMAKE_XCODE_ATTRIBUTE_STRIP_INSTALLED_PRODUCT=NO - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} - -DLLAMA_BUILD_EXAMPLES=${LLAMA_BUILD_EXAMPLES} - -DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS} - -DLLAMA_BUILD_SERVER=${LLAMA_BUILD_SERVER} - -DGGML_METAL_EMBED_LIBRARY=${GGML_METAL_EMBED_LIBRARY} - -DGGML_BLAS_DEFAULT=${GGML_BLAS_DEFAULT} - -DGGML_METAL=${GGML_METAL} - -DGGML_METAL_USE_BF16=${GGML_METAL_USE_BF16} - -DGGML_NATIVE=OFF - -DGGML_OPENMP=${GGML_OPENMP} -) - -check_required_tool() { - local tool=$1 - local install_message=$2 - - if ! command -v $tool &> /dev/null; then - echo "Error: $tool is required but not found." - echo "$install_message" - exit 1 - fi -} -echo "Checking for required tools..." -check_required_tool "cmake" "Please install CMake 3.28.0 or later (brew install cmake)" -check_required_tool "xcodebuild" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)" -check_required_tool "libtool" "Please install libtool which should be available with Xcode Command Line Tools (CLT). 
Make sure Xcode CLT is installed (xcode-select --install)" -check_required_tool "dsymutil" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)" - -set -e - -## Clean up previous builds -rm -rf build-apple -rm -rf build-ios-sim -rm -rf build-ios-device -rm -rf build-macos -rm -rf build-visionos -rm -rf build-visionos-sim -rm -rf build-tvos-sim -rm -rf build-tvos-device - -# Setup the xcframework build directory structure -setup_framework_structure() { - local build_dir=$1 - local min_os_version=$2 - local platform=$3 # "ios", "macos", "visionos", or "tvos" - local framework_name="llama" - - echo "Creating ${platform}-style framework structure for ${build_dir}" - - if [[ "$platform" == "macos" ]]; then - # macOS versioned structure uses versioned directories - mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Headers - mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Modules - mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Resources - - # Create symbolic links - ln -sf A ${build_dir}/framework/${framework_name}.framework/Versions/Current - ln -sf Versions/Current/Headers ${build_dir}/framework/${framework_name}.framework/Headers - ln -sf Versions/Current/Modules ${build_dir}/framework/${framework_name}.framework/Modules - ln -sf Versions/Current/Resources ${build_dir}/framework/${framework_name}.framework/Resources - ln -sf Versions/Current/${framework_name} ${build_dir}/framework/${framework_name}.framework/${framework_name} - - # Set header and module paths - local header_path=${build_dir}/framework/${framework_name}.framework/Versions/A/Headers/ - local module_path=${build_dir}/framework/${framework_name}.framework/Versions/A/Modules/ - else - # iOS/VisionOS/tvOS use a flat structure - mkdir -p ${build_dir}/framework/${framework_name}.framework/Headers - mkdir -p ${build_dir}/framework/${framework_name}.framework/Modules - - # Remove any existing structure to ensure clean build - rm -rf ${build_dir}/framework/${framework_name}.framework/Versions - - # Set header and module paths - local header_path=${build_dir}/framework/${framework_name}.framework/Headers/ - local module_path=${build_dir}/framework/${framework_name}.framework/Modules/ - fi - - # Copy all required headers (common for all platforms) - cp include/llama.h ${header_path} - cp ggml/include/ggml.h ${header_path} - cp ggml/include/ggml-alloc.h ${header_path} - cp ggml/include/ggml-backend.h ${header_path} - cp ggml/include/ggml-metal.h ${header_path} - cp ggml/include/ggml-cpu.h ${header_path} - cp ggml/include/ggml-blas.h ${header_path} - cp ggml/include/gguf.h ${header_path} - - # Create module map (common for all platforms) - cat > ${module_path}module.modulemap << EOF -framework module llama { - header "llama.h" - header "ggml.h" - header "ggml-alloc.h" - header "ggml-backend.h" - header "ggml-metal.h" - header "ggml-cpu.h" - header "ggml-blas.h" - header "gguf.h" - - link "c++" - link framework "Accelerate" - link framework "Metal" - link framework "Foundation" - - export * -} -EOF - - # Platform-specific settings for Info.plist - local platform_name="" - local sdk_name="" - local supported_platform="" - - case "$platform" in - "ios") - platform_name="iphoneos" - sdk_name="iphoneos${min_os_version}" - supported_platform="iPhoneOS" - local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist" - local device_family=' UIDeviceFamily - - 1 - 2 - ' - ;; - "macos") - platform_name="macosx" - 
sdk_name="macosx${min_os_version}" - supported_platform="MacOSX" - local plist_path="${build_dir}/framework/${framework_name}.framework/Versions/A/Resources/Info.plist" - local device_family="" - ;; - "visionos") - platform_name="xros" - sdk_name="xros${min_os_version}" - supported_platform="XRPlatform" - local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist" - local device_family="" - ;; - "tvos") - platform_name="appletvos" - sdk_name="appletvos${min_os_version}" - supported_platform="AppleTVOS" - local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist" - local device_family=' UIDeviceFamily - - 3 - ' - ;; - esac - - # Create Info.plist - cat > ${plist_path} << EOF - - - - - CFBundleDevelopmentRegion - en - CFBundleExecutable - llama - CFBundleIdentifier - org.ggml.llama - CFBundleInfoDictionaryVersion - 6.0 - CFBundleName - llama - CFBundlePackageType - FMWK - CFBundleShortVersionString - 1.0 - CFBundleVersion - 1 - MinimumOSVersion - ${min_os_version} - CFBundleSupportedPlatforms - - ${supported_platform} - ${device_family} - DTPlatformName - ${platform_name} - DTSDKName - ${sdk_name} - - -EOF -} - -# Create dynamic libraries from static libraries. -combine_static_libraries() { - local build_dir="$1" - local release_dir="$2" - local platform="$3" # "ios", "macos", "visionos", or "tvos" - local is_simulator="$4" - local base_dir="$(pwd)" - local framework_name="llama" - - # Determine output path based on platform - local output_lib="" - if [[ "$platform" == "macos" ]]; then - # macOS uses versioned structure - output_lib="${build_dir}/framework/${framework_name}.framework/Versions/A/${framework_name}" - else - # iOS, visionOS, and tvOS use a directory flat structure - output_lib="${build_dir}/framework/${framework_name}.framework/${framework_name}" - fi - - local libs=( - "${base_dir}/${build_dir}/src/${release_dir}/libllama.a" - "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml.a" - "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-base.a" - "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-cpu.a" - "${base_dir}/${build_dir}/ggml/src/ggml-metal/${release_dir}/libggml-metal.a" - "${base_dir}/${build_dir}/ggml/src/ggml-blas/${release_dir}/libggml-blas.a" - ) - - # Create temporary directory for processing - local temp_dir="${base_dir}/${build_dir}/temp" - mkdir -p "${temp_dir}" - - # Since we have multiple architectures libtool will find object files that do not - # match the target architecture. We suppress these warnings. - libtool -static -o "${temp_dir}/combined.a" "${libs[@]}" 2> /dev/null - - # Determine SDK, architectures, and install_name based on platform and simulator flag. 
- local sdk="" - local archs="" - local min_version_flag="" - local install_name="" - - case "$platform" in - "ios") - if [[ "$is_simulator" == "true" ]]; then - sdk="iphonesimulator" - archs="arm64 x86_64" - min_version_flag="-mios-simulator-version-min=${IOS_MIN_OS_VERSION}" - else - sdk="iphoneos" - archs="arm64" - min_version_flag="-mios-version-min=${IOS_MIN_OS_VERSION}" - fi - install_name="@rpath/llama.framework/llama" - ;; - "macos") - sdk="macosx" - archs="arm64 x86_64" - min_version_flag="-mmacosx-version-min=${MACOS_MIN_OS_VERSION}" - install_name="@rpath/llama.framework/Versions/Current/llama" - ;; - "visionos") - if [[ "$is_simulator" == "true" ]]; then - sdk="xrsimulator" - archs="arm64 x86_64" - min_version_flag="-mtargetos=xros${VISIONOS_MIN_OS_VERSION}-simulator" - else - sdk="xros" - archs="arm64" - min_version_flag="-mtargetos=xros${VISIONOS_MIN_OS_VERSION}" - fi - # Use flat structure for visionOS, same as iOS - install_name="@rpath/llama.framework/llama" - ;; - "tvos") - if [[ "$is_simulator" == "true" ]]; then - sdk="appletvsimulator" - archs="arm64 x86_64" - min_version_flag="-mtvos-simulator-version-min=${TVOS_MIN_OS_VERSION}" - else - sdk="appletvos" - archs="arm64" - min_version_flag="-mtvos-version-min=${TVOS_MIN_OS_VERSION}" - fi - install_name="@rpath/llama.framework/llama" - ;; - esac - - # Build architecture flags - local arch_flags="" - for arch in $archs; do - arch_flags+=" -arch $arch" - done - - # Create dynamic library - echo "Creating dynamic library for ${platform}." - xcrun -sdk $sdk clang++ -dynamiclib \ - -isysroot $(xcrun --sdk $sdk --show-sdk-path) \ - $arch_flags \ - $min_version_flag \ - -Wl,-force_load,"${temp_dir}/combined.a" \ - -framework Foundation -framework Metal -framework Accelerate \ - -install_name "$install_name" \ - -o "${base_dir}/${output_lib}" - - # Platform-specific post-processing for device builds - if [[ "$is_simulator" == "false" ]]; then - if command -v vtool &>/dev/null; then - case "$platform" in - "ios") - echo "Marking binary as a framework binary for iOS..." - vtool -set-build-version ios ${IOS_MIN_OS_VERSION} ${IOS_MIN_OS_VERSION} -replace \ - -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}" - ;; - "visionos") - echo "Marking binary as a framework binary for visionOS..." - vtool -set-build-version xros ${VISIONOS_MIN_OS_VERSION} ${VISIONOS_MIN_OS_VERSION} -replace \ - -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}" - ;; - "tvos") - echo "Marking binary as a framework binary for tvOS..." - vtool -set-build-version tvos ${TVOS_MIN_OS_VERSION} ${TVOS_MIN_OS_VERSION} -replace \ - -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}" - ;; - esac - else - echo "Warning: vtool not found. Binary may not pass App Store validation." - fi - fi - - echo "Creating properly formatted dSYM..." 
- # Create a separate directory for dSYMs for all platforms - mkdir -p "${base_dir}/${build_dir}/dSYMs" - - # iOS and visionOS style dSYM (flat structure) - if [[ "$platform" == "ios" || "$platform" == "visionos" || "$platform" == "tvos" ]]; then - # Generate dSYM in the dSYMs directory - xcrun dsymutil "${base_dir}/${output_lib}" -o "${base_dir}/${build_dir}/dSYMs/llama.dSYM" - - # Create a copy of the binary that will be stripped - cp "${base_dir}/${output_lib}" "${temp_dir}/binary_to_strip" - - # Strip debug symbols from the copy - xcrun strip -S "${temp_dir}/binary_to_strip" -o "${temp_dir}/stripped_lib" - - # Replace the original with the stripped version - mv "${temp_dir}/stripped_lib" "${base_dir}/${output_lib}" - else - # macOS style dSYM - # First strip debug info to a separate file - xcrun strip -S "${base_dir}/${output_lib}" -o "${temp_dir}/stripped_lib" - - # Generate dSYM in the dSYMs directory - xcrun dsymutil "${base_dir}/${output_lib}" -o "${base_dir}/${build_dir}/dSYMs/llama.dSYM" - - # Replace original binary with stripped version - mv "${temp_dir}/stripped_lib" "${base_dir}/${output_lib}" - fi - - # Remove any automatically generated dSYM files in the framework structure as they will - # otherwise case Invalid Bundle Structure validation errors. - if [ -d "${base_dir}/${output_lib}.dSYM" ]; then - echo "Removing generated dSYM file in framework structure: ${base_dir}/${output_lib}.dSYM" - rm -rf "${base_dir}/${output_lib}.dSYM" - fi - - # Clean up - rm -rf "${temp_dir}" -} - -echo "Building for iOS simulator..." -cmake -B build-ios-sim -G Xcode \ - "${COMMON_CMAKE_ARGS[@]}" \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=${IOS_MIN_OS_VERSION} \ - -DIOS=ON \ - -DCMAKE_SYSTEM_NAME=iOS \ - -DCMAKE_OSX_SYSROOT=iphonesimulator \ - -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \ - -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphonesimulator \ - -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ - -S . -cmake --build build-ios-sim --config Release -- -quiet - -echo "Building for iOS devices..." -cmake -B build-ios-device -G Xcode \ - "${COMMON_CMAKE_ARGS[@]}" \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=${IOS_MIN_OS_VERSION} \ - -DCMAKE_OSX_SYSROOT=iphoneos \ - -DCMAKE_OSX_ARCHITECTURES="arm64" \ - -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphoneos \ - -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ - -S . -cmake --build build-ios-device --config Release -- -quiet - -echo "Building for macOS..." -cmake -B build-macos -G Xcode \ - "${COMMON_CMAKE_ARGS[@]}" \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=${MACOS_MIN_OS_VERSION} \ - -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \ - -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ - -S . -cmake --build build-macos --config Release -- -quiet - -echo "Building for visionOS..." -cmake -B build-visionos -G Xcode \ - "${COMMON_CMAKE_ARGS[@]}" \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=${VISIONOS_MIN_OS_VERSION} \ - -DCMAKE_OSX_ARCHITECTURES="arm64" \ - -DCMAKE_SYSTEM_NAME=visionOS \ - -DCMAKE_OSX_SYSROOT=xros \ - -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \ - -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ - -S . -cmake --build build-visionos --config Release -- -quiet - -echo "Building for visionOS simulator..." 
-cmake -B build-visionos-sim -G Xcode \ - "${COMMON_CMAKE_ARGS[@]}" \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=${VISIONOS_MIN_OS_VERSION} \ - -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \ - -DCMAKE_SYSTEM_NAME=visionOS \ - -DCMAKE_OSX_SYSROOT=xrsimulator \ - -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \ - -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ - -S . -cmake --build build-visionos-sim --config Release -- -quiet - -# Add tvOS builds (might need the same u_int definitions as watchOS and visionOS) -echo "Building for tvOS simulator..." -cmake -B build-tvos-sim -G Xcode \ - "${COMMON_CMAKE_ARGS[@]}" \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=${TVOS_MIN_OS_VERSION} \ - -DCMAKE_SYSTEM_NAME=tvOS \ - -DCMAKE_OSX_SYSROOT=appletvsimulator \ - -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \ - -DGGML_METAL=ON \ - -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvsimulator \ - -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ - -S . -cmake --build build-tvos-sim --config Release -- -quiet - -echo "Building for tvOS devices..." -cmake -B build-tvos-device -G Xcode \ - "${COMMON_CMAKE_ARGS[@]}" \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=${TVOS_MIN_OS_VERSION} \ - -DCMAKE_SYSTEM_NAME=tvOS \ - -DCMAKE_OSX_SYSROOT=appletvos \ - -DCMAKE_OSX_ARCHITECTURES="arm64" \ - -DGGML_METAL=ON \ - -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvos \ - -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ - -S . -cmake --build build-tvos-device --config Release -- -quiet - -# Setup frameworks and copy binaries and headers -echo "Setting up framework structures..." -setup_framework_structure "build-ios-sim" ${IOS_MIN_OS_VERSION} "ios" -setup_framework_structure "build-ios-device" ${IOS_MIN_OS_VERSION} "ios" -setup_framework_structure "build-macos" ${MACOS_MIN_OS_VERSION} "macos" -setup_framework_structure "build-visionos" ${VISIONOS_MIN_OS_VERSION} "visionos" -setup_framework_structure "build-visionos-sim" ${VISIONOS_MIN_OS_VERSION} "visionos" -setup_framework_structure "build-tvos-sim" ${TVOS_MIN_OS_VERSION} "tvos" -setup_framework_structure "build-tvos-device" ${TVOS_MIN_OS_VERSION} "tvos" - -# Create dynamic libraries from static libraries -echo "Creating dynamic libraries from static libraries..." -combine_static_libraries "build-ios-sim" "Release-iphonesimulator" "ios" "true" -combine_static_libraries "build-ios-device" "Release-iphoneos" "ios" "false" -combine_static_libraries "build-macos" "Release" "macos" "false" -combine_static_libraries "build-visionos" "Release-xros" "visionos" "false" -combine_static_libraries "build-visionos-sim" "Release-xrsimulator" "visionos" "true" -combine_static_libraries "build-tvos-sim" "Release-appletvsimulator" "tvos" "true" -combine_static_libraries "build-tvos-device" "Release-appletvos" "tvos" "false" - -# Create XCFramework with correct debug symbols paths -echo "Creating XCFramework..." 
-xcodebuild -create-xcframework \ - -framework $(pwd)/build-ios-sim/framework/llama.framework \ - -debug-symbols $(pwd)/build-ios-sim/dSYMs/llama.dSYM \ - -framework $(pwd)/build-ios-device/framework/llama.framework \ - -debug-symbols $(pwd)/build-ios-device/dSYMs/llama.dSYM \ - -framework $(pwd)/build-macos/framework/llama.framework \ - -debug-symbols $(pwd)/build-macos/dSYMS/llama.dSYM \ - -framework $(pwd)/build-visionos/framework/llama.framework \ - -debug-symbols $(pwd)/build-visionos/dSYMs/llama.dSYM \ - -framework $(pwd)/build-visionos-sim/framework/llama.framework \ - -debug-symbols $(pwd)/build-visionos-sim/dSYMs/llama.dSYM \ - -framework $(pwd)/build-tvos-device/framework/llama.framework \ - -debug-symbols $(pwd)/build-tvos-device/dSYMs/llama.dSYM \ - -framework $(pwd)/build-tvos-sim/framework/llama.framework \ - -debug-symbols $(pwd)/build-tvos-sim/dSYMs/llama.dSYM \ - -output $(pwd)/build-apple/llama.xcframework diff --git a/podman_compile.sh b/podman_compile.sh index ec243f75ee89f..de9e5c88d57a7 100755 --- a/podman_compile.sh +++ b/podman_compile.sh @@ -10,7 +10,7 @@ opts="" opts="$opts --device /dev/dri " echo "Running with the GPU passthrough" -image=localhost/pytorch:remoting +IMAGE=quay.io/ramalama/remoting:latest what=${1:-} if [[ -z "$what" ]]; then @@ -30,9 +30,10 @@ podman run \ --security-opt label=disable \ --env HOME="$HOME" \ --env PERF_MODE="${PERF_MODE:-}" \ +--env BENCH_MODE="${BENCH_MODE:-}" \ -v "$HOME":"$HOME":Z \ -w "$PWD" \ -it --rm \ $opts \ -$image \ +$IMAGE \ $cmd diff --git a/prepare.remoting.sh b/prepare.remoting.sh index aebb75c031422..5ab73470477b1 100755 --- a/prepare.remoting.sh +++ b/prepare.remoting.sh @@ -2,5 +2,7 @@ cmake -S . -B ../build.remoting-frontend \ -DGGML_REMOTINGFRONTEND=ON \ -DGGML_CPU_ARM_ARCH=native \ -DGGML_NATIVE=OFF \ + -DGGML_OPENMP=OFF \ + -DLLAMA_CURL=OFF \ -DCMAKE_BUILD_TYPE=Debug \ "$@" diff --git a/run.remoting.sh b/run.remoting.sh index 9a8ce4d34c74a..017f3fe58c9ff 100755 --- a/run.remoting.sh +++ b/run.remoting.sh @@ -8,39 +8,34 @@ else prefix="" fi -if [[ "${PERF_MODE:-}" ]]; then - FLAVOR="-prod" -else - FLAVOR="" -fi - MODEL=${MODEL:-llama3.2} -if [[ "$FLAVOR" == "-prod" ]]; then - cat < Date: Tue, 17 Jun 2025 17:40:56 +0200 Subject: [PATCH 111/117] ggml: src: ggml-remotingfrontend/virtgpu-shm: import the cpp atomic --- ggml/src/ggml-remotingfrontend/virtgpu-shm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-shm.h b/ggml/src/ggml-remotingfrontend/virtgpu-shm.h index e5770b1916886..52217f5b7e857 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-shm.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-shm.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include "virtgpu.h" From 65b92b9ad6c642eed6fc6e9e3cb8059cc435d018 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 19 Jun 2025 15:14:57 +0200 Subject: [PATCH 112/117] remoting: reintroduce the support for support_op(tensor) --- .../backend-dispatched-device.cpp | 2 +- .../shared/apir_backend.h | 3 + .../shared/venus_cs_ggml.h | 69 +++++++++++++++++++ .../ggml-backend-device.cpp | 7 -- .../src/ggml-remotingfrontend/ggml-remoting.h | 4 ++ .../venus_cs_ggml-rpc-front.cpp | 9 ++- .../virtgpu-forward-device.cpp | 4 +- 7 files changed, 86 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp index 5bf0788ccf864..473e9d2db7089 100644 --- 
a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp @@ -73,7 +73,7 @@ uint32_t backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { UNUSED(ctx); - const ggml_tensor *op = vn_decode_ggml_tensor(dec); + const ggml_tensor *op = vn_decode_ggml_tensor_inplace(dec); bool supports_op = dev->iface.supports_op(dev, op); diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 4146908813c6d..8125a30e386e4 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -9,6 +9,9 @@ #define APIR_BACKEND_FORWARD_INDEX_INVALID 6 +// 1 is fast, 0 avoid micro-benchmark crashes +#define APIR_DEVICE_SUPPORTS_OP_ALWAYS_TRUE 0 + typedef uintptr_t apir_buffer_type_host_handle_t; typedef uintptr_t apir_buffer_host_handle_t; diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h index 71e15f847e851..71c9b3f3ed820 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h @@ -165,3 +165,72 @@ vn_decode_ggml_cgraph(struct vn_cs_decoder *dec, size_t cgraph_size) { return deserialize_graph(n_nodes, n_tensors, tensors, nodes); } + +static inline void +vn_encode_ggml_buffer_handle(struct vn_cs_encoder *enc, const apir_buffer_host_handle_t *handle) { + vn_cs_encoder_write(enc, sizeof(*handle), &handle, sizeof(*handle)); +} + +static inline void +vn_encode_ggml_tensor_inline(struct vn_cs_encoder *enc, const ggml_tensor *tensor) { + size_t tensor_size = sizeof(*tensor); + + if (tensor->extra) { + FATAL("Cannot pass tensors with extra"); + } + + if (tensor->src[0] && tensor->buffer) { + static int first = 1; + if (first) { + // not sure if the buffer needs to be updated inside the src tensors or not + WARNING("Cannot pass tensors with src and buffer"); + first = 0; + } + } + + vn_cs_encoder_write(enc, tensor_size, tensor, tensor_size); + + // tensor->data is a pointer inside the device buffer. No need to touch it + // tensor->buffer is a pointer to a buffer. Encoding the buffer handle in sequence. + // (could also make a copy of the tensor, and update locally.) + + if (tensor->buffer) { + apir_buffer_host_handle_t buffer_handle = ggml_buffer_to_apir_handle(tensor->buffer); + vn_encode_ggml_buffer_handle(enc, &buffer_handle); + } + + if (tensor->view_src) { + vn_cs_encoder_write(enc, tensor_size, tensor->view_src, tensor_size); + } + + for (int i = 0; tensor->src[i]; i++) { + const ggml_tensor *tensor_src = tensor->src[i]; + vn_cs_encoder_write(enc, tensor_size, tensor_src, tensor_size); + } +} + +static inline const ggml_tensor * +vn_decode_ggml_tensor_inplace(struct vn_cs_decoder *dec) { + + // it safe to remove the `const` qualifier here, we *do* want to + // modify the shared memory data to fix the `src` pointers. + ggml_tensor *tensor = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); + + // tensor->data is a pointer inside the device buffer. No need to touch it + // tensor->buffer is a pointer to a buffer. Decode the buffer handle encoded in sequence. 
+ if (tensor->buffer) { + tensor->buffer = vn_decode_ggml_buffer(dec); + } + + if (tensor->view_src) { + ggml_tensor *tensor_view_src = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); + tensor->view_src = tensor_view_src; + } + + for (int i = 0; tensor->src[i]; i++) { + ggml_tensor *tensor_src = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); + tensor->src[i] = tensor_src; // overwrite op->src[i] pointer with the actual location of the src tensor + } + + return tensor; +} diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index dfe1e992c9dac..1fa661e3b60d6 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -38,16 +38,9 @@ ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, s static bool ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { -#if 1 - UNUSED(dev); - UNUSED(op); - - return true; // same as ggml-rpc -#else struct virtgpu *gpu = DEV_TO_GPU(dev); return apir_device_supports_op(gpu, op); -#endif } static bool diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index 18b880c740564..cd58ed674475d 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -126,3 +126,7 @@ struct remoting_context_struct { }; typedef std::shared_ptr remoting_context; typedef std::weak_ptr remoting_context_ref; + +static inline apir_buffer_host_handle_t ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) { + return BUFFER_TO_HOST_HANDLE(buffer); +} diff --git a/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp b/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp index 67b8c37748aa8..53c42730fad06 100644 --- a/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp +++ b/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp @@ -40,8 +40,13 @@ serialize_tensor(const ggml_tensor * tensor) { result.view_src = reinterpret_cast(tensor->view_src); result.view_offs = tensor->view_offs; result.data = reinterpret_cast(tensor->data); - // tensor->data is serialized as an offset to the buffer base address - result.data -= reinterpret_cast(BUFFER_TO_GGML_CONTEXT(tensor->buffer)->base); + if (tensor->data) { + if (!tensor->buffer) { + FATAL("tensor has data but not buffer :/"); + } + // tensor->data is serialized as an offset to the buffer base address + result.data -= reinterpret_cast(BUFFER_TO_GGML_CONTEXT(tensor->buffer)->base); + } snprintf(result.name, GGML_MAX_NAME, "%s", tensor->name); return result; } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp index 06ad6d445de4c..ca036366a6752 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp @@ -135,7 +135,7 @@ apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total) { bool apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op) { -#if 1 +#if APIR_DEVICE_SUPPORTS_OP_ALWAYS_TRUE /* ggml-rpc cheats it like this */ /* with the current implementation of serialize_tensor, the src/view aren't properly passed */ UNUSED(gpu); @@ -147,7 +147,7 @@ apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op) { struct vn_cs_decoder *decoder; 
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP); - vn_encode_ggml_tensor(encoder, op); + vn_encode_ggml_tensor_inline(encoder, op); REMOTE_CALL(gpu, encoder, decoder); From 34e68b5df1e073494796efc3542a56d72b186520 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 19 Jun 2025 15:15:39 +0200 Subject: [PATCH 113/117] remotingbackend: add an optional call to support_op to avoid crashing the backend if the tensor is not supported --- .../backend-dispatched-backend.cpp | 15 +++++++++++++++ .../ggml-remotingbackend/shared/apir_backend.h | 3 +++ 2 files changed, 18 insertions(+) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp index 6e600843a48db..f15f39c7f92d8 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp @@ -32,6 +32,21 @@ backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, stru ggml_cgraph *cgraph = vn_decode_ggml_cgraph(&secondary_dec, cgraph_size); ggml_status status; +#if APIR_BACKEND_CHECK_SUPPORTS_OP == 1 + for (int idx = 0; idx < cgraph->n_nodes; idx++) { + ggml_tensor *op = ggml_graph_node(cgraph, idx); + if (dev->iface.supports_op(dev, op)) { + continue; + } + ERROR("Graph node %d (%s) not supported by the backend :/", idx, ggml_op_desc(op)); + + status = GGML_STATUS_ABORTED; + vn_encode_ggml_status(enc, &status); + + stop_timer(&graph_compute_timer); + return 0; + } +#endif status = bck->iface.graph_compute(bck, cgraph); vn_encode_ggml_status(enc, &status); diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 8125a30e386e4..6d44108ef7a61 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -12,6 +12,9 @@ // 1 is fast, 0 avoid micro-benchmark crashes #define APIR_DEVICE_SUPPORTS_OP_ALWAYS_TRUE 0 +// 0 is fast, 1 avoids the backend to crash if an unsupported tensor is received +#define APIR_BACKEND_CHECK_SUPPORTS_OP 0 + typedef uintptr_t apir_buffer_type_host_handle_t; typedef uintptr_t apir_buffer_host_handle_t; From 1d4bbef12db5be71b6ac82203a582b99a539b7dd Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 19 Jun 2025 15:16:19 +0200 Subject: [PATCH 114/117] remotingfrontend: reduce and cleanup the logging --- .../ggml-remotingfrontend/ggml-backend-buffer-type.cpp | 10 +++------- ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp | 3 +-- .../ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp | 4 ---- 3 files changed, 4 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp index 70fc829c24fa4..eb4e3b2940721 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp @@ -5,7 +5,7 @@ static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - IMPLEMENTED; + IMPLEMENTED_ONCE; struct virtgpu *gpu = BUFT_TO_GPU(buft); struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) malloc(sizeof(*context)); @@ -29,9 +29,6 @@ ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, context->is_host_buffer = false; ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, 
ggml_backend_remoting_buffer_interface, (void *) context, size); - INFO("##"); - INFO("## %s(%llx) --> %p <---------------", __func__, size, buffer); - INFO("##\n"); return buffer; } @@ -47,8 +44,7 @@ ggml_backend_remoting_buffer_type_get_name(ggml_backend_buffer_type_t buft) { static size_t ggml_backend_remoting_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - IMPLEMENTED; - + IMPLEMENTED_ONCE; struct virtgpu *gpu = BUFT_TO_GPU(buft); return apir_buffer_type_get_alignment(gpu, buft); @@ -56,7 +52,7 @@ ggml_backend_remoting_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) static size_t ggml_backend_remoting_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { - IMPLEMENTED; + IMPLEMENTED_ONCE; struct virtgpu *gpu = BUFT_TO_GPU(buft); return apir_buffer_type_get_max_size(gpu, buft); diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index 1fa661e3b60d6..b17b43cd8d55f 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -20,8 +20,7 @@ ggml_backend_remoting_device_get_description(ggml_backend_dev_t dev) { static enum ggml_backend_dev_type ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) { - IMPLEMENTED; - + IMPLEMENTED_ONCE; struct virtgpu *gpu = DEV_TO_GPU(dev); return (enum ggml_backend_dev_type) apir_device_get_type(gpu); diff --git a/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp b/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp index 53c42730fad06..7ce0dbb7fbc67 100644 --- a/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp +++ b/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp @@ -18,10 +18,6 @@ serialize_tensor(const ggml_tensor * tensor) { ggml_backend_buffer_t buffer = tensor->buffer; result.buffer = BUFFER_TO_HOST_HANDLE(buffer); - if (result.buffer < 0x600000000000 || result.buffer > 0x700000000000) { - INFO("pass buffer handle %p", result.buffer); - BREAKPOINT; - } } else { result.buffer = 0; } From 67d00e7b6f60220341ec841456571f2fe65424de Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 19 Jun 2025 15:18:11 +0200 Subject: [PATCH 115/117] remotingfrontend: cache some values --- .../ggml-backend-buffer-type.cpp | 15 +++++++++++++-- .../ggml-remotingfrontend/ggml-backend-device.cpp | 9 ++++++++- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp index eb4e3b2940721..b655b8018f80d 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp @@ -47,7 +47,13 @@ ggml_backend_remoting_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) IMPLEMENTED_ONCE; struct virtgpu *gpu = BUFT_TO_GPU(buft); - return apir_buffer_type_get_alignment(gpu, buft); + static size_t align = 0; + + if (align == 0) { + align = apir_buffer_type_get_alignment(gpu, buft); + } + + return align; } static size_t @@ -55,7 +61,12 @@ ggml_backend_remoting_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) IMPLEMENTED_ONCE; struct virtgpu *gpu = BUFT_TO_GPU(buft); - return apir_buffer_type_get_max_size(gpu, buft); + static size_t max_size = 0; + if (max_size == 0) { + max_size = apir_buffer_type_get_max_size(gpu, buft); + } + + return max_size; } static bool diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp 
b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index b17b43cd8d55f..6f498d0edc2e4 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -23,7 +23,14 @@ ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) { IMPLEMENTED_ONCE; struct virtgpu *gpu = DEV_TO_GPU(dev); - return (enum ggml_backend_dev_type) apir_device_get_type(gpu); + static enum ggml_backend_dev_type type; + static bool has_type = false; + if (!has_type) { + has_type = true; + type = (enum ggml_backend_dev_type) apir_device_get_type(gpu); + } + + return type; } static void From a6186a1c86fd72cad460648ed6c924e1694717c5 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 19 Jun 2025 17:33:03 +0200 Subject: [PATCH 116/117] Update the custom scripts --- build.backend.sh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/build.backend.sh b/build.backend.sh index dc0b6007e3123..2904c4a15c73f 100755 --- a/build.backend.sh +++ b/build.backend.sh @@ -20,8 +20,14 @@ if [[ "$FLAVOR" == "-prod" ]]; then EOF fi -WHAT="llama-run llama-bench" -cmake --build ../build.remoting-backend$FLAVOR --parallel 8 --target $WHAT "$@" +TARGETS="llama-run" +if [[ "${BENCH_MODE:-}" == "bench" ]]; then + TARGETS="$TARGETS llama-bench" +elif [[ "${BENCH_MODE:-}" == "perf" ]]; then + TARGETS="$TARGETS test-backend-ops" +fi + +cmake --build ../build.remoting-backend$FLAVOR --parallel 8 --target $TARGETS "$@" if [[ $? == 0 ]]; then touch READY_backend From 61a6bdd4ae64e4402c4d2ca4a035636c3b4ab928 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 19 Jun 2025 17:33:27 +0200 Subject: [PATCH 117/117] remotingbackend: set APIR_DEVICE_SUPPORTS_OP_ALWAYS_TRUE = 1 --- ggml/src/ggml-remotingbackend/shared/apir_backend.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 6d44108ef7a61..80e5961ff04b5 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -10,7 +10,7 @@ #define APIR_BACKEND_FORWARD_INDEX_INVALID 6 // 1 is fast, 0 avoid micro-benchmark crashes -#define APIR_DEVICE_SUPPORTS_OP_ALWAYS_TRUE 0 +#define APIR_DEVICE_SUPPORTS_OP_ALWAYS_TRUE 1 // 0 is fast, 1 avoids the backend to crash if an unsupported tensor is received #define APIR_BACKEND_CHECK_SUPPORTS_OP 0
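
Note on the last three patches: together they make operator-support negotiation between the guest-side frontend and the host-side backend configurable. APIR_DEVICE_SUPPORTS_OP_ALWAYS_TRUE short-circuits the per-tensor supports_op forwarding in apir_device_supports_op() (fast, but an unsupported op then only surfaces at graph-compute time), while APIR_BACKEND_CHECK_SUPPORTS_OP makes backend_graph_compute() re-validate every node and return GGML_STATUS_ABORTED instead of letting the backend crash. The self-contained C++ sketch below only illustrates how the two flags interact; fake_op, fake_graph and host_supports_op are illustrative stand-ins, not symbols from the patches, and the sketch picks the defensive flag values (0/1) so both checks are exercised, whereas patch 117 selects the fast combination (1/0).

    // sketch.cpp -- illustrative only; types and helpers are stand-ins.
    #include <cstdio>
    #include <vector>

    // Flag names match ggml/src/ggml-remotingbackend/shared/apir_backend.h;
    // values here are the defensive combination, not the patch-117 defaults.
    #define APIR_DEVICE_SUPPORTS_OP_ALWAYS_TRUE 0  // frontend: forward the supports_op query
    #define APIR_BACKEND_CHECK_SUPPORTS_OP      1  // backend: re-check each node before compute

    struct fake_op    { const char *name; bool host_supported; };
    struct fake_graph { std::vector<fake_op> nodes; };

    // Stand-in for the host-side dev->iface.supports_op() call.
    bool host_supports_op(const fake_op &op) { return op.host_supported; }

    // Frontend view: what ggml_backend_remoting_device_supports_op() reports to llama.cpp.
    bool frontend_supports_op(const fake_op &op) {
    #if APIR_DEVICE_SUPPORTS_OP_ALWAYS_TRUE
        (void) op;
        return true;                  // same shortcut as ggml-rpc: claim everything
    #else
        return host_supports_op(op);  // stands in for the virtgpu round-trip
    #endif
    }

    // Backend view: graph_compute either trusts the frontend or re-validates.
    bool backend_graph_compute(const fake_graph &g) {
    #if APIR_BACKEND_CHECK_SUPPORTS_OP == 1
        for (const fake_op &op : g.nodes) {
            if (!host_supports_op(op)) {
                std::printf("graph node %s not supported, aborting compute\n", op.name);
                return false;         // GGML_STATUS_ABORTED in the real patch
            }
        }
    #endif
        std::printf("computing %zu nodes\n", g.nodes.size());
        return true;
    }

    int main() {
        fake_graph g{{{"MUL_MAT", true}, {"EXOTIC_OP", false}}};
        for (const fake_op &op : g.nodes)
            std::printf("frontend says %s supported: %d\n", op.name, frontend_supports_op(op));
        backend_graph_compute(g);
    }

Compiled and run as-is, the sketch reports EXOTIC_OP as unsupported on the frontend and aborts the graph on the backend; with the fast combination (1/0) both guards compile out, which is the trade-off the "is fast" / "avoid crashes" comments in apir_backend.h describe.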