From 74eaeb89e515df5919392373273be1e24db90abb Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 9 Apr 2025 13:25:54 +0200 Subject: [PATCH 001/117] src: reduce the logging --- src/llama-kv-cache.cpp | 2 ++ src/llama-model-loader.cpp | 2 +- src/llama-vocab.cpp | 4 ++-- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index dbf5f1187d9e5..04d593ce21477 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -86,8 +86,10 @@ bool llama_kv_cache_unified::init( buft = ggml_backend_cpu_buffer_type(); } + /* LLAMA_LOG_DEBUG("%s: layer %3d: n_embd_k_gqa = %d, n_embd_v_gqa = %d, dev = %s\n", __func__, i, n_embd_k_gqa, n_embd_v_gqa, dev_name); + */ ggml_context * ctx = ctx_for_buft(buft); if (!ctx) { diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index ea73a8a7ba944..36f8d1cbf0323 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -668,7 +668,7 @@ llama_model_loader::llama_model_loader( } replace_all(value, "\n", "\\n"); - LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); + //LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); } // print type counts diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 0feabd95aaf2b..a9c24e78812ac 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1974,8 +1974,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { } else { // token is control, but not marked as EOG -> print a debug log if (id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && special_eog_ids.count(t.second) == 0) { - LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n", - __func__, t.second, t.first.c_str()); + //LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n", + // __func__, t.second, t.first.c_str()); } } } From 41846f348b920d825a93a8e00316165b7c07d685 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 9 Apr 2025 13:26:56 +0200 Subject: [PATCH 002/117] Add helper scripts --- prepare.remoting.sh | 6 ++++++ prepare.sh | 1 + prepare.vulkan.sh | 1 + run.remoting.sh | 16 ++++++++++++++++ run.sh | 1 + run.vulkan.sh | 1 + 6 files changed, 26 insertions(+) create mode 100755 prepare.remoting.sh create mode 100644 prepare.sh create mode 100644 prepare.vulkan.sh create mode 100755 run.remoting.sh create mode 100755 run.sh create mode 100755 run.vulkan.sh diff --git a/prepare.remoting.sh b/prepare.remoting.sh new file mode 100755 index 0000000000000..aebb75c031422 --- /dev/null +++ b/prepare.remoting.sh @@ -0,0 +1,6 @@ +cmake -S . -B ../build.remoting-frontend \ + -DGGML_REMOTINGFRONTEND=ON \ + -DGGML_CPU_ARM_ARCH=native \ + -DGGML_NATIVE=OFF \ + -DCMAKE_BUILD_TYPE=Debug \ + "$@" diff --git a/prepare.sh b/prepare.sh new file mode 100644 index 0000000000000..2fb46cefd426c --- /dev/null +++ b/prepare.sh @@ -0,0 +1 @@ +cmake -S . -B ./build -DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DGGML_METAL=OFF #-DCMAKE_BUILD_TYPE=Debug #-DGGML_VULKAN_DEBUG=1 diff --git a/prepare.vulkan.sh b/prepare.vulkan.sh new file mode 100644 index 0000000000000..29d0794ebe4e3 --- /dev/null +++ b/prepare.vulkan.sh @@ -0,0 +1 @@ +cmake -S . -B ../build.vulkan -DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DGGML_METAL=OFF diff --git a/run.remoting.sh b/run.remoting.sh new file mode 100755 index 0000000000000..c6fbdaac435a5 --- /dev/null +++ b/run.remoting.sh @@ -0,0 +1,16 @@ +#! 
/bin/bash + +if [[ ${1:-} == "gdb" ]]; then + prefix="gdb --args" +else + prefix="" +fi + +MODEL="$HOME/models/llama3.2" +PROMPT="say nothing" +$prefix \ + ../build.remoting-frontend/bin/llama-run \ + --ngl 99 \ + --verbose \ + "$MODEL" \ + "$PROMPT" diff --git a/run.sh b/run.sh new file mode 100755 index 0000000000000..13d8c042515f0 --- /dev/null +++ b/run.sh @@ -0,0 +1 @@ +./build/bin/llama-run --ngl 999 --verbose ~/models/llama3.2 "say nothing" diff --git a/run.vulkan.sh b/run.vulkan.sh new file mode 100755 index 0000000000000..7f44334290bbf --- /dev/null +++ b/run.vulkan.sh @@ -0,0 +1 @@ +../build.vulkan/bin/llama-run --ngl 99 --verbose ~/models/llama3.2 "say nothing" From ee79e12d0a40ef5f5a7e714248746f4c42ea475b Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 9 Apr 2025 13:26:46 +0200 Subject: [PATCH 003/117] build-system: integrate the Remoting Frontend backend build --- CMakePresets.json | 1 + Makefile | 8 ++++++++ ggml/CMakeLists.txt | 2 ++ ggml/src/CMakeLists.txt | 1 + 4 files changed, 12 insertions(+) diff --git a/CMakePresets.json b/CMakePresets.json index 13bdd7907ab40..c5369a47f6bf9 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -30,6 +30,7 @@ { "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } }, { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } }, { "name": "vulkan", "hidden": true, "cacheVariables": { "GGML_VULKAN": "ON" } }, + { "name": "remoting_frontend", "hidden": true, "cacheVariables": { "GGML_REMOTING_FRONTEND": "ON" } }, { "name": "x64-windows-llvm", "hidden": true, diff --git a/Makefile b/Makefile index 1f9455eff0aec..ebf9f79ed5598 100644 --- a/Makefile +++ b/Makefile @@ -716,6 +716,11 @@ ggml/src/ggml-cuda/ggml-cuda.o: \ $(NVCC_COMPILE) endif # GGML_CUDA +ifdef GGML_REMOTING_FRONTEND + MK_CPPFLAGS += -DGGML_USE_REMOTINGFRONTEND + OBJ_GGML_EXT += ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.o +endif + ifdef GGML_VULKAN MK_CPPFLAGS += -DGGML_USE_VULKAN MK_LDFLAGS += $(shell pkg-config --libs vulkan) @@ -755,6 +760,9 @@ _ggml_vk_shader_deps = $(echo $(_ggml_vk_input_dir)/*.comp) ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source) $(CXX) $(CXXFLAGS) $(shell pkg-config --cflags vulkan) -c $< -o $@ +ggml/src/ggml-remotingfrontend/frontend.o: ggml/src/ggml-remotingfrontend/frontend.cpp + $(CXX) $(CXXFLAGS) -c $< -o $@ + $(_ggml_vk_header): $(_ggml_vk_source) $(_ggml_vk_source): $(_ggml_vk_shader_deps) vulkan-shaders-gen diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index d33f843b417cf..24c47aea122a2 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -179,6 +179,7 @@ option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug in option(GGML_VULKAN_PERF "ggml: enable Vulkan perf output" OFF) option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF) option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF) +option(GGML_REMOTING_FRONTEND "ggml: use the API Remoting frontend" OFF) option(GGML_KOMPUTE "ggml: use Kompute" OFF) option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT}) option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF) @@ -269,6 +270,7 @@ set(GGML_PUBLIC_HEADERS include/ggml-rpc.h include/ggml-sycl.h include/ggml-vulkan.h + include/ggml-remoting-frontend.h include/gguf.h) set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index f00700da71fcd..76c3f3d27fc16 100644 
--- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -309,6 +309,7 @@ ggml_add_backend(MUSA) ggml_add_backend(RPC) ggml_add_backend(SYCL) ggml_add_backend(Vulkan) +ggml_add_backend(RemotingFrontend) ggml_add_backend(OpenCL) foreach (target ggml-base ggml) From 1bedad3cc38c5481b20a280f14c56565cdef677d Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 9 Apr 2025 13:27:51 +0200 Subject: [PATCH 004/117] ggml: ggml-remotingfrontend: stubs of a new backend --- ggml/include/ggml-remoting-frontend.h | 16 + ggml/src/ggml-backend-reg.cpp | 8 + ggml/src/ggml-remotingfrontend/CMakeLists.txt | 20 + .../ggml-remoting-frontend.cpp | 499 ++++++++++++++++++ 4 files changed, 543 insertions(+) create mode 100644 ggml/include/ggml-remoting-frontend.h create mode 100644 ggml/src/ggml-remotingfrontend/CMakeLists.txt create mode 100644 ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.cpp diff --git a/ggml/include/ggml-remoting-frontend.h b/ggml/include/ggml-remoting-frontend.h new file mode 100644 index 0000000000000..c32c283820dea --- /dev/null +++ b/ggml/include/ggml-remoting-frontend.h @@ -0,0 +1,16 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define GGML_REMOTING_NAME "RemotingFrontend" + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_remoting_reg(); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 405d8e31514b5..8ed3c36362bcd 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -45,6 +45,10 @@ #include "ggml-vulkan.h" #endif +#ifdef GGML_USE_REMOTINGFRONTEND +#include "ggml-remoting-frontend.h" +#endif + #ifdef GGML_USE_OPENCL #include "ggml-opencl.h" #endif @@ -172,6 +176,9 @@ struct ggml_backend_registry { #ifdef GGML_USE_VULKAN register_backend(ggml_backend_vk_reg()); #endif +#ifdef GGML_USE_REMOTINGFRONTEND + register_backend(ggml_backend_remoting_reg()); +#endif #ifdef GGML_USE_OPENCL register_backend(ggml_backend_opencl_reg()); #endif @@ -575,6 +582,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) { ggml_backend_load_best("rpc", silent, dir_path); ggml_backend_load_best("sycl", silent, dir_path); ggml_backend_load_best("vulkan", silent, dir_path); + ggml_backend_load_best("remoting_frontend", silent, dir_path); ggml_backend_load_best("opencl", silent, dir_path); ggml_backend_load_best("musa", silent, dir_path); ggml_backend_load_best("cpu", silent, dir_path); diff --git a/ggml/src/ggml-remotingfrontend/CMakeLists.txt b/ggml/src/ggml-remotingfrontend/CMakeLists.txt new file mode 100644 index 0000000000000..4ab2aaa0ac340 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/CMakeLists.txt @@ -0,0 +1,20 @@ +cmake_minimum_required(VERSION 3.19) +cmake_policy(SET CMP0114 NEW) + +# function(detect_host_compiler) +# find_program(HOST_C_COMPILER NAMES gcc clang NO_CMAKE_FIND_ROOT_PATH) +# find_program(HOST_CXX_COMPILER NAMES g++ clang++ NO_CMAKE_FIND_ROOT_PATH) + +# set(HOST_C_COMPILER "${HOST_C_COMPILER}" PARENT_SCOPE) +# set(HOST_CXX_COMPILER "${HOST_CXX_COMPILER}" PARENT_SCOPE) +# endfunction() + +message(STATUS "Enable API Remoting frontend found") + +ggml_add_backend_library(ggml-remotingfrontend + ggml-remoting-frontend.cpp + ../../include/ggml-remoting-frontend.h + ) + +#target_link_libraries(ggml-remotingfrontend PRIVATE remotingfrontend) +target_include_directories(ggml-remotingfrontend PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.cpp 
b/ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.cpp new file mode 100644 index 0000000000000..4c7c1f1dc8f95 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.cpp @@ -0,0 +1,499 @@ +#include "ggml-remoting-frontend.h" + +#include +#include +#include +#include +#include +#include + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" + +#define RMT_LOG_DEBUG(msg) std::cerr << msg << std::endl + +#define UNUSED GGML_UNUSED + +int ggml_backend_remoting_get_device_count(); +ggml_backend_buffer_type_t ggml_backend_remoting_host_buffer_type(); + +static void * const remoting_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT + + +struct ggml_backend_remoting_buffer_type_context { + std::string name; +}; + +struct remoting_context_struct { + int i; +}; +typedef std::shared_ptr remoting_context; +typedef std::weak_ptr remoting_context_ref; + +static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) { + UNUSED(reg); + return ggml_backend_remoting_get_device_count(); +} + +static const char * ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) { + UNUSED(reg); + return GGML_REMOTING_NAME; +} + +struct ggml_backend_remoting_device_context { + size_t device; + std::string name; + std::string description; +}; + +static const char * ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) { + UNUSED(dev); + return "API Remoting"; +} + +static const char * ggml_backend_remoting_device_get_description(ggml_backend_dev_t dev) { + UNUSED(dev); + return "API Remoting device"; +} + +static enum ggml_backend_dev_type ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) { + UNUSED(dev); + return GGML_BACKEND_DEVICE_TYPE_GPU; +} + +static void ggml_backend_remoting_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) { + UNUSED(device); + *total = 1024*1024*1024; + *free = *total; +} + +struct remoting_device_struct { + std::mutex mutex; +}; + +struct remoting_device_struct; +typedef std::shared_ptr remoting_device; +typedef std::weak_ptr remoting_device_ref; + +struct remoting_buffer_struct; +typedef std::shared_ptr remoting_buffer; +typedef std::weak_ptr remoting_buffer_ref; + +// vk buffer type +static const char * ggml_backend_remoting_buffer_type_name(ggml_backend_buffer_type_t buft) { + UNUSED(buft); + + return "Remoting buffer"; +} + +static void ggml_remoting_destroy_buffer(remoting_buffer& buf) { + UNUSED(buf); +} + + +static void ggml_remoting_buffer_memset(remoting_buffer& dst, size_t offset, uint32_t c, size_t size) { + UNUSED(dst); + UNUSED(c); + UNUSED(size); + UNUSED(offset); +} + +static void ggml_remoting_buffer_memset_async(remoting_context& ctx, remoting_buffer& dst, size_t offset, uint32_t c, size_t size) { + UNUSED(ctx); + UNUSED(dst); + UNUSED(c); + UNUSED(size); + UNUSED(offset); +} + + +static uint64_t remoting_tensor_offset(const ggml_tensor * tensor) { + if (tensor->view_src) { + return (uint8_t *) tensor->view_src->data - (uint8_t *) remoting_ptr_base; + } + return (uint8_t *) tensor->data - (uint8_t *) remoting_ptr_base; +} + +struct ggml_backend_remoting_buffer_context { + remoting_device_ref device; + remoting_buffer dev_buffer; + std::string name; + + ggml_backend_remoting_buffer_context(remoting_device_ref device, remoting_buffer&& dev_buffer, std::string& name) : + name(name) { + UNUSED(device); + UNUSED(dev_buffer); + } + + ~ggml_backend_remoting_buffer_context() { + ggml_remoting_destroy_buffer(dev_buffer); + } +}; + +static void 
ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_remoting_buffer_context * ctx = (ggml_backend_remoting_buffer_context *)buffer->context; + ggml_remoting_destroy_buffer(ctx->dev_buffer); + delete ctx; +} + +static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) { + return (void *) 4096; + + UNUSED(buffer); +} + +static enum ggml_status ggml_backend_remoting_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + if (tensor->view_src != nullptr) { + GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); + } + return GGML_STATUS_SUCCESS; +} + +static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { + UNUSED(buffer); + UNUSED(tensor); + UNUSED(value); + UNUSED(offset); + UNUSED(size); +} + +static void ggml_remoting_buffer_write(remoting_buffer& dst, size_t offset, const void * src, size_t size) { + UNUSED(dst); + UNUSED(offset); + UNUSED(src); + UNUSED(size); +} + +static void ggml_remoting_buffer_read(remoting_buffer& src, size_t offset, void * dst, size_t size) { + UNUSED(src); + UNUSED(offset); + UNUSED(dst); + UNUSED(size); +} + +static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +#if 0 + ggml_backend_remoting_buffer_context * buf_ctx = (ggml_backend_remoting_buffer_context *)buffer->context; + remoting_buffer buf = buf_ctx->dev_buffer; + + ggml_remoting_buffer_write(buf, remoting_tensor_offset(tensor) + tensor->view_offs + offset, data, size); +#else + UNUSED(buffer); + UNUSED(tensor); + UNUSED(data); + UNUSED(offset); + UNUSED(size); +#endif +} + +static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { +#if 0 + ggml_backend_remoting_buffer_context * buf_ctx = (ggml_backend_remoting_buffer_context *)buffer->context; + + remoting_buffer buf = buf_ctx->dev_buffer; + + ggml_remoting_buffer_read(buf, remoting_tensor_offset(tensor) + tensor->view_offs + offset, data, size); +#else + UNUSED(buffer); + UNUSED(tensor); + UNUSED(data); + UNUSED(offset); + UNUSED(size); +#endif +} + +static void ggml_remoting_buffer_copy_async(remoting_context& ctx, remoting_buffer& dst, size_t dst_offset, remoting_buffer& src, size_t src_offset, size_t size) { + UNUSED(ctx); + UNUSED(dst); + UNUSED(dst_offset); + UNUSED(src); + UNUSED(src_offset); + UNUSED(size); +} + +static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { + return true; + + UNUSED(buffer); + UNUSED(src); + UNUSED(dst); +} + +static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_remoting_buffer_context * ctx = (ggml_backend_remoting_buffer_context *)buffer->context; + + ggml_remoting_buffer_memset(ctx->dev_buffer, 0, value, buffer->size); +} + +static ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { + /* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer, + /* .get_base = */ ggml_backend_remoting_buffer_get_base, + /* .init_tensor = */ ggml_backend_remoting_buffer_init_tensor, + /* .memset_tensor = */ ggml_backend_remoting_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor, + /* .cpy_tensor = */ 
ggml_backend_remoting_buffer_cpy_tensor, + /* .clear = */ ggml_backend_remoting_buffer_clear, + /* .reset = */ NULL, +}; + +static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + ggml_backend_remoting_buffer_type_context * ctx = (ggml_backend_remoting_buffer_type_context *) buft->context; + + + return ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, ctx, size); +} + +static size_t ggml_backend_remoting_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + UNUSED(buft); + return 4096; +} + +static size_t ggml_backend_remoting_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + UNUSED(buft); + return 40960; +} + +static size_t ggml_backend_remoting_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { + UNUSED(buft); + UNUSED(tensor); + return ggml_nbytes(tensor); +} + +static ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = { + /* .get_name = */ ggml_backend_remoting_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_remoting_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_remoting_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_remoting_buffer_type_get_max_size, + /* .get_alloc_size = */ ggml_backend_remoting_buffer_type_get_alloc_size, + /* .is_host = */ NULL, +}; + +static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { + + static struct ggml_backend_buffer_type buft { + /* .iface = */ ggml_backend_remoting_buffer_type_interface, + /* .device = */ dev, + /* .context = */ new ggml_backend_remoting_buffer_type_context{ "device_name"}, + }; + + return & buft; +} + +static bool ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { + UNUSED(dev); + UNUSED(op); + + return true; +} + +static bool ggml_backend_remoting_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + UNUSED(dev); + UNUSED(buft); + return true; +} + + +static bool ggml_backend_remoting_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { + const int min_batch_size = 32; + + return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) || + (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID); + + UNUSED(dev); +} + +static const char * ggml_backend_remoting_name(ggml_backend_t backend) { + UNUSED(backend); + + return "API Remoting backend"; +} + +static void ggml_backend_remoting_free(ggml_backend_t backend) { + UNUSED(backend); +} + +static ggml_status ggml_backend_remoting_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + UNUSED(backend); + UNUSED(cgraph); + + return GGML_STATUS_SUCCESS; +} + +static ggml_backend_i ggml_backend_remoting_interface = { + /* .get_name = */ ggml_backend_remoting_name, + /* .free = */ ggml_backend_remoting_free, + /* .set_tensor_async = */ NULL, // ggml_backend_remoting_set_tensor_async, + /* .get_tensor_async = */ NULL, // ggml_backend_remoting_get_tensor_async, + /* .cpy_tensor_async = */ NULL, // ggml_backend_remoting_cpy_tensor_async, + /* .synchronize = */ NULL, // ggml_backend_remoting_synchronize, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_remoting_graph_compute, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, +}; + +static void ggml_backend_remoting_device_get_props(ggml_backend_dev_t 
dev, struct ggml_backend_dev_props * props) { + props->name = ggml_backend_remoting_device_get_name(dev); + props->description = ggml_backend_remoting_device_get_description(dev); + props->type = ggml_backend_remoting_device_get_type(dev); + ggml_backend_remoting_device_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* .async = */ false, + /* .host_buffer = */ true, + /* .buffer_from_host_ptr = */ false, + /* .events = */ false, + }; +} + +static ggml_guid_t ggml_backend_remoting_guid() { + static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b }; + return &guid; +} + + +static ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params) { + UNUSED(params); + ggml_backend_remoting_device_context * ctx = (ggml_backend_remoting_device_context *)dev->context; + + ggml_backend_t remoting_backend = new ggml_backend { + /* .guid = */ ggml_backend_remoting_guid(), + /* .interface = */ ggml_backend_remoting_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_remoting_reg(), ctx->device), + /* .context = */ ctx, + }; + + return remoting_backend; +} + +// host buffer type + +static const char * ggml_backend_remoting_host_buffer_type_name(ggml_backend_buffer_type_t buft) { + return GGML_REMOTING_NAME "_Host"; + + UNUSED(buft); +} + +static void ggml_backend_remoting_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { +# if 0 + ggml_remoting_host_free(remoting_instance.devices[0], buffer->context); +#endif + UNUSED(buffer); +} + +static ggml_backend_buffer_t ggml_backend_remoting_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + + void *ptr = nullptr; + ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); + buffer->buft = buft; + buffer->iface.free_buffer = ggml_backend_remoting_host_buffer_free_buffer; + + return buffer; + UNUSED(buft); +} + +static size_t ggml_backend_remoting_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + UNUSED(buft); + return 4096; +} + +// Should be changed to return device-specific host buffer type +// but that probably requires changes in llama.cpp +ggml_backend_buffer_type_t ggml_backend_remoting_host_buffer_type() { + static struct ggml_backend_buffer_type ggml_backend_remoting_buffer_type_host = { + /* .iface = */ { + /* .get_name = */ ggml_backend_remoting_host_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_remoting_host_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_remoting_host_buffer_type_get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, + /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_remoting_reg(), 0), + /* .context = */ nullptr, + }; + + // Make sure device 0 is initialized + //ggml_remoting_instance_init(); + //ggml_remoting_get_device(0); + + return &ggml_backend_remoting_buffer_type_host; +} + +static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_host_buffer_type(ggml_backend_dev_t dev) { + UNUSED(dev); + return ggml_backend_remoting_host_buffer_type(); +} + +static const struct ggml_backend_device_i ggml_backend_remoting_device_i = { + /* .get_name = */ ggml_backend_remoting_device_get_name, + /* .get_description = */ ggml_backend_remoting_device_get_description, + /* .get_memory = */ ggml_backend_remoting_device_get_memory, + 
+    /* .get_type = */ ggml_backend_remoting_device_get_type,
+    /* .get_props = */ ggml_backend_remoting_device_get_props,
+    /* .init_backend = */ ggml_backend_remoting_device_init,
+    /* .get_buffer_type = */ ggml_backend_remoting_device_get_buffer_type,
+    /* .get_host_buffer_type = */ ggml_backend_remoting_device_get_host_buffer_type,
+    /* .buffer_from_host_ptr = */ NULL,
+    /* .supports_op = */ ggml_backend_remoting_device_supports_op,
+    /* .supports_buft = */ ggml_backend_remoting_device_supports_buft,
+    /* .offload_op = */ ggml_backend_remoting_device_offload_op,
+    /* .event_new = */ NULL,
+    /* .event_free = */ NULL,
+    /* .event_synchronize = */ NULL,
+};
+
+static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) {
+    static std::vector<ggml_backend_dev_t> devices;
+
+    static bool initialized = false;
+
+    {
+        static std::mutex mutex;
+        std::lock_guard<std::mutex> lock(mutex);
+        if (!initialized) {
+            for (size_t i = 0; i < ggml_backend_remoting_reg_get_device_count(reg); i++) {
+                ggml_backend_remoting_device_context * ctx = new ggml_backend_remoting_device_context;
+                char desc[256] = "API Remoting device";
+
+                ctx->device = i;
+                ctx->name = GGML_REMOTING_NAME + std::to_string(i);
+                ctx->description = desc;
+                devices.push_back(new ggml_backend_device {
+                    /* .iface = */ ggml_backend_remoting_device_i,
+                    /* .reg = */ reg,
+                    /* .context = */ ctx,
+                });
+            }
+            initialized = true;
+        }
+    }
+
+    GGML_ASSERT(device < devices.size());
+    return devices[device];
+}
+
+int ggml_backend_remoting_get_device_count() {
+    return 1;
+}
+
+static const struct ggml_backend_reg_i ggml_backend_remoting_reg_i = {
+    /* .get_name = */ ggml_backend_remoting_reg_get_name,
+    /* .get_device_count = */ ggml_backend_remoting_reg_get_device_count,
+    /* .get_device = */ ggml_backend_remoting_reg_get_device,
+    /* .get_proc_address = */ NULL,
+};
+
+ggml_backend_reg_t ggml_backend_remoting_reg() {
+    static ggml_backend_reg reg = {
+        /* .api_version = */ GGML_BACKEND_API_VERSION,
+        /* .iface = */ ggml_backend_remoting_reg_i,
+        /* .context = */ nullptr,
+    };
+
+    RMT_LOG_DEBUG("ggml_backend_remoting_frontend_reg() hello :wave:");
+    return &reg;
+}

From cd5410fe75c3186885fea7d7464c40153f062eb5 Mon Sep 17 00:00:00 2001
From: Kevin Pouget
Date: Wed, 9 Apr 2025 13:47:19 +0200
Subject: [PATCH 005/117] .github: remove

---
 .../ISSUE_TEMPLATE/010-bug-compilation.yml | 87 -
 .github/ISSUE_TEMPLATE/011-bug-results.yml | 101 -
 .github/ISSUE_TEMPLATE/019-bug-misc.yml | 91 -
 .github/ISSUE_TEMPLATE/020-enhancement.yml | 51 -
 .github/ISSUE_TEMPLATE/030-research.yml | 52 -
 .github/ISSUE_TEMPLATE/040-refactor.yml | 28 -
 .github/ISSUE_TEMPLATE/config.yml | 11 -
 .github/actions/windows-setup-curl/action.yml | 25 -
 .github/labeler.yml | 86 -
 .github/pull_request_template.md | 1 -
 .github/workflows/bench.yml.disabled | 304 ---
 .github/workflows/build-linux-cross.yml | 124 --
 .github/workflows/build.yml | 1797 -----------------
 .github/workflows/close-issue.yml | 28 -
 .github/workflows/docker.yml | 175 --
 .github/workflows/editorconfig.yml | 29 -
 .github/workflows/gguf-publish.yml | 44 -
 .github/workflows/labeler.yml | 17 -
 .../workflows/python-check-requirements.yml | 33 -
 .github/workflows/python-lint.yml | 30 -
 .github/workflows/python-type-check.yml | 40 -
 .github/workflows/server.yml | 237 ---
 22 files changed, 3391 deletions(-)
 delete mode 100644 .github/ISSUE_TEMPLATE/010-bug-compilation.yml
 delete mode 100644 .github/ISSUE_TEMPLATE/011-bug-results.yml
 delete mode 100644 .github/ISSUE_TEMPLATE/019-bug-misc.yml
 delete mode 100644
.github/ISSUE_TEMPLATE/020-enhancement.yml delete mode 100644 .github/ISSUE_TEMPLATE/030-research.yml delete mode 100644 .github/ISSUE_TEMPLATE/040-refactor.yml delete mode 100644 .github/ISSUE_TEMPLATE/config.yml delete mode 100644 .github/actions/windows-setup-curl/action.yml delete mode 100644 .github/labeler.yml delete mode 100644 .github/pull_request_template.md delete mode 100644 .github/workflows/bench.yml.disabled delete mode 100644 .github/workflows/build-linux-cross.yml delete mode 100644 .github/workflows/build.yml delete mode 100644 .github/workflows/close-issue.yml delete mode 100644 .github/workflows/docker.yml delete mode 100644 .github/workflows/editorconfig.yml delete mode 100644 .github/workflows/gguf-publish.yml delete mode 100644 .github/workflows/labeler.yml delete mode 100644 .github/workflows/python-check-requirements.yml delete mode 100644 .github/workflows/python-lint.yml delete mode 100644 .github/workflows/python-type-check.yml delete mode 100644 .github/workflows/server.yml diff --git a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml deleted file mode 100644 index b85bf5741e5a3..0000000000000 --- a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml +++ /dev/null @@ -1,87 +0,0 @@ -name: Bug (compilation) -description: Something goes wrong when trying to compile llama.cpp. -title: "Compile bug: " -labels: ["bug-unconfirmed", "compilation"] -body: - - type: markdown - attributes: - value: > - Thanks for taking the time to fill out this bug report! - This issue template is intended for bug reports where the compilation of llama.cpp fails. - Before opening an issue, please confirm that the compilation still fails with `-DGGML_CCACHE=OFF`. - If the compilation succeeds with ccache disabled you should be able to permanently fix the issue - by clearing `~/.cache/ccache` (on Linux). - - type: textarea - id: commit - attributes: - label: Git commit - description: Which commit are you trying to compile? - placeholder: | - $git rev-parse HEAD - 84a07a17b1b08cf2b9747c633a2372782848a27f - validations: - required: true - - type: dropdown - id: operating-system - attributes: - label: Operating systems - description: Which operating systems do you know to be affected? - multiple: true - options: - - Linux - - Mac - - Windows - - BSD - - Other? (Please let us know in description) - validations: - required: true - - type: dropdown - id: backends - attributes: - label: GGML backends - description: Which GGML backends do you know to be affected? - options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan] - multiple: true - validations: - required: true - - type: textarea - id: info - attributes: - label: Problem description & steps to reproduce - description: > - Please give us a summary of the problem and tell us how to reproduce it. - If you can narrow down the bug to specific compile flags, that information would be very much appreciated by us. - placeholder: > - I'm trying to compile llama.cpp with CUDA support on a fresh install of Ubuntu and get error XY. - Here are the exact commands that I used: ... - validations: - required: true - - type: textarea - id: first_bad_commit - attributes: - label: First Bad Commit - description: > - If the bug was not present on an earlier version: when did it start appearing? - If possible, please do a git bisect and identify the exact commit that introduced the bug. 
- validations: - required: false - - type: textarea - id: command - attributes: - label: Compile command - description: > - Please provide the exact command you used to compile llama.cpp. For example: `cmake -B ...`. - This will be automatically formatted into code, so no need for backticks. - render: shell - validations: - required: true - - type: textarea - id: logs - attributes: - label: Relevant log output - description: > - Please copy and paste any relevant log output, including any generated text. - This will be automatically formatted into code, so no need for backticks. - render: shell - validations: - required: true diff --git a/.github/ISSUE_TEMPLATE/011-bug-results.yml b/.github/ISSUE_TEMPLATE/011-bug-results.yml deleted file mode 100644 index 1ccef0793d45e..0000000000000 --- a/.github/ISSUE_TEMPLATE/011-bug-results.yml +++ /dev/null @@ -1,101 +0,0 @@ -name: Bug (model use) -description: Something goes wrong when using a model (in general, not specific to a single llama.cpp module). -title: "Eval bug: " -labels: ["bug-unconfirmed", "model evaluation"] -body: - - type: markdown - attributes: - value: > - Thanks for taking the time to fill out this bug report! - This issue template is intended for bug reports where the model evaluation results - (i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation. - If you encountered the issue while using an external UI (e.g. ollama), - please reproduce your issue using one of the examples/binaries in this repository. - The `llama-cli` binary can be used for simple and reproducible model inference. - - type: textarea - id: version - attributes: - label: Name and Version - description: Which version of our software are you running? (use `--version` to get a version string) - placeholder: | - $./llama-cli --version - version: 2999 (42b4109e) - built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu - validations: - required: true - - type: dropdown - id: operating-system - attributes: - label: Operating systems - description: Which operating systems do you know to be affected? - multiple: true - options: - - Linux - - Mac - - Windows - - BSD - - Other? (Please let us know in description) - validations: - required: true - - type: dropdown - id: backends - attributes: - label: GGML backends - description: Which GGML backends do you know to be affected? - options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan] - multiple: true - validations: - required: true - - type: textarea - id: hardware - attributes: - label: Hardware - description: Which CPUs/GPUs are you using? - placeholder: > - e.g. Ryzen 5950X + 2x RTX 4090 - validations: - required: true - - type: textarea - id: model - attributes: - label: Models - description: > - Which model(s) at which quantization were you using when encountering the bug? - If you downloaded a GGUF file off of Huggingface, please provide a link. - placeholder: > - e.g. Meta LLaMA 3.1 Instruct 8b q4_K_M - validations: - required: false - - type: textarea - id: info - attributes: - label: Problem description & steps to reproduce - description: > - Please give us a summary of the problem and tell us how to reproduce it. - If you can narrow down the bug to specific hardware, compile flags, or command line arguments, - that information would be very much appreciated by us. - placeholder: > - e.g. when I run llama-cli with -ngl 99 I get garbled outputs. - When I use -ngl 0 it works correctly. - Here are the exact commands that I used: ... 
- validations: - required: true - - type: textarea - id: first_bad_commit - attributes: - label: First Bad Commit - description: > - If the bug was not present on an earlier version: when did it start appearing? - If possible, please do a git bisect and identify the exact commit that introduced the bug. - validations: - required: false - - type: textarea - id: logs - attributes: - label: Relevant log output - description: > - Please copy and paste any relevant log output, including the command that you entered and any generated text. - This will be automatically formatted into code, so no need for backticks. - render: shell - validations: - required: true diff --git a/.github/ISSUE_TEMPLATE/019-bug-misc.yml b/.github/ISSUE_TEMPLATE/019-bug-misc.yml deleted file mode 100644 index 1904e31fdc436..0000000000000 --- a/.github/ISSUE_TEMPLATE/019-bug-misc.yml +++ /dev/null @@ -1,91 +0,0 @@ -name: Bug (misc.) -description: Something is not working the way it should (and it's not covered by any of the above cases). -title: "Misc. bug: " -labels: ["bug-unconfirmed"] -body: - - type: markdown - attributes: - value: > - Thanks for taking the time to fill out this bug report! - This issue template is intended for miscellaneous bugs that don't fit into any other category. - If you encountered the issue while using an external UI (e.g. ollama), - please reproduce your issue using one of the examples/binaries in this repository. - - type: textarea - id: version - attributes: - label: Name and Version - description: Which version of our software is affected? (You can use `--version` to get a version string.) - placeholder: | - $./llama-cli --version - version: 2999 (42b4109e) - built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu - validations: - required: true - - type: dropdown - id: operating-system - attributes: - label: Operating systems - description: Which operating systems do you know to be affected? - multiple: true - options: - - Linux - - Mac - - Windows - - BSD - - Other? (Please let us know in description) - validations: - required: false - - type: dropdown - id: module - attributes: - label: Which llama.cpp modules do you know to be affected? - multiple: true - options: - - Documentation/Github - - libllama (core library) - - llama-cli - - llama-server - - llama-bench - - llama-quantize - - Python/Bash scripts - - Test code - - Other (Please specify in the next section) - validations: - required: false - - type: textarea - id: command - attributes: - label: Command line - description: > - Please provide the exact commands you entered, if applicable. For example: `llama-server -m ... -c ...`, `llama-cli -m ...`, etc. - This will be automatically formatted into code, so no need for backticks. - render: shell - validations: - required: false - - type: textarea - id: info - attributes: - label: Problem description & steps to reproduce - description: > - Please give us a summary of the problem and tell us how to reproduce it (if applicable). - validations: - required: true - - type: textarea - id: first_bad_commit - attributes: - label: First Bad Commit - description: > - If the bug was not present on an earlier version and it's not trivial to track down: when did it start appearing? - If possible, please do a git bisect and identify the exact commit that introduced the bug. 
- validations: - required: false - - type: textarea - id: logs - attributes: - label: Relevant log output - description: > - If applicable, please copy and paste any relevant log output, including any generated text. - This will be automatically formatted into code, so no need for backticks. - render: shell - validations: - required: false diff --git a/.github/ISSUE_TEMPLATE/020-enhancement.yml b/.github/ISSUE_TEMPLATE/020-enhancement.yml deleted file mode 100644 index cee1446f5a097..0000000000000 --- a/.github/ISSUE_TEMPLATE/020-enhancement.yml +++ /dev/null @@ -1,51 +0,0 @@ -name: Enhancement -description: Used to request enhancements for llama.cpp. -title: "Feature Request: " -labels: ["enhancement"] -body: - - type: markdown - attributes: - value: | - [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggml-org/llama.cpp/discussions/categories/ideas) - - - type: checkboxes - id: prerequisites - attributes: - label: Prerequisites - description: Please confirm the following before submitting your enhancement request. - options: - - label: I am running the latest code. Mention the version if possible as well. - required: true - - label: I carefully followed the [README.md](https://github.com/ggml-org/llama.cpp/blob/master/README.md). - required: true - - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed). - required: true - - label: I reviewed the [Discussions](https://github.com/ggml-org/llama.cpp/discussions), and have a new and useful enhancement to share. - required: true - - - type: textarea - id: feature-description - attributes: - label: Feature Description - description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement. - placeholder: Detailed description of the enhancement - validations: - required: true - - - type: textarea - id: motivation - attributes: - label: Motivation - description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users. - placeholder: Explanation of why this feature is needed and its benefits - validations: - required: true - - - type: textarea - id: possible-implementation - attributes: - label: Possible Implementation - description: If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better. - placeholder: Detailed description of potential implementation - validations: - required: false diff --git a/.github/ISSUE_TEMPLATE/030-research.yml b/.github/ISSUE_TEMPLATE/030-research.yml deleted file mode 100644 index e774550d5908c..0000000000000 --- a/.github/ISSUE_TEMPLATE/030-research.yml +++ /dev/null @@ -1,52 +0,0 @@ -name: Research -description: Track new technical research area. 
-title: "Research: " -labels: ["research 🔬"] -body: - - type: markdown - attributes: - value: | - Don't forget to check for any [duplicate research issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22) - - - type: checkboxes - id: research-stage - attributes: - label: Research Stage - description: Track general state of this research ticket - options: - - label: Background Research (Let's try to avoid reinventing the wheel) - - label: Hypothesis Formed (How do you think this will work and it's effect?) - - label: Strategy / Implementation Forming - - label: Analysis of results - - label: Debrief / Documentation (So people in the future can learn from us) - - - type: textarea - id: background - attributes: - label: Previous existing literature and research - description: Whats the current state of the art and whats the motivation for this research? - - - type: textarea - id: hypothesis - attributes: - label: Hypothesis - description: How do you think this will work and it's effect? - - - type: textarea - id: implementation - attributes: - label: Implementation - description: Got an approach? e.g. a PR ready to go? - - - type: textarea - id: analysis - attributes: - label: Analysis - description: How does the proposed implementation behave? - - - type: textarea - id: logs - attributes: - label: Relevant log output - description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. - render: shell diff --git a/.github/ISSUE_TEMPLATE/040-refactor.yml b/.github/ISSUE_TEMPLATE/040-refactor.yml deleted file mode 100644 index 2fe94e26c6988..0000000000000 --- a/.github/ISSUE_TEMPLATE/040-refactor.yml +++ /dev/null @@ -1,28 +0,0 @@ -name: Refactor (Maintainers) -description: Used to track refactoring opportunities. -title: "Refactor: " -labels: ["refactor"] -body: - - type: markdown - attributes: - value: | - Don't forget to [check for existing refactor issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered. - Also you may want to check [Pull request refactor label as well](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too. - - - type: textarea - id: background-description - attributes: - label: Background Description - description: Please provide a detailed written description of the pain points you are trying to solve. - placeholder: Detailed description behind your motivation to request refactor - validations: - required: true - - - type: textarea - id: possible-approaches - attributes: - label: Possible Refactor Approaches - description: If you have some idea of possible approaches to solve this problem. You may want to make it a todo list. - placeholder: Your idea of possible refactoring opportunity/approaches - validations: - required: false diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml deleted file mode 100644 index 0d246533c9515..0000000000000 --- a/.github/ISSUE_TEMPLATE/config.yml +++ /dev/null @@ -1,11 +0,0 @@ -blank_issues_enabled: true -contact_links: - - name: Got an idea? - url: https://github.com/ggml-org/llama.cpp/discussions/categories/ideas - about: Pop it there. It may then become an enhancement ticket. - - name: Got a question? - url: https://github.com/ggml-org/llama.cpp/discussions/categories/q-a - about: Ask a question there! - - name: Want to contribute? 
- url: https://github.com/ggml-org/llama.cpp/wiki/contribute - about: Head to the contribution guide page of the wiki for areas you can help with diff --git a/.github/actions/windows-setup-curl/action.yml b/.github/actions/windows-setup-curl/action.yml deleted file mode 100644 index 5d76da3d79ac5..0000000000000 --- a/.github/actions/windows-setup-curl/action.yml +++ /dev/null @@ -1,25 +0,0 @@ -name: 'Windows - Setup CURL' -description: 'Composite action, to be reused in other workflow' -inputs: - curl_version: - description: 'CURL version' - required: false - default: '8.6.0_6' -outputs: - curl_path: - description: "Path to the downloaded libcurl" - value: ${{ steps.get_libcurl.outputs.curl_path }} - -runs: - using: "composite" - steps: - - name: libCURL - id: get_libcurl - shell: powershell - env: - CURL_VERSION: ${{ inputs.curl_version }} - run: | - curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip" - mkdir $env:RUNNER_TEMP/libcurl - tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl - echo "curl_path=$env:RUNNER_TEMP/libcurl" >> $env:GITHUB_OUTPUT diff --git a/.github/labeler.yml b/.github/labeler.yml deleted file mode 100644 index 1b47bc96885c4..0000000000000 --- a/.github/labeler.yml +++ /dev/null @@ -1,86 +0,0 @@ -# https://github.com/actions/labeler -Kompute: - - changed-files: - - any-glob-to-any-file: - - ggml/include/ggml-kompute.h - - ggml/src/ggml-kompute/** - - README-kompute.md -Apple Metal: - - changed-files: - - any-glob-to-any-file: - - ggml/include/ggml-metal.h - - ggml/src/ggml-metal/** - - README-metal.md -SYCL: - - changed-files: - - any-glob-to-any-file: - - ggml/include/ggml-sycl.h - - ggml/src/ggml-sycl/** - - docs/backend/SYCL.md - - examples/sycl/** -Nvidia GPU: - - changed-files: - - any-glob-to-any-file: - - ggml/include/ggml-cuda.h - - ggml/src/ggml-cuda/** -Vulkan: - - changed-files: - - any-glob-to-any-file: - - ggml/include/ggml-vulkan.h - - ggml/src/ggml-vulkan/** -documentation: - - changed-files: - - any-glob-to-any-file: - - docs/** - - media/** -testing: - - changed-files: - - any-glob-to-any-file: - - tests/** -build: - - changed-files: - - any-glob-to-any-file: - - cmake/** - - CMakeLists.txt - - CMakePresets.json -examples: - - changed-files: - - any-glob-to-any-file: examples/** -devops: - - changed-files: - - any-glob-to-any-file: - - .devops/** - - .github/** - - ci/** -python: - - changed-files: - - any-glob-to-any-file: - - "**/*.py" - - requirements/** - - gguf-py/** - - .flake8 -script: - - changed-files: - - any-glob-to-any-file: - - scripts/** -android: - - changed-files: - - any-glob-to-any-file: - - examples/llama.android/** -server: - - changed-files: - - any-glob-to-any-file: - - examples/server/** -ggml: - - changed-files: - - any-glob-to-any-file: - - ggml/** -nix: - - changed-files: - - any-glob-to-any-file: - - "**/*.nix" - - .github/workflows/nix-*.yml - - .devops/nix/nixpkgs-instances.nix -embedding: - - changed-files: - - any-glob-to-any-file: examples/embedding/ diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md deleted file mode 100644 index d0bdd73c4439c..0000000000000 --- a/.github/pull_request_template.md +++ /dev/null @@ -1 +0,0 @@ -*Make sure to read the [contributing guidelines](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR* diff --git a/.github/workflows/bench.yml.disabled b/.github/workflows/bench.yml.disabled deleted file mode 
100644 index 75d2714792891..0000000000000 --- a/.github/workflows/bench.yml.disabled +++ /dev/null @@ -1,304 +0,0 @@ -# TODO: there have been some issues with the workflow, so disabling for now -# https://github.com/ggml-org/llama.cpp/issues/7893 -# -# Benchmark -name: Benchmark - -on: - workflow_dispatch: - inputs: - gpu-series: - description: 'Azure GPU series to run with' - required: true - type: choice - options: - - Standard_NC4as_T4_v3 - - Standard_NC24ads_A100_v4 - - Standard_NC80adis_H100_v5 - sha: - description: 'Commit SHA1 to build' - required: false - type: string - duration: - description: 'Duration of the bench' - type: string - default: 10m - - push: - branches: - - master - paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp'] - pull_request_target: - types: [opened, synchronize, reopened] - paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp'] - schedule: - - cron: '04 2 * * *' - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }} - cancel-in-progress: true - -jobs: - bench-server-baseline: - runs-on: Standard_NC4as_T4_v3 - env: - RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it - N_USERS: 8 - DURATION: 10m - - strategy: - matrix: - model: [phi-2] - ftype: [q4_0, q8_0, f16] - include: - - model: phi-2 - ftype: q4_0 - pr_comment_enabled: "true" - - if: | - inputs.gpu-series == 'Standard_NC4as_T4_v3' - || github.event_name == 'pull_request_target' - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - - - name: Install python env - id: pipenv - run: | - cd examples/server/bench - python3 -m venv venv - source venv/bin/activate - pip install -r requirements.txt - - - name: Prometheus - id: install_prometheus - run: | - wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz - tar xzf prometheus*.tar.gz --strip-components=1 - ./prometheus --config.file=examples/server/bench/prometheus.yml & - while ! 
nc -z localhost 9090; do - sleep 0.1 - done - - - name: Set up Go - uses: actions/setup-go@v5 - with: - go-version: '1.21' - - - name: Install k6 and xk6-sse - id: k6_installation - run: | - cd examples/server/bench - go install go.k6.io/xk6/cmd/xk6@latest - xk6 build master \ - --with github.com/phymbert/xk6-sse - - - name: Build - id: cmake_build - run: | - set -eux - cmake -B build \ - -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DLLAMA_CUBLAS=ON \ - -DCUDAToolkit_ROOT=/usr/local/cuda \ - -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \ - -DCMAKE_CUDA_ARCHITECTURES=75 \ - -DLLAMA_FATAL_WARNINGS=OFF \ - -DLLAMA_ALL_WARNINGS=OFF \ - -DCMAKE_BUILD_TYPE=Release; - cmake --build build --config Release -j $(nproc) --target llama-server - - - name: Download the dataset - id: download_dataset - run: | - cd examples/server/bench - wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json - - - name: Server bench - id: server_bench - env: - HEAD_REF: ${{ github.head_ref || github.ref_name }} - run: | - set -eux - - cd examples/server/bench - source venv/bin/activate - python bench.py \ - --runner-label ${{ env.RUNNER_LABEL }} \ - --name ${{ github.job }} \ - --branch $HEAD_REF \ - --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \ - --scenario script.js \ - --duration ${{ github.event.inputs.duration || env.DURATION }} \ - --hf-repo ggml-org/models \ - --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \ - --model-path-prefix /models \ - --parallel ${{ env.N_USERS }} \ - -ngl 33 \ - --batch-size 2048 \ - --ubatch-size 256 \ - --ctx-size 16384 \ - --n-prompts 1000 \ - --max-prompt-tokens 1024 \ - --max-tokens 2048 - - cat results.github.env >> $GITHUB_ENV - - # Remove dataset as we do not want it in the artefact - rm ShareGPT_V3_unfiltered_cleaned_split.json - - - uses: actions/upload-artifact@v4 - with: - name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }} - compression-level: 9 - path: | - examples/server/bench/*.jpg - examples/server/bench/*.json - examples/server/bench/*.log - - - name: Commit status - uses: Sibz/github-status-action@v1 - with: - authToken: ${{secrets.GITHUB_TOKEN}} - sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }} - context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }} - description: | - ${{ env.BENCH_RESULTS }} - state: 'success' - - - name: Upload benchmark images - uses: devicons/public-upload-to-imgur@v2.2.2 - continue-on-error: true # Important as it looks unstable: 503 - id: imgur_step - with: - client_id: ${{secrets.IMGUR_CLIENT_ID}} - path: | - examples/server/bench/prompt_tokens_seconds.jpg - examples/server/bench/predicted_tokens_seconds.jpg - examples/server/bench/kv_cache_usage_ratio.jpg - examples/server/bench/requests_processing.jpg - - - name: Extract mermaid - id: set_mermaid - run: | - set -eux - - cd examples/server/bench - PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid) - echo "PROMPT_TOKENS_SECONDS<> $GITHUB_ENV - echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV - echo "EOF" >> $GITHUB_ENV - - PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid) - echo "PREDICTED_TOKENS_SECONDS<> $GITHUB_ENV - echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV - echo "EOF" >> $GITHUB_ENV - - KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid) - echo "KV_CACHE_USAGE_RATIO<> 
$GITHUB_ENV - echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV - echo "EOF" >> $GITHUB_ENV - - REQUESTS_PROCESSING=$(cat requests_processing.mermaid) - echo "REQUESTS_PROCESSING<> $GITHUB_ENV - echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV - echo "EOF" >> $GITHUB_ENV - - - name: Extract image url - id: extract_image_url - continue-on-error: true - run: | - set -eux - - echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV - echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV - echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV - echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV - - - name: Comment PR - uses: mshick/add-pr-comment@v2 - id: comment_pr - if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }} - with: - message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }} - message: | -
-            <p align="center">
-
-            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
-
-            </p>
-
-            <details>
-
-            <summary>Expand details for performance related PR only</summary>
-
-            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
-            - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
-            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
-            - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
-            - ${{ env.BENCH_GRAPH_XLABEL }}
-
-            <p align="center">
-
-            <img src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />
-
-            <details>
-
-            <summary>More</summary>
-
-            ```mermaid
-            ${{ env.PROMPT_TOKENS_SECONDS }}
-            ```
-
-            </details>
-
-            <img src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds" />
-
-            <details>
-            <summary>More</summary>
-
-            ```mermaid
-            ${{ env.PREDICTED_TOKENS_SECONDS }}
-            ```
-
-            </details>
-
-            </p>
-
-            <details>
-
-            <summary>Details</summary>
-
-            <p align="center">
-
-            <img src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />
-
-            <details>
-            <summary>More</summary>
-
-            ```mermaid
-            ${{ env.KV_CACHE_USAGE_RATIO }}
-            ```
-
-            </details>
-
-            <img src="${{ env.IMAGE_3 }}" alt="requests_processing" />
-
-            <details>
-            <summary>More</summary>
-
-            ```mermaid
-            ${{ env.REQUESTS_PROCESSING }}
-            ```
-
-            </details>
-
-            </p>
-            </details>
diff --git a/.github/workflows/build-linux-cross.yml b/.github/workflows/build-linux-cross.yml deleted file mode 100644 index e8639913ea3a6..0000000000000 --- a/.github/workflows/build-linux-cross.yml +++ /dev/null @@ -1,124 +0,0 @@ -name: Build on Linux using cross-compiler -on: - workflow_dispatch: - workflow_call: - -jobs: - ubuntu-latest-riscv64-cpu-cross: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - - name: Setup Riscv - run: | - sudo dpkg --add-architecture riscv64 - sudo sed -i 's|http://azure.archive.ubuntu.com/ubuntu|http://ports.ubuntu.com/ubuntu-ports|g' \ - /etc/apt/sources.list /etc/apt/apt-mirrors.txt - sudo apt-get clean - sudo apt-get update - sudo apt-get install -y --no-install-recommends \ - build-essential \ - gcc-14-riscv64-linux-gnu \ - g++-14-riscv64-linux-gnu \ - libcurl4-openssl-dev:riscv64 - - - name: Build - run: | - cmake -B build -DCMAKE_BUILD_TYPE=Release \ - -DGGML_OPENMP=OFF \ - -DLLAMA_BUILD_EXAMPLES=ON \ - -DLLAMA_BUILD_TESTS=OFF \ - -DCMAKE_SYSTEM_NAME=Linux \ - -DCMAKE_SYSTEM_PROCESSOR=riscv64 \ - -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \ - -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \ - -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ - -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \ - -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ - -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ - -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH - - cmake --build build --config Release -j $(nproc) - - ubuntu-latest-riscv64-vulkan-cross: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Setup Riscv - run: | - sudo dpkg --add-architecture riscv64 - sudo sed -i 's|http://azure.archive.ubuntu.com/ubuntu|http://ports.ubuntu.com/ubuntu-ports|g' \ - /etc/apt/sources.list /etc/apt/apt-mirrors.txt - sudo apt-get clean - sudo apt-get update - sudo apt-get install -y --no-install-recommends \ - build-essential \ - glslc \ - gcc-14-riscv64-linux-gnu \ - g++-14-riscv64-linux-gnu \ - libvulkan-dev:riscv64 \ - libcurl4-openssl-dev:riscv64 - - - name: Build - run: | - cmake -B build -DCMAKE_BUILD_TYPE=Release \ - -DGGML_VULKAN=ON \ - -DGGML_OPENMP=OFF \ - -DLLAMA_BUILD_EXAMPLES=ON \ - -DLLAMA_BUILD_TESTS=OFF \ - -DCMAKE_SYSTEM_NAME=Linux \ - -DCMAKE_SYSTEM_PROCESSOR=riscv64 \ - -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \ - -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \ - -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ - -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \ - -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ - -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ - -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH - - cmake --build build --config Release -j $(nproc) - - ubuntu-latest-arm64-vulkan-cross: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Setup Arm64 - run: | - sudo dpkg --add-architecture arm64 - sudo sed -i 's|http://azure.archive.ubuntu.com/ubuntu|http://ports.ubuntu.com/ubuntu-ports|g' \ - /etc/apt/sources.list /etc/apt/apt-mirrors.txt - sudo apt-get clean - sudo apt-get update - sudo apt-get install -y --no-install-recommends \ - build-essential \ - glslc \ - crossbuild-essential-arm64 \ - libvulkan-dev:arm64 \ - libcurl4-openssl-dev:arm64 - - - name: Build - run: | - cmake -B build -DCMAKE_BUILD_TYPE=Release \ - -DGGML_VULKAN=ON \ - -DGGML_OPENMP=OFF \ - -DLLAMA_BUILD_EXAMPLES=ON \ - -DLLAMA_BUILD_TESTS=OFF \ - -DCMAKE_SYSTEM_NAME=Linux \ - -DCMAKE_SYSTEM_PROCESSOR=aarch64 \ - -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc \ - -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ \ - 
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \ - -DCMAKE_FIND_ROOT_PATH=/usr/lib/aarch64-linux-gnu \ - -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ - -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ - -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH - - cmake --build build --config Release -j $(nproc) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml deleted file mode 100644 index bcfcf08ac30b6..0000000000000 --- a/.github/workflows/build.yml +++ /dev/null @@ -1,1797 +0,0 @@ -name: CI - -on: - workflow_dispatch: # allows manual triggering - inputs: - create_release: - description: 'Create new release' - required: true - type: boolean - push: - branches: - - master - paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp'] - pull_request: - types: [opened, synchronize, reopened] - paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp'] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -# Fine-grant permission -# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token -permissions: - contents: write # for creating release - -env: - BRANCH_NAME: ${{ github.head_ref || github.ref_name }} - GGML_NLOOP: 3 - GGML_N_THREADS: 1 - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 - -jobs: - macOS-latest-cmake-arm64: - runs-on: macos-14 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: macOS-latest-cmake-arm64 - evict-old-files: 1d - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - brew install curl - - - name: Build - id: cmake_build - run: | - sysctl -a - cmake -B build \ - -DCMAKE_BUILD_RPATH="@loader_path" \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DGGML_RPC=ON - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L 'main|curl' --verbose --timeout 900 - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Pack artifacts - id: pack_artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - run: | - cp LICENSE ./build/bin/ - zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/* - - - name: Upload artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip - name: llama-bin-macos-arm64.zip - - macOS-latest-cmake-x64: - runs-on: macos-13 - - steps: - - name: Clone - 
id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: macOS-latest-cmake-x64 - evict-old-files: 1d - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - brew install curl - - - name: Build - id: cmake_build - run: | - sysctl -a - # Metal is disabled due to intermittent failures with Github runners not having a GPU: - # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 - cmake -B build \ - -DCMAKE_BUILD_RPATH="@loader_path" \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DGGML_METAL=OFF \ - -DGGML_RPC=ON - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose --timeout 900 - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Pack artifacts - id: pack_artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - run: | - cp LICENSE ./build/bin/ - zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/* - - - name: Upload artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip - name: llama-bin-macos-x64.zip - - ubuntu-cpu-cmake: - strategy: - matrix: - include: - - build: 'x64' - os: ubuntu-22.04 - - build: 'arm64' - os: ubuntu-22.04-arm - - runs-on: ${{ matrix.os }} - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-cpu-cmake - evict-old-files: 1d - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev - - - name: Build - id: cmake_build - run: | - cmake -B build \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DGGML_RPC=ON - cmake --build build --config Release -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L 'main|curl' --verbose --timeout 900 - - - name: Test llama2c conversion - id: llama2c_test - run: | - cd build - echo "Fetch tokenizer" - wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin - echo "Fetch llama2c model" - wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin - ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf - ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256 - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Pack artifacts - 
id: pack_artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - run: | - cp LICENSE ./build/bin/ - zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/* - - - name: Upload artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip - name: llama-bin-ubuntu-${{ matrix.build }}.zip - - ubuntu-latest-cmake-sanitizer: - runs-on: ubuntu-latest - - continue-on-error: true - - strategy: - matrix: - sanitizer: [ADDRESS, THREAD, UNDEFINED] - build_type: [Debug] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-latest-cmake-sanitizer-${{ matrix.sanitizer }} - evict-old-files: 1d - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev - - - name: Build - id: cmake_build - if: ${{ matrix.sanitizer != 'THREAD' }} - run: | - cmake -B build \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) - - - name: Build (no OpenMP) - id: cmake_build_no_openmp - if: ${{ matrix.sanitizer == 'THREAD' }} - run: | - cmake -B build \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DGGML_OPENMP=OFF - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose --timeout 900 - - ubuntu-latest-llguidance: - runs-on: ubuntu-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev - - - name: Build - id: cmake_build - run: | - mkdir build - cd build - cmake .. \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DLLAMA_LLGUIDANCE=ON - cmake --build . 
--config Release -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose --timeout 900 - - ubuntu-latest-cmake-rpc: - runs-on: ubuntu-latest - - continue-on-error: true - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-latest-cmake-rpc - evict-old-files: 1d - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential libcurl4-openssl-dev - - - name: Build - id: cmake_build - run: | - cmake -B build \ - -DGGML_RPC=ON - cmake --build build --config Release -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose - - ubuntu-22-cmake-vulkan: - runs-on: ubuntu-22.04 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-22-cmake-vulkan - evict-old-files: 1d - - - name: Dependencies - id: depends - run: | - wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add - - sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list - sudo apt-get update -y - sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev - - - name: Build - id: cmake_build - run: | - cmake -B build \ - -DGGML_VULKAN=ON - cmake --build build --config Release -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - # This is using llvmpipe and runs slower than other backends - ctest -L main --verbose --timeout 2700 - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Pack artifacts - id: pack_artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - run: | - cp LICENSE ./build/bin/ - zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/* - - - name: Upload artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip - name: llama-bin-ubuntu-vulkan-x64.zip - - ubuntu-22-cmake-hip: - runs-on: ubuntu-22.04 - container: rocm/dev-ubuntu-22.04:6.0.2 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libcurl4-openssl-dev - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-22-cmake-hip - evict-old-files: 1d - - - name: Build with native CMake HIP support - id: cmake_build - run: | - cmake -B build -S . \ - -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \ - -DGGML_HIP_ROCWMMA_FATTN=ON \ - -DGGML_HIP=ON - cmake --build build --config Release -j $(nproc) - - - name: Build with legacy HIP support - id: cmake_build_legacy_hip - run: | - cmake -B build2 -S . 
\ - -DCMAKE_C_COMPILER=hipcc \ - -DCMAKE_CXX_COMPILER=hipcc \ - -DGGML_HIP_ROCWMMA_FATTN=ON \ - -DGGML_HIP=ON - cmake --build build2 --config Release -j $(nproc) - - ubuntu-22-cmake-musa: - runs-on: ubuntu-22.04 - container: mthreads/musa:rc3.1.1-devel-ubuntu22.04 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - run: | - apt-get update - apt-get install -y build-essential git cmake libcurl4-openssl-dev - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-22-cmake-musa - evict-old-files: 1d - - - name: Build with native CMake MUSA support - id: cmake_build - run: | - cmake -B build -S . \ - -DGGML_MUSA=ON - cmake --build build --config Release -j $(nproc) - - ubuntu-22-cmake-sycl: - runs-on: ubuntu-22.04 - - continue-on-error: true - - steps: - - uses: actions/checkout@v4 - - - name: add oneAPI to apt - shell: bash - run: | - cd /tmp - wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main" - - - name: install oneAPI dpcpp compiler - shell: bash - run: | - sudo apt update - sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev - - - name: install oneAPI MKL library - shell: bash - run: | - sudo apt install intel-oneapi-mkl-devel - - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-22-cmake-sycl - evict-old-files: 1d - - - name: Build - id: cmake_build - run: | - source /opt/intel/oneapi/setvars.sh - cmake -B build \ - -DGGML_SYCL=ON \ - -DCMAKE_C_COMPILER=icx \ - -DCMAKE_CXX_COMPILER=icpx - cmake --build build --config Release -j $(nproc) - - ubuntu-22-cmake-sycl-fp16: - runs-on: ubuntu-22.04 - - continue-on-error: true - - steps: - - uses: actions/checkout@v4 - - - name: add oneAPI to apt - shell: bash - run: | - cd /tmp - wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main" - - - name: install oneAPI dpcpp compiler - shell: bash - run: | - sudo apt update - sudo apt install intel-oneapi-compiler-dpcpp-cpp libcurl4-openssl-dev - - - name: install oneAPI MKL library - shell: bash - run: | - sudo apt install intel-oneapi-mkl-devel - - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-22-cmake-sycl-fp16 - evict-old-files: 1d - - - name: Build - id: cmake_build - run: | - source /opt/intel/oneapi/setvars.sh - cmake -B build \ - -DGGML_SYCL=ON \ - -DCMAKE_C_COMPILER=icx \ - -DCMAKE_CXX_COMPILER=icpx \ - -DGGML_SYCL_F16=ON - cmake --build build --config Release -j $(nproc) - - build-linux-cross: - uses: ./.github/workflows/build-linux-cross.yml - - macOS-latest-cmake-ios: - runs-on: macos-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: macOS-latest-cmake-ios - evict-old-files: 1d - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - name: Build - id: cmake_build - run: | - sysctl -a - cmake -B build -G Xcode \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - 
-DLLAMA_BUILD_COMMON=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_SYSTEM_NAME=iOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO - - macOS-latest-cmake-tvos: - runs-on: macos-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: macOS-latest-cmake-tvos - evict-old-files: 1d - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - name: Build - id: cmake_build - run: | - sysctl -a - cmake -B build -G Xcode \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_BUILD_COMMON=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_SYSTEM_NAME=tvOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO - - macOS-latest-cmake-visionos: - runs-on: macos-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - name: Build - id: cmake_build - run: | - sysctl -a - cmake -B build -G Xcode \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_BUILD_COMMON=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_SYSTEM_NAME=visionOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=1.0 \ - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO - - macOS-latest-swift: - runs-on: macos-latest - - strategy: - matrix: - destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS'] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: macOS-latest-swift - evict-old-files: 1d - - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - - - name: Build llama.cpp with CMake - id: cmake_build - run: | - sysctl -a - cmake -B build -G Xcode \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_CURL=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - - - name: xcodebuild for swift package - id: xcodebuild - run: | - ./build-xcframework.sh - - windows-msys2: - runs-on: windows-latest - - strategy: - fail-fast: false - matrix: - include: - - { sys: UCRT64, env: ucrt-x86_64, build: Release } - - { sys: CLANG64, env: clang-x86_64, build: Release } - - steps: - - name: Clone - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: windows-msys2 - variant: sccache - evict-old-files: 1d - - - name: Setup ${{ matrix.sys }} - uses: msys2/setup-msys2@v2 - with: - update: true - msystem: ${{matrix.sys}} - install: >- - base-devel - git - mingw-w64-${{matrix.env}}-toolchain - mingw-w64-${{matrix.env}}-cmake - mingw-w64-${{matrix.env}}-openblas - - - name: Build using CMake - shell: msys2 {0} - run: | - cmake -B build - cmake --build build --config ${{ matrix.build }} -j $(nproc) - - - name: 
Clean after building using CMake - shell: msys2 {0} - run: | - rm -rf build - - - name: Build using CMake w/ OpenBLAS - shell: msys2 {0} - run: | - cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS - cmake --build build --config ${{ matrix.build }} -j $(nproc) - - windows-latest-cmake: - runs-on: windows-latest - - env: - OPENBLAS_VERSION: 0.3.23 - SDE_VERSION: 9.33.0-2024-01-07 - VULKAN_VERSION: 1.4.309.0 - - strategy: - matrix: - include: - - build: 'noavx-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF' - - build: 'avx2-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON' - - build: 'avx-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF' - - build: 'avx512-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON' - - build: 'openblas-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' - - build: 'kompute-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON' - - build: 'vulkan-x64' - defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON' - - build: 'llvm-arm64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON' - - build: 'msvc-arm64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON' - - build: 'llvm-arm64-opencl-adreno' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON' - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: windows-latest-cmake-${{ matrix.build }} - variant: sccache - evict-old-files: 1d - - - name: Clone Kompute submodule - id: clone_kompute - if: ${{ matrix.build == 'kompute-x64' }} - run: | - git submodule update --init ggml/src/ggml-kompute/kompute - - - name: Download OpenBLAS - id: get_openblas - if: ${{ matrix.build == 'openblas-x64' }} - run: | - curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip" - curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE" - mkdir $env:RUNNER_TEMP/openblas - tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas - $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath) - $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim())) - $lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe') - & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll - - - name: Install Vulkan SDK - id: get_vulkan - if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }} - run: | - curl.exe -o 
$env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe" - & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install - Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}" - Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin" - - - name: Install Ninja - id: install_ninja - run: | - choco install ninja - - - name: Install OpenCL Headers and Libs - id: install_opencl - if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }} - run: | - git clone https://github.com/KhronosGroup/OpenCL-Headers - cd OpenCL-Headers - cmake -B build ` - -DBUILD_TESTING=OFF ` - -DOPENCL_HEADERS_BUILD_TESTING=OFF ` - -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF ` - -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" - cmake --build build --target install - git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader - cd OpenCL-ICD-Loader - cmake -B build-arm64-release ` - -A arm64 ` - -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" ` - -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" - cmake --build build-arm64-release --target install --config release - - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - - - name: Build - id: cmake_build - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} - run: | - cmake -S . -B build ${{ matrix.defines }} ` - -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" - cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} - - - name: Add libopenblas.dll - id: add_libopenblas_dll - if: ${{ matrix.build == 'openblas-x64' }} - run: | - cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll - cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt - - - name: Check AVX512F support - id: check_avx512f - if: ${{ matrix.build == 'avx512-x64' }} - continue-on-error: true - run: | - cd build - $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath) - $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim())) - $cl = $(join-path $msvc 'bin\Hostx64\x64\cl.exe') - echo 'int main(void){unsigned int a[4];__cpuid(a,7);return !(a[1]&65536);}' >> avx512f.c - & $cl /O2 /GS- /kernel avx512f.c /link /nodefaultlib /entry:main - .\avx512f.exe && echo "AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo "AVX512F: NO" - - - name: Test - id: cmake_test - # not all machines have native AVX-512 - if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }} - run: | - cd build - ctest -L main -C Release --verbose --timeout 900 - - - name: Test (Intel SDE) - id: cmake_test_sde - if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation - run: | - curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz" - # for some weird reason windows tar doesn't like sde tar.xz - 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz - 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar - 
$sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe) - cd build - $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1 - & $sde -future -- ctest -L main -C Release --verbose --timeout 900 - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Pack artifacts - id: pack_artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} - run: | - Copy-Item $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\* - - - name: Upload artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip - name: llama-bin-win-${{ matrix.build }}.zip - - ubuntu-latest-cmake-cuda: - runs-on: ubuntu-latest - container: nvidia/cuda:12.6.2-devel-ubuntu24.04 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Install dependencies - env: - DEBIAN_FRONTEND: noninteractive - run: | - apt update - apt install -y cmake build-essential ninja-build libgomp1 git libcurl4-openssl-dev - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ubuntu-latest-cmake-cuda - evict-old-files: 1d - - - name: Build with CMake - run: | - cmake -S . 
-B build -G Ninja \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_CUDA_ARCHITECTURES=89-real \ - -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \ - -DLLAMA_FATAL_WARNINGS=ON \ - -DGGML_NATIVE=OFF \ - -DGGML_CUDA=ON - cmake --build build - - windows-2019-cmake-cuda: - runs-on: windows-2019 - - strategy: - matrix: - cuda: ['12.4', '11.7'] - build: ['cuda'] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Install ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ${{ github.job }}-${{ matrix.cuda }}-${{ matrix.build }} - variant: sccache - evict-old-files: 1d - - - name: Install Cuda Toolkit 11.7 - if: ${{ matrix.cuda == '11.7' }} - run: | - mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" - choco install unzip -y - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip" - unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y - echo "C:\Program 
Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 - echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 - - - name: Install Cuda Toolkit 12.4 - if: ${{ matrix.cuda == '12.4' }} - run: | - mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" - choco install unzip -y - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip" - curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip" - unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU 
Computing Toolkit\CUDA\v12.4" /E /I /H /Y - xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y - echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 - echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 - - - name: Install Ninja - id: install_ninja - run: | - choco install ninja - - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - - - name: Build - id: cmake_build - shell: cmd - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} - run: | - call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" - cmake -S . -B build -G "Ninja Multi-Config" ^ - -DLLAMA_BUILD_SERVER=ON ^ - -DGGML_NATIVE=OFF ^ - -DGGML_CUDA=ON ^ - -DGGML_RPC=ON ^ - -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" - set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1 - cmake --build build --config Release -j %NINJA_JOBS% -t ggml - cmake --build build --config Release - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Pack artifacts - id: pack_artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} - run: | - cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\* - - - name: Upload artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip - name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip - - - name: Copy and pack Cuda runtime - if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - run: | - echo "Cuda install location: ${{ env.CUDA_PATH }}" - $dst='.\build\bin\cudart\' - robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll - robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll - 7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\* - - - name: Upload Cuda runtime - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v4 - with: - path: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip - name: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip - - windows-latest-cmake-sycl: - runs-on: 
windows-latest - - defaults: - run: - shell: bash - - env: - WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe - WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel - ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI" - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: windows-latest-cmake-sycl - variant: sccache - evict-old-files: 1d - - - name: Install - run: | - scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL - - # TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args - - - name: Build - id: cmake_build - run: examples/sycl/win-build-sycl.bat - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Build the release package - id: pack_artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - run: | - echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin" - - cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin - - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin - - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin - - cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin - cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin - - echo "cp oneAPI running time dll files to ./build/bin done" - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/* - - - name: Upload the release package - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip - name: llama-bin-win-sycl-x64.zip - - windows-latest-cmake-hip: - if: ${{ github.event.inputs.create_release != 'true' }} - runs-on: windows-latest - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - - - name: Clone rocWMMA repository - id: clone_rocwmma - run: | - git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1 - - - name: Install - id: depends - run: | - $ErrorActionPreference = "Stop" - write-host "Downloading AMD HIP SDK Installer" - Invoke-WebRequest -Uri 
"https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" - write-host "Installing AMD HIP SDK" - Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait - write-host "Completed AMD HIP SDK installation" - - - name: Verify ROCm - id: verify - run: | - & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version - - - name: Install ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: ${{ github.job }} - evict-old-files: 1d - - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - - - name: Build - id: cmake_build - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} - run: | - $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) - $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" - cmake -G "Unix Makefiles" -B build -S . ` - -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" ` - -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" ` - -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" ` - -DCMAKE_BUILD_TYPE=Release ` - -DGGML_HIP=ON ` - -DGGML_HIP_ROCWMMA_FATTN=ON ` - -DGGML_RPC=ON ` - -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" - cmake --build build -j ${env:NUMBER_OF_PROCESSORS} - - # TODO: reuse windows-latest-cmake-hip instead of duplicating this job - windows-latest-cmake-hip-release: - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - runs-on: windows-latest - - strategy: - matrix: - gpu_target: [gfx1100, gfx1101, gfx1030] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Clone rocWMMA repository - id: clone_rocwmma - run: | - git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: windows-latest-cmake-hip-release - evict-old-files: 1d - - - name: Install - id: depends - run: | - $ErrorActionPreference = "Stop" - write-host "Downloading AMD HIP SDK Installer" - Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" - write-host "Installing AMD HIP SDK" - Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait - write-host "Completed AMD HIP SDK installation" - - - name: Verify ROCm - id: verify - run: | - & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version - - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - - - name: Build - id: cmake_build - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} - run: | - $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) - $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" - cmake -G "Unix Makefiles" -B build -S . 
` - -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" ` - -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" ` - -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" ` - -DCMAKE_BUILD_TYPE=Release ` - -DAMDGPU_TARGETS=${{ matrix.gpu_target }} ` - -DGGML_HIP_ROCWMMA_FATTN=ON ` - -DGGML_HIP=ON ` - -DGGML_RPC=ON ` - -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" - cmake --build build -j ${env:NUMBER_OF_PROCESSORS} - md "build\bin\rocblas\library\" - cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\" - cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\" - cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\" - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Pack artifacts - id: pack_artifacts - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} - run: | - cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\libcurl-x64.dll - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\* - - - name: Upload artifacts - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip - name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip - - ios-xcode-build: - runs-on: macos-latest - - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Build - id: cmake_build - run: | - sysctl -a - cmake -B build -G Xcode \ - -DGGML_METAL_USE_BF16=ON \ - -DGGML_METAL_EMBED_LIBRARY=ON \ - -DLLAMA_CURL=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF \ - -DLLAMA_BUILD_TESTS=OFF \ - -DLLAMA_BUILD_SERVER=OFF \ - -DCMAKE_SYSTEM_NAME=iOS \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO - - - name: xcodebuild for swift package - id: xcodebuild - run: | - ./build-xcframework.sh - - - name: Build Xcode project - run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Pack artifacts - id: pack_artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - run: | - zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework - - - name: Upload artifacts - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-xcframework.zip - name: llama-${{ 
steps.tag.outputs.name }}-xcframework - - android-build: - runs-on: ubuntu-latest - - steps: - - name: Clone - uses: actions/checkout@v4 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: android-build - evict-old-files: 1d - - - name: Set up JDK - uses: actions/setup-java@v3 - with: - java-version: 17 - distribution: zulu - - - name: Setup Android SDK - uses: android-actions/setup-android@v3 - with: - log-accepted-android-sdk-licenses: false - - - name: Build - run: | - cd examples/llama.android - - ./gradlew build --no-daemon - - release: - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - - runs-on: ubuntu-latest - - needs: - - ubuntu-cpu-cmake - - ubuntu-22-cmake-vulkan - - windows-latest-cmake - - windows-2019-cmake-cuda - - windows-latest-cmake-sycl - - windows-latest-cmake-hip-release - - macOS-latest-cmake-arm64 - - macOS-latest-cmake-x64 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: ccache - uses: hendrikmuhs/ccache-action@v1.2.16 - with: - key: release - evict-old-files: 1d - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then - echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT - else - SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') - echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT - fi - - - name: Download artifacts - id: download-artifact - uses: actions/download-artifact@v4 - with: - path: ./artifact - - - name: Move artifacts - id: move_artifacts - run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release - - - name: Create release - id: create_release - uses: ggml-org/action-create-release@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - tag_name: ${{ steps.tag.outputs.name }} - - - name: Upload release - id: upload_release - uses: actions/github-script@v3 - with: - github-token: ${{secrets.GITHUB_TOKEN}} - script: | - const path = require('path'); - const fs = require('fs'); - const release_id = '${{ steps.create_release.outputs.id }}'; - for (let file of await fs.readdirSync('./artifact/release')) { - if (path.extname(file) === '.zip') { - console.log('uploadReleaseAsset', file); - await github.repos.uploadReleaseAsset({ - owner: context.repo.owner, - repo: context.repo.repo, - release_id: release_id, - name: file, - data: await fs.readFileSync(`./artifact/release/${file}`) - }); - } - } - -# ubuntu-latest-gcc: -# runs-on: ubuntu-latest -# -# strategy: -# matrix: -# build: [Debug, Release] -# -# steps: -# - name: Clone -# uses: actions/checkout@v4 -# -# - name: Dependencies -# run: | -# sudo apt-get update -# sudo apt-get install build-essential -# sudo apt-get install cmake -# -# - name: Configure -# run: cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }} -# -# - name: Build -# run: | -# make -# -# ubuntu-latest-clang: -# runs-on: ubuntu-latest -# -# strategy: -# matrix: -# build: [Debug, Release] -# -# steps: -# - name: Clone -# uses: actions/checkout@v4 -# -# - name: Dependencies -# run: | -# sudo apt-get update -# sudo apt-get install build-essential -# sudo apt-get install cmake -# -# - name: Configure -# run: cmake . 
-DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang -# -# - name: Build -# run: | -# make -# -# ubuntu-latest-gcc-sanitized: -# runs-on: ubuntu-latest -# -# strategy: -# matrix: -# sanitizer: [ADDRESS, THREAD, UNDEFINED] -# -# steps: -# - name: Clone -# uses: actions/checkout@v4 -# -# - name: Dependencies -# run: | -# sudo apt-get update -# sudo apt-get install build-essential -# sudo apt-get install cmake -# -# - name: Configure -# run: cmake . -DCMAKE_BUILD_TYPE=Debug -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -# -# - name: Build -# run: | -# make -# -# windows: -# runs-on: windows-latest -# -# strategy: -# matrix: -# build: [Release] -# arch: [Win32, x64] -# include: -# - arch: Win32 -# s2arc: x86 -# - arch: x64 -# s2arc: x64 -# -# steps: -# - name: Clone -# uses: actions/checkout@v4 -# -# - name: Add msbuild to PATH -# uses: microsoft/setup-msbuild@v1 -# -# - name: Configure -# run: > -# cmake -S . -B ./build -A ${{ matrix.arch }} -# -DCMAKE_BUILD_TYPE=${{ matrix.build }} -# -# - name: Build -# run: | -# cd ./build -# msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }} -# -# - name: Upload binaries -# uses: actions/upload-artifact@v4 -# with: -# name: llama-bin-${{ matrix.arch }} -# path: build/bin/${{ matrix.build }} -# -# windows-blas: -# runs-on: windows-latest -# -# strategy: -# matrix: -# build: [Release] -# arch: [Win32, x64] -# blas: [ON] -# include: -# - arch: Win32 -# obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x86.zip -# s2arc: x86 -# - arch: x64 -# obzip: https://github.com/xianyi/OpenBLAS/releases/download/v0.3.21/OpenBLAS-0.3.21-x64.zip -# s2arc: x64 -# -# steps: -# - name: Clone -# uses: actions/checkout@v4 -# -# - name: Add msbuild to PATH -# uses: microsoft/setup-msbuild@v1 -# -# - name: Fetch OpenBLAS -# if: matrix.blas == 'ON' -# run: | -# C:/msys64/usr/bin/wget.exe -qO blas.zip ${{ matrix.obzip }} -# 7z x blas.zip -oblas -y -# copy blas/include/cblas.h . -# copy blas/include/openblas_config.h . -# echo "blasdir=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV -# -# - name: Configure -# run: > -# cmake -S . -B ./build -A ${{ matrix.arch }} -# -DCMAKE_BUILD_TYPE=${{ matrix.build }} -# -DLLAMA_SUPPORT_OPENBLAS=${{ matrix.blas }} -# -DCMAKE_LIBRARY_PATH="$env:blasdir/lib" -# -# - name: Build -# run: | -# cd ./build -# msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }} -# -# - name: Copy libopenblas.dll -# if: matrix.blas == 'ON' -# run: copy "$env:blasdir/bin/libopenblas.dll" build/bin/${{ matrix.build }} -# -# - name: Upload binaries -# if: matrix.blas == 'ON' -# uses: actions/upload-artifact@v4 -# with: -# name: llama-blas-bin-${{ matrix.arch }} -# path: build/bin/${{ matrix.build }} -# -# emscripten: -# runs-on: ubuntu-latest -# -# strategy: -# matrix: -# build: [Release] -# -# steps: -# - name: Clone -# uses: actions/checkout@v4 -# -# - name: Dependencies -# run: | -# wget -q https://github.com/emscripten-core/emsdk/archive/master.tar.gz -# tar -xvf master.tar.gz -# emsdk-master/emsdk update -# emsdk-master/emsdk install latest -# emsdk-master/emsdk activate latest -# -# - name: Configure -# run: echo "tmp" -# -# - name: Build -# run: | -# pushd emsdk-master -# source ./emsdk_env.sh -# popd -# emcmake cmake . 
-DCMAKE_BUILD_TYPE=${{ matrix.build }} -# make - - openEuler-latest-cmake-cann: - if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }} - defaults: - run: - shell: bash -el {0} - runs-on: ubuntu-24.04-arm - strategy: - matrix: - cann: - - '8.1.RC1.alpha001-910b-openeuler22.03-py3.10' - device: - - 'ascend910b3' - build: - - 'Release' - container: ascendai/cann:${{ matrix.cann }} - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Dependencies - run: | - yum update -y - yum install -y git gcc gcc-c++ make cmake libcurl-devel - - - name: Build - run: | - export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH} - - cmake -S . -B build \ - -DCMAKE_BUILD_TYPE=${{ matrix.build }} \ - -DGGML_CANN=on \ - -DSOC_TYPE=${{ matrix.device }} - cmake --build build -j $(nproc) diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml deleted file mode 100644 index 276a217d45005..0000000000000 --- a/.github/workflows/close-issue.yml +++ /dev/null @@ -1,28 +0,0 @@ -name: Close inactive issues -on: - schedule: - - cron: "42 0 * * *" - -# Fine-grant permission -# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token -permissions: - issues: write - -jobs: - close-issues: - runs-on: ubuntu-latest - permissions: - issues: write - pull-requests: write - steps: - - uses: actions/stale@v5 - with: - exempt-issue-labels: "refactor,help wanted,good first issue,research,bug,roadmap" - days-before-issue-stale: 30 - days-before-issue-close: 14 - stale-issue-label: "stale" - close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale." - days-before-pr-stale: -1 - days-before-pr-close: -1 - operations-per-run: 10000 - repo-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml deleted file mode 100644 index 9eba3f6a42b5e..0000000000000 --- a/.github/workflows/docker.yml +++ /dev/null @@ -1,175 +0,0 @@ -# This workflow uses actions that are not certified by GitHub. -# They are provided by a third-party and are governed by -# separate terms of service, privacy policy, and support -# documentation. - -# GitHub recommends pinning actions to a commit SHA. -# To get a newer version, you will need to update the SHA. -# You can also reference a tag or branch, but the action may change without warning. 
- -name: Publish Docker image - -on: - workflow_dispatch: # allows manual triggering - schedule: - # Rebuild daily rather than on every push because it is expensive - - cron: '12 4 * * *' - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -# Fine-grant permission -# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token -permissions: - packages: write - -jobs: - push_to_registry: - name: Push Docker image to Docker Hub - - runs-on: ubuntu-22.04 - env: - COMMIT_SHA: ${{ github.sha }} - strategy: - fail-fast: false - matrix: - config: - # Multi-stage build - - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false} - - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} - - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: true} - - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} - - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} - # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete - #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: true } - steps: - - name: Check out the repo - uses: actions/checkout@v4 - with: - fetch-depth: 0 # preserve git history, so we can determine the build number - - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - with: - image: tonistiigi/binfmt:qemu-v7.0.0-28 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Log in to Docker Hub - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Determine tag name - id: tag - shell: bash - run: | - BUILD_NUMBER="$(git rev-list --count HEAD)" - SHORT_HASH="$(git rev-parse --short=7 HEAD)" - REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case - REPO_NAME="${{ github.event.repository.name }}" - - # determine tag name postfix (build number, commit hash) - if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then - TAG_POSTFIX="-b${BUILD_NUMBER}" - else - SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-') - TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}" - fi - # list all tags possible - if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then - TYPE="" - else - TYPE="-${{ matrix.config.tag }}" - fi - PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:" - FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}" - LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}" - SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}" - echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT - echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT - echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT - echo "full_output_tags=$FULLTAGS" # print out for debugging - echo "light_output_tags=$LIGHTTAGS" # print out for debugging - echo 
"server_output_tags=$SERVERTAGS" # print out for debugging - env: - GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }} - GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}' - - - name: Free Disk Space (Ubuntu) - if: ${{ matrix.config.free_disk_space == true }} - uses: ggml-org/free-disk-space@v1.3.1 - with: - # this might remove tools that are actually needed, - # if set to "true" but frees about 6 GB - tool-cache: false - - # all of these default to true, but feel free to set to - # "false" if necessary for your workflow - android: true - dotnet: true - haskell: true - large-packages: true - docker-images: true - swap-storage: true - - - name: Build and push Full Docker image (tagged + versioned) - if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }} - uses: docker/build-push-action@v6 - with: - context: . - push: true - platforms: ${{ matrix.config.platforms }} - # tag list is generated from step above - tags: ${{ steps.tag.outputs.full_output_tags }} - file: ${{ matrix.config.dockerfile }} - target: full - provenance: false - # using github experimental cache - cache-from: type=gha - cache-to: type=gha,mode=max - # return to this if the experimental github cache is having issues - #cache-to: type=local,dest=/tmp/.buildx-cache - #cache-from: type=local,src=/tmp/.buildx-cache - - - name: Build and push Light Docker image (tagged + versioned) - if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }} - uses: docker/build-push-action@v6 - with: - context: . - push: true - platforms: ${{ matrix.config.platforms }} - # tag list is generated from step above - tags: ${{ steps.tag.outputs.light_output_tags }} - file: ${{ matrix.config.dockerfile }} - target: light - provenance: false - # using github experimental cache - cache-from: type=gha - cache-to: type=gha,mode=max - # return to this if the experimental github cache is having issues - #cache-to: type=local,dest=/tmp/.buildx-cache - #cache-from: type=local,src=/tmp/.buildx-cache - - - name: Build and push Server Docker image (tagged + versioned) - if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }} - uses: docker/build-push-action@v6 - with: - context: . 
- push: true - platforms: ${{ matrix.config.platforms }} - # tag list is generated from step above - tags: ${{ steps.tag.outputs.server_output_tags }} - file: ${{ matrix.config.dockerfile }} - target: server - provenance: false - # using github experimental cache - cache-from: type=gha - cache-to: type=gha,mode=max - # return to this if the experimental github cache is having issues - #cache-to: type=local,dest=/tmp/.buildx-cache - #cache-from: type=local,src=/tmp/.buildx-cache diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml deleted file mode 100644 index f02b7c2194bcf..0000000000000 --- a/.github/workflows/editorconfig.yml +++ /dev/null @@ -1,29 +0,0 @@ -name: EditorConfig Checker - -on: - workflow_dispatch: # allows manual triggering - inputs: - create_release: - description: 'Create new release' - required: true - type: boolean - push: - branches: - - master - pull_request: - branches: - - master - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -jobs: - editorconfig: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: editorconfig-checker/action-editorconfig-checker@v2 - with: - version: v3.0.3 - - run: editorconfig-checker diff --git a/.github/workflows/gguf-publish.yml b/.github/workflows/gguf-publish.yml deleted file mode 100644 index 3ca4d30581074..0000000000000 --- a/.github/workflows/gguf-publish.yml +++ /dev/null @@ -1,44 +0,0 @@ -# This workflow will upload a Python Package using Twine when a GGUF release is created -# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries - -# See `gguf-py/README.md` for how to make a release. - -# This workflow uses actions that are not certified by GitHub. -# They are provided by a third-party and are governed by -# separate terms of service, privacy policy, and support -# documentation. 
- -name: Upload Python Package - -on: - workflow_dispatch: - push: - # Pattern matched against refs/tags - tags: - - 'gguf-v*' # Push events to every version tag - - -jobs: - deploy: - - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.9.x' - - name: Install dependencies - run: | - cd gguf-py - python -m pip install poetry - poetry install - - - name: Build package - run: cd gguf-py && poetry build - - name: Publish package - uses: pypa/gh-action-pypi-publish@release/v1 - with: - password: ${{ secrets.PYPI_API_TOKEN }} - packages-dir: gguf-py/dist diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml deleted file mode 100644 index 0b0f300aa402a..0000000000000 --- a/.github/workflows/labeler.yml +++ /dev/null @@ -1,17 +0,0 @@ -name: "Pull Request Labeler" -on: -- pull_request_target - -jobs: - labeler: - permissions: - contents: read - pull-requests: write - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - repository: "ggml-org/llama.cpp" - - uses: actions/labeler@v5 - with: - configuration-path: '.github/labeler.yml' diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml deleted file mode 100644 index 46e80aecd0a0c..0000000000000 --- a/.github/workflows/python-check-requirements.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: Python check requirements.txt - -on: - push: - paths: - - '.github/workflows/python-check-requirements.yml' - - 'scripts/check-requirements.sh' - - 'convert*.py' - - '**/requirements*.txt' - pull_request: - paths: - - '.github/workflows/python-check-requirements.yml' - - 'scripts/check-requirements.sh' - - 'convert*.py' - - '**/requirements*.txt' - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -jobs: - python-check-requirements: - runs-on: ubuntu-latest - name: check-requirements - steps: - - name: Check out source repository - uses: actions/checkout@v4 - - name: Set up Python environment - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - name: Run check-requirements.sh script - run: bash scripts/check-requirements.sh diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml deleted file mode 100644 index ddfdf73b8fce2..0000000000000 --- a/.github/workflows/python-lint.yml +++ /dev/null @@ -1,30 +0,0 @@ -name: flake8 Lint - -on: - push: - branches: - - master - paths: ['.github/workflows/python-lint.yml', '**/*.py'] - pull_request: - types: [opened, synchronize, reopened] - paths: ['.github/workflows/python-lint.yml', '**/*.py'] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -jobs: - flake8-lint: - runs-on: ubuntu-latest - name: Lint - steps: - - name: Check out source repository - uses: actions/checkout@v4 - - name: Set up Python environment - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - name: flake8 Lint - uses: py-actions/flake8@v2 - with: - plugins: "flake8-no-print" diff --git a/.github/workflows/python-type-check.yml b/.github/workflows/python-type-check.yml deleted file mode 100644 index 373bb601020b2..0000000000000 --- a/.github/workflows/python-type-check.yml +++ /dev/null @@ -1,40 +0,0 @@ -name: Python Type-Check - -on: - push: - paths: - - '.github/workflows/python-type-check.yml' - - 'pyrightconfig.json' - - '**.py' - - 
'**/requirements*.txt' - pull_request: - paths: - - '.github/workflows/python-type-check.yml' - - 'pyrightconfig.json' - - '**.py' - - '**/requirements*.txt' - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} - cancel-in-progress: true - -jobs: - python-type-check: - runs-on: ubuntu-latest - name: pyright type-check - steps: - - name: Check out source repository - uses: actions/checkout@v4 - - name: Set up Python environment - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - name: Install Python dependencies - # TODO: use a venv - run: pip install -r requirements/requirements-all.txt - - name: Type-check with Pyright - uses: jakebailey/pyright-action@v2 - with: - version: 1.1.382 - level: warning - warnings: true diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml deleted file mode 100644 index 6c9b5132276fe..0000000000000 --- a/.github/workflows/server.yml +++ /dev/null @@ -1,237 +0,0 @@ -# Server build and tests -name: Server - -on: - workflow_dispatch: # allows manual triggering - inputs: - sha: - description: 'Commit SHA1 to build' - required: false - type: string - slow_tests: - description: 'Run slow tests' - required: true - type: boolean - push: - branches: - - master - paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*'] - pull_request: - types: [opened, synchronize, reopened] - paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*'] - -env: - LLAMA_LOG_COLORS: 1 - LLAMA_LOG_PREFIX: 1 - LLAMA_LOG_TIMESTAMPS: 1 - LLAMA_LOG_VERBOSITY: 10 - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - server: - runs-on: ubuntu-latest - - strategy: - matrix: - sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken - build_type: [RelWithDebInfo] - include: - - build_type: Release - sanitizer: "" - fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken - - steps: - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get -y install \ - build-essential \ - xxd \ - git \ - cmake \ - curl \ - wget \ - language-pack-en \ - libcurl4-openssl-dev - - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - - - name: Python setup - id: setup_python - uses: actions/setup-python@v5 - with: - python-version: '3.11' - - - name: Tests dependencies - id: test_dependencies - run: | - pip install -r examples/server/tests/requirements.txt - - # Setup nodejs (to be used for verifying bundled index.html) - - uses: actions/setup-node@v4 - with: - node-version: '22.11.0' - - - name: WebUI - Install dependencies - id: webui_lint - run: | - cd examples/server/webui - npm ci - - - name: WebUI - Check code format - id: webui_format - run: | - git config --global --add safe.directory $(realpath .) - cd examples/server/webui - git status - - npm run format - git status - modified_files="$(git status -s)" - echo "Modified files: ${modified_files}" - if [ -n "${modified_files}" ]; then - echo "Files do not follow coding style. 
To fix: npm run format" - echo "${modified_files}" - exit 1 - fi - - - name: Verify bundled index.html - id: verify_server_index_html - run: | - git config --global --add safe.directory $(realpath .) - cd examples/server/webui - git status - - npm run build - git status - modified_files="$(git status -s)" - echo "Modified files: ${modified_files}" - if [ -n "${modified_files}" ]; then - echo "Repository is dirty or server/webui is not built as expected" - echo "Hint: You may need to follow Web UI build guide in server/README.md" - echo "${modified_files}" - exit 1 - fi - - - name: Build (no OpenMP) - id: cmake_build_no_openmp - if: ${{ matrix.sanitizer == 'THREAD' }} - run: | - cmake -B build \ - -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ - -DGGML_OPENMP=OFF ; - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server - - - name: Build (sanitizers) - id: cmake_build_sanitizers - if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }} - run: | - cmake -B build \ - -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ; - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server - - - name: Build (sanitizers) - id: cmake_build - if: ${{ matrix.sanitizer == '' }} - run: | - cmake -B build \ - -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ; - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server - - - name: Tests - id: server_integration_tests - if: ${{ matrix.sanitizer == '' }} - env: - GITHUB_ACTIONS: "true" - run: | - cd examples/server/tests - ./tests.sh - - - name: Tests (sanitizers) - id: server_integration_tests_sanitizers - if: ${{ matrix.sanitizer != '' }} - run: | - cd examples/server/tests - LLAMA_SANITIZE=1 ./tests.sh - - - name: Slow tests - id: server_integration_tests_slow - if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} - run: | - cd examples/server/tests - SLOW_TESTS=1 ./tests.sh - - - server-windows: - runs-on: windows-2019 - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - - - name: Build - id: cmake_build - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} - run: | - cmake -B build -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" - cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server - - - name: Python setup - id: setup_python - uses: actions/setup-python@v5 - with: - python-version: '3.11' - - - name: Tests dependencies - id: test_dependencies - run: | - pip install -r examples/server/tests/requirements.txt - - - name: Copy Libcurl - id: prepare_libcurl - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} - run: | - cp $env:CURL_PATH/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll - - - name: Tests - id: server_integration_tests - if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }} - run: | - cd examples/server/tests - $env:PYTHONIOENCODING = ":replace" - pytest -v -x -m "not slow" - - - name: Slow 
tests - id: server_integration_tests_slow - if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} - run: | - cd examples/server/tests - $env:SLOW_TESTS = "1" - pytest -v -x From b4b90abe597de9bdde0f933f58b3cd25b0a9510d Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 10 Apr 2025 12:27:14 +0200 Subject: [PATCH 006/117] CMakeLists: add the ggml files and include Mesa files --- ggml/src/ggml-remotingfrontend/CMakeLists.txt | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/CMakeLists.txt b/ggml/src/ggml-remotingfrontend/CMakeLists.txt index 4ab2aaa0ac340..63098a431b0a5 100644 --- a/ggml/src/ggml-remotingfrontend/CMakeLists.txt +++ b/ggml/src/ggml-remotingfrontend/CMakeLists.txt @@ -12,9 +12,27 @@ cmake_policy(SET CMP0114 NEW) message(STATUS "Enable API Remoting frontend found") ggml_add_backend_library(ggml-remotingfrontend - ggml-remoting-frontend.cpp + ggml-backend-buffer.cpp + ggml-backend.cpp + ggml-backend-device.cpp + ggml-backend-reg.cpp + ggml-buffer-type.cpp + ggml-host-buffer-type.cpp + virtgpu.cpp ../../include/ggml-remoting-frontend.h ) -#target_link_libraries(ggml-remotingfrontend PRIVATE remotingfrontend) +target_link_libraries(ggml-remotingfrontend PUBLIC drm) +target_include_directories(ggml-remotingfrontend PUBLIC /usr/include/libdrm/) + +set(REMOTING_PROJECT /Users/kevinpouget/remoting) +set(MESA_PROJECT_HOME ${REMOTING_PROJECT}/mesa) +set(MESA_PROJECT_SRC ${MESA_PROJECT_HOME}/src) + +target_include_directories(ggml-remotingfrontend PUBLIC ${MESA_PROJECT_SRC}/virtio/virtio-gpu/) +target_include_directories(ggml-remotingfrontend PUBLIC ${MESA_PROJECT_HOME}/include) target_include_directories(ggml-remotingfrontend PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) + +target_compile_options(ggml-remotingfrontend PRIVATE -std=c++20) + +# dnf install -y libdrm-devel From 53b42a8a0ea52752a8569689722ebcd9adf9d163 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 10 Apr 2025 12:28:03 +0200 Subject: [PATCH 007/117] ggml-*: move the ggml interfaces to a dedicated file --- .../ggml-backend-buffer.cpp | 39 ++ .../ggml-backend-device.cpp | 81 +++ .../ggml-backend-reg.cpp | 69 +++ .../ggml-remotingfrontend/ggml-backend.cpp | 54 ++ .../ggml-buffer-type.cpp | 158 ++++++ .../ggml-host-buffer-type.cpp | 55 ++ .../ggml-remoting-frontend.cpp | 485 +----------------- .../src/ggml-remotingfrontend/ggml-remoting.h | 61 +++ 8 files changed, 523 insertions(+), 479 deletions(-) create mode 100644 ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp create mode 100644 ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp create mode 100644 ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp create mode 100644 ggml/src/ggml-remotingfrontend/ggml-backend.cpp create mode 100644 ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp create mode 100644 ggml/src/ggml-remotingfrontend/ggml-host-buffer-type.cpp create mode 100644 ggml/src/ggml-remotingfrontend/ggml-remoting.h diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp new file mode 100644 index 0000000000000..638203252a86d --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -0,0 +1,39 @@ +#include + +#include "ggml-remoting.h" + +void ggml_remoting_destroy_buffer(remoting_buffer& buf) { + UNUSED(buf); +} + +static void ggml_remoting_buffer_write(remoting_buffer& dst, size_t offset, const void * src, size_t size) { + UNUSED(dst); + 
UNUSED(offset); + UNUSED(src); + UNUSED(size); +} + +static void ggml_remoting_buffer_read(remoting_buffer& src, size_t offset, void * dst, size_t size) { + UNUSED(src); + UNUSED(offset); + UNUSED(dst); + UNUSED(size); +} + +static void ggml_remoting_buffer_copy_async(remoting_context& ctx, remoting_buffer& dst, size_t dst_offset, remoting_buffer& src, size_t src_offset, size_t size) { + UNUSED(ctx); + UNUSED(dst); + UNUSED(dst_offset); + UNUSED(src); + UNUSED(src_offset); + UNUSED(size); +} + +static void * const remoting_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT + +static uint64_t remoting_tensor_offset(const ggml_tensor * tensor) { + if (tensor->view_src) { + return (uint8_t *) tensor->view_src->data - (uint8_t *) remoting_ptr_base; + } + return (uint8_t *) tensor->data - (uint8_t *) remoting_ptr_base; +} diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp new file mode 100644 index 0000000000000..b18ce03a37121 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -0,0 +1,81 @@ +#include "ggml-remoting.h" + +static const char * ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) { + UNUSED(dev); + return "API Remoting"; +} + +static const char * ggml_backend_remoting_device_get_description(ggml_backend_dev_t dev) { + UNUSED(dev); + return "API Remoting device"; +} + +static enum ggml_backend_dev_type ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) { + UNUSED(dev); + return GGML_BACKEND_DEVICE_TYPE_GPU; +} + +static void ggml_backend_remoting_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) { + UNUSED(device); + *total = 1024*1024*1024; + *free = *total; +} + +static bool ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { + UNUSED(dev); + UNUSED(op); + + return true; +} + +static bool ggml_backend_remoting_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + UNUSED(dev); + UNUSED(buft); + return true; +} + +static bool ggml_backend_remoting_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { + const int min_batch_size = 32; + + return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) || + (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID); + + UNUSED(dev); +} + +static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_host_buffer_type(ggml_backend_dev_t dev) { + UNUSED(dev); + return ggml_backend_remoting_host_buffer_type(); +} + + +static void ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { + props->name = ggml_backend_remoting_device_get_name(dev); + props->description = ggml_backend_remoting_device_get_description(dev); + props->type = ggml_backend_remoting_device_get_type(dev); + ggml_backend_remoting_device_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* .async = */ false, + /* .host_buffer = */ true, + /* .buffer_from_host_ptr = */ false, + /* .events = */ false, + }; +} + +const struct ggml_backend_device_i ggml_backend_remoting_device_i = { + /* .get_name = */ ggml_backend_remoting_device_get_name, + /* .get_description = */ ggml_backend_remoting_device_get_description, + /* .get_memory = */ ggml_backend_remoting_device_get_memory, + /* .get_type = */ ggml_backend_remoting_device_get_type, + /* .get_props = */ ggml_backend_remoting_device_get_props, + /* .init_backend = */ ggml_backend_remoting_device_init, + /* 
.get_buffer_type = */ ggml_backend_remoting_device_get_buffer_type, + /* .get_host_buffer_type = */ ggml_backend_remoting_device_get_host_buffer_type, + /* .buffer_from_host_ptr = */ NULL, + /* .supports_op = */ ggml_backend_remoting_device_supports_op, + /* .supports_buft = */ ggml_backend_remoting_device_supports_buft, + /* .offload_op = */ ggml_backend_remoting_device_offload_op, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, +}; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp new file mode 100644 index 0000000000000..00dddf23f2898 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp @@ -0,0 +1,69 @@ +#include +#include + +#include "ggml-remoting.h" + +static int ggml_backend_remoting_get_device_count() { + return 1; +} + +static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) { + UNUSED(reg); + return ggml_backend_remoting_get_device_count(); +} + +static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) { + static std::vector devices; + + static bool initialized = false; + + { + static std::mutex mutex; + std::lock_guard lock(mutex); + if (!initialized) { + + create_virtgpu(); + + for (size_t i = 0; i < ggml_backend_remoting_reg_get_device_count(reg); i++) { + ggml_backend_remoting_device_context * ctx = new ggml_backend_remoting_device_context; + char desc[256] = "API Remoting device"; + + ctx->device = i; + ctx->name = GGML_REMOTING_NAME + std::to_string(i); + ctx->description = desc; + devices.push_back(new ggml_backend_device { + /* .iface = */ ggml_backend_remoting_device_i, + /* .reg = */ reg, + /* .context = */ ctx, + }); + } + initialized = true; + } + } + + GGML_ASSERT(device < devices.size()); + return devices[device]; +} + +static const char * ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) { + UNUSED(reg); + return GGML_REMOTING_NAME; +} + +static const struct ggml_backend_reg_i ggml_backend_remoting_reg_i = { + /* .get_name = */ ggml_backend_remoting_reg_get_name, + /* .get_device_count = */ ggml_backend_remoting_reg_get_device_count, + /* .get_device = */ ggml_backend_remoting_reg_get_device, + /* .get_proc_address = */ NULL, +}; + +ggml_backend_reg_t ggml_backend_remoting_reg() { + static ggml_backend_reg reg = { + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_remoting_reg_i, + /* .context = */ nullptr, + }; + + RMT_LOG_DEBUG("ggml_backend_remoting_frontend_reg() hello :wave:"); + return ® +} diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp new file mode 100644 index 0000000000000..2618e48929cba --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp @@ -0,0 +1,54 @@ +#include "ggml-remoting.h" + +static const char * ggml_backend_remoting_get_name(ggml_backend_t backend) { + UNUSED(backend); + + return "API Remoting backend"; +} + +static void ggml_backend_remoting_free(ggml_backend_t backend) { + UNUSED(backend); +} + +static ggml_status ggml_backend_remoting_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + UNUSED(backend); + UNUSED(cgraph); + + return GGML_STATUS_SUCCESS; +} + +static ggml_backend_i ggml_backend_remoting_interface = { + /* .get_name = */ ggml_backend_remoting_get_name, + /* .free = */ ggml_backend_remoting_free, + /* .set_tensor_async = */ NULL, // ggml_backend_remoting_set_tensor_async, + /* .get_tensor_async = */ NULL, 
// ggml_backend_remoting_get_tensor_async, + /* .cpy_tensor_async = */ NULL, // ggml_backend_remoting_cpy_tensor_async, + /* .synchronize = */ NULL, // ggml_backend_remoting_synchronize, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_remoting_graph_compute, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, +}; + +static ggml_guid_t ggml_backend_remoting_guid() { + static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b }; + return &guid; +} + + +ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params) { + UNUSED(params); + ggml_backend_remoting_device_context * ctx = (ggml_backend_remoting_device_context *)dev->context; + + ggml_backend_t remoting_backend = new ggml_backend { + /* .guid = */ ggml_backend_remoting_guid(), + /* .interface = */ ggml_backend_remoting_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_remoting_reg(), ctx->device), + /* .context = */ ctx, + }; + + return remoting_backend; +} diff --git a/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp new file mode 100644 index 0000000000000..3d882110b9962 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp @@ -0,0 +1,158 @@ +#include "ggml-remoting.h" + +extern ggml_backend_buffer_i ggml_backend_remoting_buffer_interface; + +struct ggml_backend_remoting_buffer_type_context { + std::string name; +}; + + +static const char * ggml_backend_remoting_buffer_type_name(ggml_backend_buffer_type_t buft) { + UNUSED(buft); + + return "Remoting buffer"; +} + +static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + ggml_backend_remoting_buffer_type_context * ctx = (ggml_backend_remoting_buffer_type_context *) buft->context; + + + return ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, ctx, size); +} + +static size_t ggml_backend_remoting_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + UNUSED(buft); + return 4096; +} + +static size_t ggml_backend_remoting_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + UNUSED(buft); + return 40960; +} + +static size_t ggml_backend_remoting_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { + UNUSED(buft); + UNUSED(tensor); + return ggml_nbytes(tensor); +} + +static ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = { + /* .get_name = */ ggml_backend_remoting_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_remoting_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_remoting_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_remoting_buffer_type_get_max_size, + /* .get_alloc_size = */ ggml_backend_remoting_buffer_type_get_alloc_size, + /* .is_host = */ NULL, +}; + +ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { + + static struct ggml_backend_buffer_type buft { + /* .iface = */ ggml_backend_remoting_buffer_type_interface, + /* .device = */ dev, + /* .context = */ new ggml_backend_remoting_buffer_type_context{ "device_name"}, + }; + + return & buft; +} + +static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_remoting_buffer_context * ctx = (ggml_backend_remoting_buffer_context 
*)buffer->context; + ggml_remoting_destroy_buffer(ctx->dev_buffer); + delete ctx; +} + +static enum ggml_status ggml_backend_remoting_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + if (tensor->view_src != nullptr) { + GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); + } + return GGML_STATUS_SUCCESS; +} + +static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) { + return (void *) 4096; + + UNUSED(buffer); +} + +static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { + UNUSED(buffer); + UNUSED(tensor); + UNUSED(value); + UNUSED(offset); + UNUSED(size); +} + + +static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +#if 0 + ggml_backend_remoting_buffer_context * buf_ctx = (ggml_backend_remoting_buffer_context *)buffer->context; + remoting_buffer buf = buf_ctx->dev_buffer; + + ggml_remoting_buffer_write(buf, remoting_tensor_offset(tensor) + tensor->view_offs + offset, data, size); +#else + UNUSED(buffer); + UNUSED(tensor); + UNUSED(data); + UNUSED(offset); + UNUSED(size); +#endif +} + +static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { +#if 0 + ggml_backend_remoting_buffer_context * buf_ctx = (ggml_backend_remoting_buffer_context *)buffer->context; + + remoting_buffer buf = buf_ctx->dev_buffer; + + ggml_remoting_buffer_read(buf, remoting_tensor_offset(tensor) + tensor->view_offs + offset, data, size); +#else + UNUSED(buffer); + UNUSED(tensor); + UNUSED(data); + UNUSED(offset); + UNUSED(size); +#endif +} + + +static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { + return true; + + UNUSED(buffer); + UNUSED(src); + UNUSED(dst); +} + +static void ggml_remoting_buffer_memset(remoting_buffer& dst, size_t offset, uint32_t c, size_t size) { + UNUSED(dst); + UNUSED(c); + UNUSED(size); + UNUSED(offset); +} + +static void ggml_remoting_buffer_memset_async(remoting_context& ctx, remoting_buffer& dst, size_t offset, uint32_t c, size_t size) { + UNUSED(ctx); + UNUSED(dst); + UNUSED(c); + UNUSED(size); + UNUSED(offset); +} + +static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_remoting_buffer_context * ctx = (ggml_backend_remoting_buffer_context *)buffer->context; + + ggml_remoting_buffer_memset(ctx->dev_buffer, 0, value, buffer->size); +} + +ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { + /* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer, + /* .get_base = */ ggml_backend_remoting_buffer_get_base, + /* .init_tensor = */ ggml_backend_remoting_buffer_init_tensor, + /* .memset_tensor = */ ggml_backend_remoting_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_remoting_buffer_cpy_tensor, + /* .clear = */ ggml_backend_remoting_buffer_clear, + /* .reset = */ NULL, +}; diff --git a/ggml/src/ggml-remotingfrontend/ggml-host-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-host-buffer-type.cpp new file mode 100644 index 0000000000000..b40c72b8d1e8b --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-host-buffer-type.cpp @@ -0,0 +1,55 @@ +#include "ggml-remoting.h" 
+ +// host buffer type + +static const char * ggml_backend_remoting_host_buffer_type_name(ggml_backend_buffer_type_t buft) { + return GGML_REMOTING_NAME "_Host"; + + UNUSED(buft); +} + +static void ggml_backend_remoting_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { +# if 0 + ggml_remoting_host_free(remoting_instance.devices[0], buffer->context); +#endif + UNUSED(buffer); +} + +static ggml_backend_buffer_t ggml_backend_remoting_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + + void *ptr = nullptr; + ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); + buffer->buft = buft; + buffer->iface.free_buffer = ggml_backend_remoting_host_buffer_free_buffer; + + return buffer; + UNUSED(buft); +} + +static size_t ggml_backend_remoting_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + UNUSED(buft); + return 4096; +} + +// Should be changed to return device-specific host buffer type +// but that probably requires changes in llama.cpp +ggml_backend_buffer_type_t ggml_backend_remoting_host_buffer_type() { + static struct ggml_backend_buffer_type ggml_backend_remoting_buffer_type_host = { + /* .iface = */ { + /* .get_name = */ ggml_backend_remoting_host_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_remoting_host_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_remoting_host_buffer_type_get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, + /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_remoting_reg(), 0), + /* .context = */ nullptr, + }; + + // Make sure device 0 is initialized + //ggml_remoting_instance_init(); + //ggml_remoting_get_device(0); + + return &ggml_backend_remoting_buffer_type_host; +} diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.cpp b/ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.cpp index 4c7c1f1dc8f95..87679fe59a8d3 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.cpp @@ -1,499 +1,26 @@ -#include "ggml-remoting-frontend.h" - #include #include #include #include #include #include +#include +#include +#include + +#include "ggml-remoting-frontend.h" +#include "remoting.h" #include "ggml-impl.h" #include "ggml-backend-impl.h" -#define RMT_LOG_DEBUG(msg) std::cerr << msg << std::endl -#define UNUSED GGML_UNUSED int ggml_backend_remoting_get_device_count(); -ggml_backend_buffer_type_t ggml_backend_remoting_host_buffer_type(); - -static void * const remoting_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT - - -struct ggml_backend_remoting_buffer_type_context { - std::string name; -}; - -struct remoting_context_struct { - int i; -}; -typedef std::shared_ptr remoting_context; -typedef std::weak_ptr remoting_context_ref; - -static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) { - UNUSED(reg); - return ggml_backend_remoting_get_device_count(); -} - -static const char * ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) { - UNUSED(reg); - return GGML_REMOTING_NAME; -} - -struct ggml_backend_remoting_device_context { - size_t device; - std::string name; - std::string description; -}; - -static const char * ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) { - UNUSED(dev); - return "API Remoting"; -} -static const char * 
ggml_backend_remoting_device_get_description(ggml_backend_dev_t dev) { - UNUSED(dev); - return "API Remoting device"; -} -static enum ggml_backend_dev_type ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) { - UNUSED(dev); - return GGML_BACKEND_DEVICE_TYPE_GPU; -} -static void ggml_backend_remoting_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) { - UNUSED(device); - *total = 1024*1024*1024; - *free = *total; -} struct remoting_device_struct { std::mutex mutex; }; - -struct remoting_device_struct; -typedef std::shared_ptr remoting_device; -typedef std::weak_ptr remoting_device_ref; - -struct remoting_buffer_struct; -typedef std::shared_ptr remoting_buffer; -typedef std::weak_ptr remoting_buffer_ref; - -// vk buffer type -static const char * ggml_backend_remoting_buffer_type_name(ggml_backend_buffer_type_t buft) { - UNUSED(buft); - - return "Remoting buffer"; -} - -static void ggml_remoting_destroy_buffer(remoting_buffer& buf) { - UNUSED(buf); -} - - -static void ggml_remoting_buffer_memset(remoting_buffer& dst, size_t offset, uint32_t c, size_t size) { - UNUSED(dst); - UNUSED(c); - UNUSED(size); - UNUSED(offset); -} - -static void ggml_remoting_buffer_memset_async(remoting_context& ctx, remoting_buffer& dst, size_t offset, uint32_t c, size_t size) { - UNUSED(ctx); - UNUSED(dst); - UNUSED(c); - UNUSED(size); - UNUSED(offset); -} - - -static uint64_t remoting_tensor_offset(const ggml_tensor * tensor) { - if (tensor->view_src) { - return (uint8_t *) tensor->view_src->data - (uint8_t *) remoting_ptr_base; - } - return (uint8_t *) tensor->data - (uint8_t *) remoting_ptr_base; -} - -struct ggml_backend_remoting_buffer_context { - remoting_device_ref device; - remoting_buffer dev_buffer; - std::string name; - - ggml_backend_remoting_buffer_context(remoting_device_ref device, remoting_buffer&& dev_buffer, std::string& name) : - name(name) { - UNUSED(device); - UNUSED(dev_buffer); - } - - ~ggml_backend_remoting_buffer_context() { - ggml_remoting_destroy_buffer(dev_buffer); - } -}; - -static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_backend_remoting_buffer_context * ctx = (ggml_backend_remoting_buffer_context *)buffer->context; - ggml_remoting_destroy_buffer(ctx->dev_buffer); - delete ctx; -} - -static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) { - return (void *) 4096; - - UNUSED(buffer); -} - -static enum ggml_status ggml_backend_remoting_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - if (tensor->view_src != nullptr) { - GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); - } - return GGML_STATUS_SUCCESS; -} - -static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { - UNUSED(buffer); - UNUSED(tensor); - UNUSED(value); - UNUSED(offset); - UNUSED(size); -} - -static void ggml_remoting_buffer_write(remoting_buffer& dst, size_t offset, const void * src, size_t size) { - UNUSED(dst); - UNUSED(offset); - UNUSED(src); - UNUSED(size); -} - -static void ggml_remoting_buffer_read(remoting_buffer& src, size_t offset, void * dst, size_t size) { - UNUSED(src); - UNUSED(offset); - UNUSED(dst); - UNUSED(size); -} - -static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { -#if 0 - ggml_backend_remoting_buffer_context * buf_ctx = (ggml_backend_remoting_buffer_context 
*)buffer->context; - remoting_buffer buf = buf_ctx->dev_buffer; - - ggml_remoting_buffer_write(buf, remoting_tensor_offset(tensor) + tensor->view_offs + offset, data, size); -#else - UNUSED(buffer); - UNUSED(tensor); - UNUSED(data); - UNUSED(offset); - UNUSED(size); -#endif -} - -static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { -#if 0 - ggml_backend_remoting_buffer_context * buf_ctx = (ggml_backend_remoting_buffer_context *)buffer->context; - - remoting_buffer buf = buf_ctx->dev_buffer; - - ggml_remoting_buffer_read(buf, remoting_tensor_offset(tensor) + tensor->view_offs + offset, data, size); -#else - UNUSED(buffer); - UNUSED(tensor); - UNUSED(data); - UNUSED(offset); - UNUSED(size); -#endif -} - -static void ggml_remoting_buffer_copy_async(remoting_context& ctx, remoting_buffer& dst, size_t dst_offset, remoting_buffer& src, size_t src_offset, size_t size) { - UNUSED(ctx); - UNUSED(dst); - UNUSED(dst_offset); - UNUSED(src); - UNUSED(src_offset); - UNUSED(size); -} - -static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { - return true; - - UNUSED(buffer); - UNUSED(src); - UNUSED(dst); -} - -static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - ggml_backend_remoting_buffer_context * ctx = (ggml_backend_remoting_buffer_context *)buffer->context; - - ggml_remoting_buffer_memset(ctx->dev_buffer, 0, value, buffer->size); -} - -static ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { - /* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer, - /* .get_base = */ ggml_backend_remoting_buffer_get_base, - /* .init_tensor = */ ggml_backend_remoting_buffer_init_tensor, - /* .memset_tensor = */ ggml_backend_remoting_buffer_memset_tensor, - /* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor, - /* .cpy_tensor = */ ggml_backend_remoting_buffer_cpy_tensor, - /* .clear = */ ggml_backend_remoting_buffer_clear, - /* .reset = */ NULL, -}; - -static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - ggml_backend_remoting_buffer_type_context * ctx = (ggml_backend_remoting_buffer_type_context *) buft->context; - - - return ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, ctx, size); -} - -static size_t ggml_backend_remoting_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - UNUSED(buft); - return 4096; -} - -static size_t ggml_backend_remoting_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { - UNUSED(buft); - return 40960; -} - -static size_t ggml_backend_remoting_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { - UNUSED(buft); - UNUSED(tensor); - return ggml_nbytes(tensor); -} - -static ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = { - /* .get_name = */ ggml_backend_remoting_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_remoting_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_remoting_buffer_type_get_alignment, - /* .get_max_size = */ ggml_backend_remoting_buffer_type_get_max_size, - /* .get_alloc_size = */ ggml_backend_remoting_buffer_type_get_alloc_size, - /* .is_host = */ NULL, -}; - -static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { - - static 
struct ggml_backend_buffer_type buft { - /* .iface = */ ggml_backend_remoting_buffer_type_interface, - /* .device = */ dev, - /* .context = */ new ggml_backend_remoting_buffer_type_context{ "device_name"}, - }; - - return & buft; -} - -static bool ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { - UNUSED(dev); - UNUSED(op); - - return true; -} - -static bool ggml_backend_remoting_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - UNUSED(dev); - UNUSED(buft); - return true; -} - - -static bool ggml_backend_remoting_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { - const int min_batch_size = 32; - - return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) || - (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID); - - UNUSED(dev); -} - -static const char * ggml_backend_remoting_name(ggml_backend_t backend) { - UNUSED(backend); - - return "API Remoting backend"; -} - -static void ggml_backend_remoting_free(ggml_backend_t backend) { - UNUSED(backend); -} - -static ggml_status ggml_backend_remoting_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { - UNUSED(backend); - UNUSED(cgraph); - - return GGML_STATUS_SUCCESS; -} - -static ggml_backend_i ggml_backend_remoting_interface = { - /* .get_name = */ ggml_backend_remoting_name, - /* .free = */ ggml_backend_remoting_free, - /* .set_tensor_async = */ NULL, // ggml_backend_remoting_set_tensor_async, - /* .get_tensor_async = */ NULL, // ggml_backend_remoting_get_tensor_async, - /* .cpy_tensor_async = */ NULL, // ggml_backend_remoting_cpy_tensor_async, - /* .synchronize = */ NULL, // ggml_backend_remoting_synchronize, - /* .graph_plan_create = */ NULL, - /* .graph_plan_free = */ NULL, - /* .graph_plan_update = */ NULL, - /* .graph_plan_compute = */ NULL, - /* .graph_compute = */ ggml_backend_remoting_graph_compute, - /* .event_record = */ NULL, - /* .event_wait = */ NULL, -}; - -static void ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { - props->name = ggml_backend_remoting_device_get_name(dev); - props->description = ggml_backend_remoting_device_get_description(dev); - props->type = ggml_backend_remoting_device_get_type(dev); - ggml_backend_remoting_device_get_memory(dev, &props->memory_free, &props->memory_total); - props->caps = { - /* .async = */ false, - /* .host_buffer = */ true, - /* .buffer_from_host_ptr = */ false, - /* .events = */ false, - }; -} - -static ggml_guid_t ggml_backend_remoting_guid() { - static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b }; - return &guid; -} - - -static ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params) { - UNUSED(params); - ggml_backend_remoting_device_context * ctx = (ggml_backend_remoting_device_context *)dev->context; - - ggml_backend_t remoting_backend = new ggml_backend { - /* .guid = */ ggml_backend_remoting_guid(), - /* .interface = */ ggml_backend_remoting_interface, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_remoting_reg(), ctx->device), - /* .context = */ ctx, - }; - - return remoting_backend; -} - -// host buffer type - -static const char * ggml_backend_remoting_host_buffer_type_name(ggml_backend_buffer_type_t buft) { - return GGML_REMOTING_NAME "_Host"; - - UNUSED(buft); -} - -static void ggml_backend_remoting_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { -# if 0 - 
ggml_remoting_host_free(remoting_instance.devices[0], buffer->context); -#endif - UNUSED(buffer); -} - -static ggml_backend_buffer_t ggml_backend_remoting_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - - void *ptr = nullptr; - ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); - buffer->buft = buft; - buffer->iface.free_buffer = ggml_backend_remoting_host_buffer_free_buffer; - - return buffer; - UNUSED(buft); -} - -static size_t ggml_backend_remoting_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - UNUSED(buft); - return 4096; -} - -// Should be changed to return device-specific host buffer type -// but that probably requires changes in llama.cpp -ggml_backend_buffer_type_t ggml_backend_remoting_host_buffer_type() { - static struct ggml_backend_buffer_type ggml_backend_remoting_buffer_type_host = { - /* .iface = */ { - /* .get_name = */ ggml_backend_remoting_host_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_remoting_host_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_remoting_host_buffer_type_get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX - /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, - /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, - }, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_remoting_reg(), 0), - /* .context = */ nullptr, - }; - - // Make sure device 0 is initialized - //ggml_remoting_instance_init(); - //ggml_remoting_get_device(0); - - return &ggml_backend_remoting_buffer_type_host; -} - -static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_host_buffer_type(ggml_backend_dev_t dev) { - UNUSED(dev); - return ggml_backend_remoting_host_buffer_type(); -} - -static const struct ggml_backend_device_i ggml_backend_remoting_device_i = { - /* .get_name = */ ggml_backend_remoting_device_get_name, - /* .get_description = */ ggml_backend_remoting_device_get_description, - /* .get_memory = */ ggml_backend_remoting_device_get_memory, - /* .get_type = */ ggml_backend_remoting_device_get_type, - /* .get_props = */ ggml_backend_remoting_device_get_props, - /* .init_backend = */ ggml_backend_remoting_device_init, - /* .get_buffer_type = */ ggml_backend_remoting_device_get_buffer_type, - /* .get_host_buffer_type = */ ggml_backend_remoting_device_get_host_buffer_type, - /* .buffer_from_host_ptr = */ NULL, - /* .supports_op = */ ggml_backend_remoting_device_supports_op, - /* .supports_buft = */ ggml_backend_remoting_device_supports_buft, - /* .offload_op = */ ggml_backend_remoting_device_offload_op, - /* .event_new = */ NULL, - /* .event_free = */ NULL, - /* .event_synchronize = */ NULL, -}; - -static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) { - static std::vector devices; - - static bool initialized = false; - - { - static std::mutex mutex; - std::lock_guard lock(mutex); - if (!initialized) { - for (size_t i = 0; i < ggml_backend_remoting_reg_get_device_count(reg); i++) { - ggml_backend_remoting_device_context * ctx = new ggml_backend_remoting_device_context; - char desc[256] = "API Remoting device"; - - ctx->device = i; - ctx->name = GGML_REMOTING_NAME + std::to_string(i); - ctx->description = desc; - devices.push_back(new ggml_backend_device { - /* .iface = */ ggml_backend_remoting_device_i, - /* .reg = */ reg, - /* .context = */ ctx, - }); - } - initialized = true; - } - } - - GGML_ASSERT(device < devices.size()); - return devices[device]; -} - 
-int ggml_backend_remoting_get_device_count() { - return 1; -} - -static const struct ggml_backend_reg_i ggml_backend_remoting_reg_i = { - /* .get_name = */ ggml_backend_remoting_reg_get_name, - /* .get_device_count = */ ggml_backend_remoting_reg_get_device_count, - /* .get_device = */ ggml_backend_remoting_reg_get_device, - /* .get_proc_address = */ NULL, -}; - -ggml_backend_reg_t ggml_backend_remoting_reg() { - static ggml_backend_reg reg = { - /* .api_version = */ GGML_BACKEND_API_VERSION, - /* .iface = */ ggml_backend_remoting_reg_i, - /* .context = */ nullptr, - }; - - RMT_LOG_DEBUG("ggml_backend_remoting_frontend_reg() hello :wave:"); - return ® -} diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h new file mode 100644 index 0000000000000..c6acdf6cfe1c8 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -0,0 +1,61 @@ +#pragma once + +#include +#include + +#include "ggml-remoting-frontend.h" + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" +#include "virtgpu.h" + +#define UNUSED GGML_UNUSED + +#define RMT_LOG_DEBUG(msg) std::cerr << msg << std::endl + +struct ggml_backend_remoting_device_context { + size_t device; + std::string name; + std::string description; +}; + +extern const struct ggml_backend_device_i ggml_backend_remoting_device_i; + +ggml_backend_buffer_type_t ggml_backend_remoting_host_buffer_type(); +ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params); +ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev); +ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params); + +struct remoting_buffer_struct; +typedef std::shared_ptr remoting_buffer; +typedef std::weak_ptr remoting_buffer_ref; + +void ggml_remoting_destroy_buffer(remoting_buffer& buf); + +struct remoting_device_struct; +typedef std::shared_ptr remoting_device; +typedef std::weak_ptr remoting_device_ref; + +struct ggml_backend_remoting_buffer_context { + remoting_device_ref device; + remoting_buffer dev_buffer; + std::string name; + + ggml_backend_remoting_buffer_context(remoting_device_ref device, remoting_buffer&& dev_buffer, std::string& name) : + name(name) { + UNUSED(device); + UNUSED(dev_buffer); + } + + ~ggml_backend_remoting_buffer_context() { + ggml_remoting_destroy_buffer(dev_buffer); + } +}; + + +struct remoting_context_struct { + int i; +}; +typedef std::shared_ptr remoting_context; +typedef std::weak_ptr remoting_context_ref; From 5049b2fe4b0c6d39ad99d677addd5919e79ec7f1 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 10 Apr 2025 12:28:17 +0200 Subject: [PATCH 008/117] run.vulkan.sh: allow running with GDB --- run.vulkan.sh | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/run.vulkan.sh b/run.vulkan.sh index 7f44334290bbf..1cd38ea58ef52 100755 --- a/run.vulkan.sh +++ b/run.vulkan.sh @@ -1 +1,10 @@ -../build.vulkan/bin/llama-run --ngl 99 --verbose ~/models/llama3.2 "say nothing" +#! 
/bin/bash + +if [[ ${1:-} == "gdb" ]]; then + prefix="gdb --args" +else + prefix="" +fi + +export VN_DEBUG=init +$prefix ../build.vulkan/bin/llama-run --ngl 99 --verbose ~/models/llama3.2 "say nothing" From ffa659f27d76d22a4896b33dac3bc7b80dc76b1e Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 10 Apr 2025 12:28:37 +0200 Subject: [PATCH 009/117] virtgpu: start integrating virt-gpu code --- ggml/src/ggml-remotingfrontend/virtgpu.cpp | 330 +++++++++++++++++++++ ggml/src/ggml-remotingfrontend/virtgpu.h | 171 +++++++++++ 2 files changed, 501 insertions(+) create mode 100644 ggml/src/ggml-remotingfrontend/virtgpu.cpp create mode 100644 ggml/src/ggml-remotingfrontend/virtgpu.h diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp new file mode 100644 index 0000000000000..f73be2767527d --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -0,0 +1,330 @@ +#include +#include +#include + +#include "virtgpu.h" + +static inline void +virtgpu_init_shmem_blob_mem(struct virtgpu *gpu) +{ + /* VIRTGPU_BLOB_MEM_GUEST allocates from the guest system memory. They are + * logically contiguous in the guest but are sglists (iovecs) in the host. + * That makes them slower to process in the host. With host process + * isolation, it also becomes impossible for the host to access sglists + * directly. + * + * While there are ideas (and shipped code in some cases) such as creating + * udmabufs from sglists, or having a dedicated guest heap, it seems the + * easiest way is to reuse VIRTGPU_BLOB_MEM_HOST3D. That is, when the + * renderer sees a request to export a blob where + * + * - blob_mem is VIRTGPU_BLOB_MEM_HOST3D + * - blob_flags is VIRTGPU_BLOB_FLAG_USE_MAPPABLE + * - blob_id is 0 + * + * it allocates a host shmem. + * + * supports_blob_id_0 has been enforced by mandated render server config. 
+ */ + assert(gpu->capset.data.supports_blob_id_0); + gpu->shmem_blob_mem = VIRTGPU_BLOB_MEM_HOST3D; +} + +void +create_virtgpu() { + struct virtgpu *gpu = new struct virtgpu(); + + VkResult result = virtgpu_open(gpu); + GGML_ASSERT(result == VK_SUCCESS); + + result = virtgpu_init_params(gpu); + GGML_ASSERT(result == VK_SUCCESS); + + result = virtgpu_init_capset(gpu); + GGML_ASSERT(result == VK_SUCCESS); + + result = virtgpu_init_context(gpu); + GGML_ASSERT(result == VK_SUCCESS); + + virtgpu_init_shmem_blob_mem(gpu); +} + +static VkResult +virtgpu_open(struct virtgpu *gpu) +{ + drmDevicePtr devs[8]; + int count = drmGetDevices2(0, devs, ARRAY_SIZE(devs)); + if (count < 0) { + INFO("failed to enumerate DRM devices"); + return VK_ERROR_INITIALIZATION_FAILED; + } + + VkResult result = VK_ERROR_INITIALIZATION_FAILED; + for (int i = 0; i < count; i++) { + result = virtgpu_open_device(gpu, devs[i]); + if (result == VK_SUCCESS) + break; + } + + drmFreeDevices(devs, count); + + return result; +} + +static VkResult +virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev) +{ + bool supported_bus = false; + + switch (dev->bustype) { + case DRM_BUS_PCI: + if (dev->deviceinfo.pci->vendor_id == VIRTGPU_PCI_VENDOR_ID && + dev->deviceinfo.pci->device_id == VIRTGPU_PCI_DEVICE_ID) + supported_bus = true; + break; + case DRM_BUS_PLATFORM: + supported_bus = true; + break; + default: + break; + } + + if (!supported_bus || !(dev->available_nodes & (1 << DRM_NODE_RENDER))) { + if (VN_DEBUG(INIT)) { + const char *name = "unknown"; + for (uint32_t i = 0; i < DRM_NODE_MAX; i++) { + if (dev->available_nodes & (1 << i)) { + name = dev->nodes[i]; + break; + } + } + vn_log(gpu->instance, "skipping DRM device %s", name); + } + return VK_ERROR_INITIALIZATION_FAILED; + } + + const char *primary_path = dev->nodes[DRM_NODE_PRIMARY]; + const char *node_path = dev->nodes[DRM_NODE_RENDER]; + + int fd = open(node_path, O_RDWR | O_CLOEXEC); + if (fd < 0) { + if (VN_DEBUG(INIT)) + vn_log(gpu->instance, "failed to open %s", node_path); + return VK_ERROR_INITIALIZATION_FAILED; + } + + drmVersionPtr version = drmGetVersion(fd); + if (!version || strcmp(version->name, "virtio_gpu") || + version->version_major != 0) { + if (VN_DEBUG(INIT)) { + if (version) { + vn_log(gpu->instance, "unknown DRM driver %s version %d", + version->name, version->version_major); + } else { + vn_log(gpu->instance, "failed to get DRM driver version"); + } + } + if (version) + drmFreeVersion(version); + close(fd); + return VK_ERROR_INITIALIZATION_FAILED; + } + + gpu->fd = fd; + + struct stat st; + if (stat(primary_path, &st) == 0) { + gpu->has_primary = true; + gpu->primary_major = major(st.st_rdev); + gpu->primary_minor = minor(st.st_rdev); + } else { + gpu->has_primary = false; + gpu->primary_major = 0; + gpu->primary_minor = 0; + } + stat(node_path, &st); + gpu->render_major = major(st.st_rdev); + gpu->render_minor = minor(st.st_rdev); + + gpu->bustype = dev->bustype; + if (dev->bustype == DRM_BUS_PCI) + gpu->pci_bus_info = *dev->businfo.pci; + + drmFreeVersion(version); + + if (VN_DEBUG(INIT)) + vn_log(gpu->instance, "using DRM device %s", node_path); + + return VK_SUCCESS; +} + +void +vn_log(struct remoting_dev_instance *instance, const char *format, ...) 
+{ + if (instance) { + printf(""); + } + + va_list ap; + + va_start(ap, format); + vprintf(format, ap); + va_end(ap); + + /* instance may be NULL or partially initialized */ +} + + + +static VkResult +virtgpu_init_context(struct virtgpu *gpu) +{ + assert(!gpu->capset.version); + const int ret = virtgpu_ioctl_context_init(gpu, gpu->capset.id); + if (ret) { + if (VN_DEBUG(INIT)) { + vn_log(gpu->instance, "failed to initialize context: %s", + strerror(errno)); + } + return VK_ERROR_INITIALIZATION_FAILED; + } + + return VK_SUCCESS; +} + +static VkResult +virtgpu_init_capset(struct virtgpu *gpu) +{ + gpu->capset.id = VIRGL_RENDERER_CAPSET_VENUS; + gpu->capset.version = 0; + + const int ret = + virtgpu_ioctl_get_caps(gpu, gpu->capset.id, gpu->capset.version, + &gpu->capset.data, sizeof(gpu->capset.data)); + if (ret) { + if (VN_DEBUG(INIT)) { + vn_log(gpu->instance, "failed to get venus v%d capset: %s", + gpu->capset.version, strerror(errno)); + } + return VK_ERROR_INITIALIZATION_FAILED; + } + + return VK_SUCCESS; +} + +static VkResult +virtgpu_init_params(struct virtgpu *gpu) +{ + const uint64_t required_params[] = { + VIRTGPU_PARAM_3D_FEATURES, VIRTGPU_PARAM_CAPSET_QUERY_FIX, + VIRTGPU_PARAM_RESOURCE_BLOB, VIRTGPU_PARAM_CONTEXT_INIT, + }; + uint64_t val; + for (uint32_t i = 0; i < ARRAY_SIZE(required_params); i++) { + val = virtgpu_ioctl_getparam(gpu, required_params[i]); + if (!val) { + if (VN_DEBUG(INIT)) { + vn_log(gpu->instance, "required kernel param %d is missing", + (int)required_params[i]); + } + return VK_ERROR_INITIALIZATION_FAILED; + } + } + + val = virtgpu_ioctl_getparam(gpu, VIRTGPU_PARAM_HOST_VISIBLE); + if (val) { + gpu->bo_blob_mem = VIRTGPU_BLOB_MEM_HOST3D; + } else { + val = virtgpu_ioctl_getparam(gpu, VIRTGPU_PARAM_GUEST_VRAM); + if (val) { + gpu->bo_blob_mem = VIRTGPU_BLOB_MEM_GUEST_VRAM; + } + } + + if (!val) { + vn_log(gpu->instance, + "one of required kernel params (%d or %d) is missing", + (int)VIRTGPU_PARAM_HOST_VISIBLE, (int)VIRTGPU_PARAM_GUEST_VRAM); + return VK_ERROR_INITIALIZATION_FAILED; + } + + /* Cross-device feature is optional. It enables sharing dma-bufs + * with other virtio devices, like virtio-wl or virtio-video used + * by ChromeOS VMs. Qemu doesn't support cross-device sharing. 
+ */ + val = virtgpu_ioctl_getparam(gpu, VIRTGPU_PARAM_CROSS_DEVICE); + if (val) + gpu->supports_cross_device = true; + + /* implied by CONTEXT_INIT uapi */ + gpu->max_timeline_count = 64; + + return VK_SUCCESS; +} + + +static int +virtgpu_ioctl(struct virtgpu *gpu, unsigned long request, void *args) +{ + return drmIoctl(gpu->fd, request, args); +} + +static int +virtgpu_ioctl_context_init(struct virtgpu *gpu, + enum virgl_renderer_capset capset_id) +{ + struct drm_virtgpu_context_set_param ctx_set_params[3] = { + { + .param = VIRTGPU_CONTEXT_PARAM_CAPSET_ID, + .value = capset_id, + }, + { + .param = VIRTGPU_CONTEXT_PARAM_NUM_RINGS, + .value = 64, + }, + { + .param = VIRTGPU_CONTEXT_PARAM_POLL_RINGS_MASK, + .value = 0, /* don't generate drm_events on fence signaling */ + }, + }; + + struct drm_virtgpu_context_init args = { + .num_params = ARRAY_SIZE(ctx_set_params), + .pad = 0, + .ctx_set_params = (uintptr_t)&ctx_set_params, + }; + + return virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_CONTEXT_INIT, &args); +} + +static int +virtgpu_ioctl_get_caps(struct virtgpu *gpu, + enum virgl_renderer_capset id, + uint32_t version, + void *capset, + size_t capset_size) +{ + struct drm_virtgpu_get_caps args = { + .cap_set_id = id, + .cap_set_ver = version, + .addr = (uintptr_t)capset, + .size = (__u32) capset_size, + .pad = 0, + }; + + return virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_GET_CAPS, &args); +} + +static uint64_t +virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param) +{ + /* val must be zeroed because kernel only writes the lower 32 bits */ + uint64_t val = 0; + struct drm_virtgpu_getparam args = { + .param = param, + .value = (uintptr_t)&val, + }; + + const int ret = virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_GETPARAM, &args); + return ret ? 0 : val; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.h b/ggml/src/ggml-remotingfrontend/virtgpu.h new file mode 100644 index 0000000000000..618fc5dc6e3b6 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu.h @@ -0,0 +1,171 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ggml-remoting-frontend.h" +#define VIRGL_RENDERER_UNSTABLE_APIS 1 +#include "drm-uapi/virtgpu_drm.h" +#include "virglrenderer_hw.h" +#include "venus_hw.h" + +/* from src/virtio/vulkan/vn_renderer_virtgpu.c */ +#define VIRTGPU_PCI_VENDOR_ID 0x1af4 +#define VIRTGPU_PCI_DEVICE_ID 0x1050 +#define VIRTGPU_BLOB_MEM_GUEST_VRAM 0x0004 +#define VIRTGPU_PARAM_GUEST_VRAM 9 + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) + +#define VN_DEBUG(what) true + +typedef enum VkResult { + VK_SUCCESS = 0, + VK_NOT_READY = 1, + VK_TIMEOUT = 2, + VK_EVENT_SET = 3, + VK_EVENT_RESET = 4, + VK_INCOMPLETE = 5, + VK_ERROR_OUT_OF_HOST_MEMORY = -1, + VK_ERROR_OUT_OF_DEVICE_MEMORY = -2, + VK_ERROR_INITIALIZATION_FAILED = -3, + VK_ERROR_DEVICE_LOST = -4, + VK_ERROR_MEMORY_MAP_FAILED = -5, + VK_ERROR_LAYER_NOT_PRESENT = -6, + VK_ERROR_EXTENSION_NOT_PRESENT = -7, + VK_ERROR_FEATURE_NOT_PRESENT = -8, + VK_ERROR_INCOMPATIBLE_DRIVER = -9, + VK_ERROR_TOO_MANY_OBJECTS = -10, + VK_ERROR_FORMAT_NOT_SUPPORTED = -11, + VK_ERROR_FRAGMENTED_POOL = -12, + VK_ERROR_UNKNOWN = -13, + VK_ERROR_OUT_OF_POOL_MEMORY = -1000069000, + VK_ERROR_INVALID_EXTERNAL_HANDLE = -1000072003, + VK_ERROR_FRAGMENTATION = -1000161000, + VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS = -1000257000, + VK_PIPELINE_COMPILE_REQUIRED = 1000297000, + VK_ERROR_SURFACE_LOST_KHR = -1000000000, + VK_ERROR_NATIVE_WINDOW_IN_USE_KHR = -1000000001, + VK_SUBOPTIMAL_KHR = 1000001003, + 
VK_ERROR_OUT_OF_DATE_KHR = -1000001004, + VK_ERROR_INCOMPATIBLE_DISPLAY_KHR = -1000003001, + VK_ERROR_VALIDATION_FAILED_EXT = -1000011001, + VK_ERROR_INVALID_SHADER_NV = -1000012000, + VK_ERROR_IMAGE_USAGE_NOT_SUPPORTED_KHR = -1000023000, + VK_ERROR_VIDEO_PICTURE_LAYOUT_NOT_SUPPORTED_KHR = -1000023001, + VK_ERROR_VIDEO_PROFILE_OPERATION_NOT_SUPPORTED_KHR = -1000023002, + VK_ERROR_VIDEO_PROFILE_FORMAT_NOT_SUPPORTED_KHR = -1000023003, + VK_ERROR_VIDEO_PROFILE_CODEC_NOT_SUPPORTED_KHR = -1000023004, + VK_ERROR_VIDEO_STD_VERSION_NOT_SUPPORTED_KHR = -1000023005, + VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT = -1000158000, + VK_ERROR_NOT_PERMITTED_KHR = -1000174001, + VK_ERROR_FULL_SCREEN_EXCLUSIVE_MODE_LOST_EXT = -1000255000, + VK_THREAD_IDLE_KHR = 1000268000, + VK_THREAD_DONE_KHR = 1000268001, + VK_OPERATION_DEFERRED_KHR = 1000268002, + VK_OPERATION_NOT_DEFERRED_KHR = 1000268003, + VK_ERROR_INVALID_VIDEO_STD_PARAMETERS_KHR = -1000299000, + VK_ERROR_COMPRESSION_EXHAUSTED_EXT = -1000338000, + VK_INCOMPATIBLE_SHADER_BINARY_EXT = 1000482000, + VK_ERROR_OUT_OF_POOL_MEMORY_KHR = VK_ERROR_OUT_OF_POOL_MEMORY, + VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR = VK_ERROR_INVALID_EXTERNAL_HANDLE, + VK_ERROR_FRAGMENTATION_EXT = VK_ERROR_FRAGMENTATION, + VK_ERROR_NOT_PERMITTED_EXT = VK_ERROR_NOT_PERMITTED_KHR, + VK_ERROR_INVALID_DEVICE_ADDRESS_EXT = VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS, + VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS_KHR = VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS, + VK_PIPELINE_COMPILE_REQUIRED_EXT = VK_PIPELINE_COMPILE_REQUIRED, + VK_ERROR_PIPELINE_COMPILE_REQUIRED_EXT = VK_PIPELINE_COMPILE_REQUIRED, + VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT = VK_INCOMPATIBLE_SHADER_BINARY_EXT, + VK_RESULT_MAX_ENUM = 0x7FFFFFFF +} VkResult; + + +struct remoting_dev_instance { + int yes; +}; + +#define PRINTFLIKE(f, a) __attribute__ ((format(__printf__, f, a))) + +inline void +vn_log(struct remoting_dev_instance *instance, const char *format, ...) + PRINTFLIKE(2, 3); + + +inline void +INFO(const char *format, ...) 
{ + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); +} + + +struct virtgpu { + //struct vn_renderer base; + + struct remoting_dev_instance *instance; + + int fd; + + bool has_primary; + int primary_major; + int primary_minor; + int render_major; + int render_minor; + + int bustype; + drmPciBusInfo pci_bus_info; + + uint32_t max_timeline_count; + + struct { + enum virgl_renderer_capset id; + uint32_t version; + struct virgl_renderer_capset_venus data; + } capset; + + uint32_t shmem_blob_mem; + uint32_t bo_blob_mem; + + /* note that we use gem_handle instead of res_id to index because + * res_id is monotonically increasing by default (see + * virtio_gpu_resource_id_get) + */ + //struct util_sparse_array shmem_array; + // struct util_sparse_array bo_array; + + mtx_t dma_buf_import_mutex; + +// struct vn_renderer_shmem_cache shmem_cache; + + bool supports_cross_device; +}; + + +void create_virtgpu(); +static VkResult virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev); +static VkResult virtgpu_open(struct virtgpu *gpu); + + +static VkResult virtgpu_init_params(struct virtgpu *gpu); +static VkResult virtgpu_init_capset(struct virtgpu *gpu); +static VkResult virtgpu_init_context(struct virtgpu *gpu); + +static int virtgpu_ioctl_context_init(struct virtgpu *gpu, + enum virgl_renderer_capset capset_id); +static int +virtgpu_ioctl_get_caps(struct virtgpu *gpu, + enum virgl_renderer_capset id, + uint32_t version, + void *capset, + size_t capset_size); +static uint64_t virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param); +static void virtgpu_init_renderer_info(struct virtgpu *gpu); From 3ba78a5d3337eb93c0d1500f5bd4e071ea30094e Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 10 Apr 2025 16:49:40 +0200 Subject: [PATCH 010/117] virtgpu: allocate a shared page with the host --- ggml/src/ggml-remotingfrontend/CMakeLists.txt | 2 + .../src/ggml-remotingfrontend/virtgpu-shm.cpp | 107 +++++++++ ggml/src/ggml-remotingfrontend/virtgpu-shm.h | 37 ++++ .../ggml-remotingfrontend/virtgpu-utils.cpp | 186 ++++++++++++++++ .../src/ggml-remotingfrontend/virtgpu-utils.h | 50 +++++ ggml/src/ggml-remotingfrontend/virtgpu.cpp | 35 ++- ggml/src/ggml-remotingfrontend/virtgpu.h | 20 +- .../src/ggml-remotingfrontend/virtgpu_venus.c | 209 ++++++++++++++++++ 8 files changed, 628 insertions(+), 18 deletions(-) create mode 100644 ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp create mode 100644 ggml/src/ggml-remotingfrontend/virtgpu-shm.h create mode 100644 ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp create mode 100644 ggml/src/ggml-remotingfrontend/virtgpu-utils.h create mode 100644 ggml/src/ggml-remotingfrontend/virtgpu_venus.c diff --git a/ggml/src/ggml-remotingfrontend/CMakeLists.txt b/ggml/src/ggml-remotingfrontend/CMakeLists.txt index 63098a431b0a5..778fddd89a164 100644 --- a/ggml/src/ggml-remotingfrontend/CMakeLists.txt +++ b/ggml/src/ggml-remotingfrontend/CMakeLists.txt @@ -19,6 +19,8 @@ ggml_add_backend_library(ggml-remotingfrontend ggml-buffer-type.cpp ggml-host-buffer-type.cpp virtgpu.cpp + virtgpu-shm.cpp + virtgpu-utils.cpp ../../include/ggml-remoting-frontend.h ) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp new file mode 100644 index 0000000000000..f027860407a4e --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp @@ -0,0 +1,107 @@ +#include + +#include "virtgpu-shm.h" + +static uint32_t +virtgpu_ioctl_resource_create_blob(struct 
virtgpu *gpu, + uint32_t blob_mem, + uint32_t blob_flags, + size_t blob_size, + uint64_t blob_id, + uint32_t *res_id) +{ +#ifdef SIMULATE_BO_SIZE_FIX + blob_size = align64(blob_size, 4096); +#endif + + struct drm_virtgpu_resource_create_blob args = { + .blob_mem = blob_mem, + .blob_flags = blob_flags, + .bo_handle = 0, + .res_handle = 0, + .size = blob_size, + .pad = 0, + .cmd_size = 0, + .cmd = 0, + .blob_id = blob_id, + }; + + if (virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_RESOURCE_CREATE_BLOB, &args)) + return 0; + + *res_id = args.res_handle; + return args.bo_handle; +} + +static void +virtgpu_ioctl_gem_close(struct virtgpu *gpu, uint32_t gem_handle) +{ + struct drm_gem_close args = { + .handle = gem_handle, + .pad = 0, + }; + + const int ret = virtgpu_ioctl(gpu, DRM_IOCTL_GEM_CLOSE, &args); + assert(!ret); +} + +static void * +virtgpu_ioctl_map(struct virtgpu *gpu, uint32_t gem_handle, size_t size) +{ + struct drm_virtgpu_map args = { + .offset = 0, + .handle = gem_handle, + .pad = 0, + }; + printf("Say hello world\n"); + if (virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_MAP, &args)) + return NULL; + + void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, gpu->fd, + args.offset); + if (ptr == MAP_FAILED) + return NULL; + + return ptr; +} + +void +virtgpu_shmem_destroy(struct virtgpu *gpu, + struct virtgpu_shmem *shmem) +{ + munmap(shmem->base.mmap_ptr, shmem->base.mmap_size); + virtgpu_ioctl_gem_close(gpu, shmem->gem_handle); +} + +struct vn_renderer_shmem * +virtgpu_shmem_create(struct virtgpu *gpu, size_t size) +{ + size = align64(size, 16384); + + uint32_t res_id; + uint32_t gem_handle = virtgpu_ioctl_resource_create_blob( + gpu, gpu->shmem_blob_mem, VIRTGPU_BLOB_FLAG_USE_MAPPABLE, size, 0, + &res_id); + if (!gem_handle) + return NULL; + + void *ptr = virtgpu_ioctl_map(gpu, gem_handle, size); + if (!ptr) { + virtgpu_ioctl_gem_close(gpu, gem_handle); + return NULL; + } + if (gpu->shmem_array.elem_size == 0) { + INFO("gpu->shmem_array.elem_size == 0 | Not working :/\n"); + assert(false); + } + struct virtgpu_shmem *shmem = (struct virtgpu_shmem *) util_sparse_array_get(&gpu->shmem_array, gem_handle); + + shmem->gem_handle = gem_handle; + shmem->base.res_id = res_id; + shmem->base.mmap_size = size; + shmem->base.mmap_ptr = ptr; + shmem->base.refcount.count = 1; + shmem->base.gem_handle = gem_handle; + + return &shmem->base; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-shm.h b/ggml/src/ggml-remotingfrontend/virtgpu-shm.h new file mode 100644 index 0000000000000..3bdc5ca700f1b --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-shm.h @@ -0,0 +1,37 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "virtgpu.h" +#include "virtgpu-utils.h" + +struct vn_refcount { + int count; //atomic_int +}; + + +struct vn_renderer_shmem { + struct vn_refcount refcount; + + uint32_t res_id; + size_t mmap_size; /* for internal use only (i.e., munmap) */ + void *mmap_ptr; + + struct list_head cache_head; + int64_t cache_timestamp; + + uint32_t gem_handle; +}; + +struct vn_renderer_shmem *virtgpu_shmem_create(struct virtgpu *gpu, size_t size); +void virtgpu_shmem_destroy(struct virtgpu *gpu, struct virtgpu_shmem *shmem); + + +struct virtgpu_shmem { + struct vn_renderer_shmem base; + uint32_t gem_handle; +}; diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp new file mode 100644 index 0000000000000..100f495add1bc --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp @@ -0,0 +1,186 @@ 
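+/*
+ * Lock-free sparse array (same design as Mesa's util_sparse_array): a radix
+ * tree whose nodes are allocated on demand with compare-and-swap, used by
+ * the virtgpu code to map GEM handles to per-buffer bookkeeping structs
+ * without taking a lock.  Leaf nodes are zeroed when allocated, so a slot
+ * returned by util_sparse_array_get() starts out zero-initialized.
+ */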
+#include "virtgpu-utils.h" +#include +#include +#include + +#define NODE_ALLOC_ALIGN 64 +#define NODE_PTR_MASK (~((uintptr_t)NODE_ALLOC_ALIGN - 1)) +#define NODE_LEVEL_MASK ((uintptr_t)NODE_ALLOC_ALIGN - 1) +#define NULL_NODE 0 + +#define os_malloc_aligned(_size, _align) _aligned_malloc(_size, _align) +#define os_free_aligned(_ptr) free(_ptr) +#define p_atomic_cmpxchg(v, old, _new) \ + __sync_val_compare_and_swap((v), (old), (_new)) + +static inline uint64_t +util_logbase2_64(uint64_t n) +{ +#if defined(HAVE___BUILTIN_CLZLL) + return ((sizeof(uint64_t) * 8 - 1) - __builtin_clzll(n | 1)); +#else + uint64_t pos = 0ull; + if (n >= 1ull<<32) { n >>= 32; pos += 32; } + if (n >= 1ull<<16) { n >>= 16; pos += 16; } + if (n >= 1ull<< 8) { n >>= 8; pos += 8; } + if (n >= 1ull<< 4) { n >>= 4; pos += 4; } + if (n >= 1ull<< 2) { n >>= 2; pos += 2; } + if (n >= 1ull<< 1) { pos += 1; } + return pos; +#endif +} + +void +util_sparse_array_init(struct util_sparse_array *arr, + size_t elem_size, size_t node_size) +{ + memset(arr, 0, sizeof(*arr)); + arr->elem_size = elem_size; + arr->node_size_log2 = util_logbase2_64(node_size); + assert(node_size >= 2 && node_size == (1ull << arr->node_size_log2)); +} + +static inline void * +os_malloc_aligned(size_t size, size_t alignment) +{ + void *ptr; + alignment = (alignment + sizeof(void*) - 1) & ~(sizeof(void*) - 1); + if(posix_memalign(&ptr, alignment, size) != 0) + return NULL; + return ptr; +} + +static inline void * +_util_sparse_array_node_data(uintptr_t handle) +{ + return (void *)(handle & NODE_PTR_MASK); +} + +static inline unsigned +_util_sparse_array_node_level(uintptr_t handle) +{ + return handle & NODE_LEVEL_MASK; +} + +static inline void +_util_sparse_array_node_finish(struct util_sparse_array *arr, + uintptr_t node) +{ + if (_util_sparse_array_node_level(node) > 0) { + uintptr_t *children = (uintptr_t *) _util_sparse_array_node_data(node); + size_t node_size = 1ull << arr->node_size_log2; + for (size_t i = 0; i < node_size; i++) { + if (children[i]) + _util_sparse_array_node_finish(arr, children[i]); + } + } + + os_free_aligned(_util_sparse_array_node_data(node)); +} + +static inline uintptr_t +_util_sparse_array_node(void *data, unsigned level) +{ + assert(data != NULL); + assert(((uintptr_t)data & NODE_LEVEL_MASK) == 0); + assert((level & NODE_PTR_MASK) == 0); + return (uintptr_t)data | level; +} + +inline uintptr_t +_util_sparse_array_node_alloc(struct util_sparse_array *arr, + unsigned level) +{ + size_t size; + if (level == 0) { + size = arr->elem_size << arr->node_size_log2; + } else { + size = sizeof(uintptr_t) << arr->node_size_log2; + } + + void *data = os_malloc_aligned(size, NODE_ALLOC_ALIGN); + memset(data, 0, size); + + return _util_sparse_array_node(data, level); +} + +static inline uintptr_t +_util_sparse_array_set_or_free_node(uintptr_t *node_ptr, + uintptr_t cmp_node, + uintptr_t node) +{ + uintptr_t prev_node = p_atomic_cmpxchg(node_ptr, cmp_node, node); + + if (prev_node != cmp_node) { + /* We lost the race. Free this one and return the one that was already + * allocated. 
+ */ + os_free_aligned(_util_sparse_array_node_data(node)); + return prev_node; + } else { + return node; + } +} + +void * +util_sparse_array_get(struct util_sparse_array *arr, uint64_t idx) +{ + const unsigned node_size_log2 = arr->node_size_log2; + uintptr_t root = p_atomic_read(&arr->root); + if (unlikely(!root)) { + unsigned root_level = 0; + uint64_t idx_iter = idx >> node_size_log2; + while (idx_iter) { + idx_iter >>= node_size_log2; + root_level++; + } + uintptr_t new_root = _util_sparse_array_node_alloc(arr, root_level); + root = _util_sparse_array_set_or_free_node(&arr->root, + NULL_NODE, new_root); + } + + while (1) { + unsigned root_level = _util_sparse_array_node_level(root); + uint64_t root_idx = idx >> (root_level * node_size_log2); + if (likely(root_idx < (1ull << node_size_log2))) + break; + + /* In this case, we have a root but its level is low enough that the + * requested index is out-of-bounds. + */ + uintptr_t new_root = _util_sparse_array_node_alloc(arr, root_level + 1); + + uintptr_t *new_root_children = (uintptr_t *) _util_sparse_array_node_data(new_root); + new_root_children[0] = root; + + /* We only add one at a time instead of the whole tree because it's + * easier to ensure correctness of both the tree building and the + * clean-up path. Because we're only adding one node we never have to + * worry about trying to free multiple things without freeing the old + * things. + */ + root = _util_sparse_array_set_or_free_node(&arr->root, root, new_root); + } + + void *node_data = _util_sparse_array_node_data(root); + unsigned node_level = _util_sparse_array_node_level(root); + while (node_level > 0) { + uint64_t child_idx = (idx >> (node_level * node_size_log2)) & + ((1ull << node_size_log2) - 1); + + uintptr_t *children = (uintptr_t *) node_data; + uintptr_t child = p_atomic_read(&children[child_idx]); + + if (unlikely(!child)) { + child = _util_sparse_array_node_alloc(arr, node_level - 1); + child = _util_sparse_array_set_or_free_node(&children[child_idx], + NULL_NODE, child); + } + + node_data = _util_sparse_array_node_data(child); + node_level = _util_sparse_array_node_level(child); + } + + uint64_t elem_idx = idx & ((1ull << node_size_log2) - 1); + return (void *)((char *)node_data + (elem_idx * arr->elem_size)); +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h new file mode 100644 index 0000000000000..b094b7b6347c6 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h @@ -0,0 +1,50 @@ +#pragma once + +#include +#include +#include + +#define unlikely(x) __builtin_expect(!!(x), 0) +#define likely(x) __builtin_expect(!!(x), 1) + +/** Checks is a value is a power of two. Does not handle zero. */ +#define IS_POT(v) (((v) & ((v) - 1)) == 0) + +/** Checks is a value is a power of two. Zero handled. 
*/ +#define IS_POT_NONZERO(v) ((v) != 0 && IS_POT(v)) + +/** Align a value to a power of two */ +#define ALIGN_POT(x, pot_align) (((x) + (pot_align) - 1) & ~((pot_align) - 1)) + +#define p_atomic_read(_v) __atomic_load_n((_v), __ATOMIC_ACQUIRE) + + +static inline bool +util_is_power_of_two_nonzero64(uint64_t v) +{ + return IS_POT_NONZERO(v); +} + +static inline uint64_t +align64(uint64_t value, uint64_t alignment) +{ + assert(util_is_power_of_two_nonzero64(alignment)); + return ALIGN_POT(value, alignment); +} + +struct list_head +{ + struct list_head *prev; + struct list_head *next; +}; + +struct util_sparse_array { + size_t elem_size; + unsigned node_size_log2; + + uintptr_t root; +}; + +void *util_sparse_array_get(struct util_sparse_array *arr, uint64_t idx); +void util_sparse_array_init(struct util_sparse_array *arr, + size_t elem_size, size_t node_size); diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp index f73be2767527d..408b34cba75e2 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include "virtgpu.h" @@ -30,23 +31,42 @@ virtgpu_init_shmem_blob_mem(struct virtgpu *gpu) gpu->shmem_blob_mem = VIRTGPU_BLOB_MEM_HOST3D; } +void breakpoint() { + // break here + INFO("BREAKPOINT HERE"); +} + void create_virtgpu() { struct virtgpu *gpu = new struct virtgpu(); + util_sparse_array_init(&gpu->shmem_array, sizeof(struct virtgpu_shmem), + 1024); + VkResult result = virtgpu_open(gpu); - GGML_ASSERT(result == VK_SUCCESS); + assert(result == VK_SUCCESS); result = virtgpu_init_params(gpu); - GGML_ASSERT(result == VK_SUCCESS); + assert(result == VK_SUCCESS); result = virtgpu_init_capset(gpu); - GGML_ASSERT(result == VK_SUCCESS); + assert(result == VK_SUCCESS); result = virtgpu_init_context(gpu); - GGML_ASSERT(result == VK_SUCCESS); + assert(result == VK_SUCCESS); virtgpu_init_shmem_blob_mem(gpu); + + struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, 16384); + + if (!shmem) { + INFO("failed to enumerate DRM devices"); + assert(false); + } else { + INFO("Created shm at %p", shmem); + } + + breakpoint(); } static VkResult @@ -262,13 +282,6 @@ virtgpu_init_params(struct virtgpu *gpu) return VK_SUCCESS; } - -static int -virtgpu_ioctl(struct virtgpu *gpu, unsigned long request, void *args) -{ - return drmIoctl(gpu->fd, request, args); -} - static int virtgpu_ioctl_context_init(struct virtgpu *gpu, enum virgl_renderer_capset capset_id) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.h b/ggml/src/ggml-remotingfrontend/virtgpu.h index 618fc5dc6e3b6..f7da4feaab08e 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu.h @@ -3,14 +3,17 @@ #include #include #include -#include #include #include #include #include #include -#include "ggml-remoting-frontend.h" +void breakpoint(); + +#include "virtgpu-shm.h" +#include "virtgpu-utils.h" + #define VIRGL_RENDERER_UNSTABLE_APIS 1 #include "drm-uapi/virtgpu_drm.h" #include "virglrenderer_hw.h" @@ -107,10 +110,7 @@ INFO(const char *format, ...) 
{ va_end(argptr); } - struct virtgpu { - //struct vn_renderer base; - struct remoting_dev_instance *instance; int fd; @@ -139,17 +139,23 @@ struct virtgpu { * res_id is monotonically increasing by default (see * virtio_gpu_resource_id_get) */ - //struct util_sparse_array shmem_array; + struct util_sparse_array shmem_array; // struct util_sparse_array bo_array; mtx_t dma_buf_import_mutex; -// struct vn_renderer_shmem_cache shmem_cache; + // struct virtgpu_shmem_cache shmem_cache; bool supports_cross_device; }; +static inline int +virtgpu_ioctl(struct virtgpu *gpu, unsigned long request, void *args) +{ + return drmIoctl(gpu->fd, request, args); +} + void create_virtgpu(); static VkResult virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev); static VkResult virtgpu_open(struct virtgpu *gpu); diff --git a/ggml/src/ggml-remotingfrontend/virtgpu_venus.c b/ggml/src/ggml-remotingfrontend/virtgpu_venus.c new file mode 100644 index 0000000000000..fc401c13d3003 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu_venus.c @@ -0,0 +1,209 @@ +static inline void vn_encode_vkEnumeratePhysicalDevices(struct vn_cs_encoder *enc, VkCommandFlagsEXT cmd_flags, VkInstance instance, uint32_t* pPhysicalDeviceCount, VkPhysicalDevice* pPhysicalDevices) +{ + const VkCommandTypeEXT cmd_type = VK_COMMAND_TYPE_vkEnumeratePhysicalDevices_EXT; + + vn_encode_VkCommandTypeEXT(enc, &cmd_type); + vn_encode_VkFlags(enc, &cmd_flags); + + vn_encode_VkInstance(enc, &instance); + if (vn_encode_simple_pointer(enc, pPhysicalDeviceCount)) + vn_encode_uint32_t(enc, pPhysicalDeviceCount); + if (pPhysicalDevices) { + vn_encode_array_size(enc, (pPhysicalDeviceCount ? *pPhysicalDeviceCount : 0)); + for (uint32_t i = 0; i < (pPhysicalDeviceCount ? *pPhysicalDeviceCount : 0); i++) + vn_encode_VkPhysicalDevice(enc, &pPhysicalDevices[i]); + } else { + vn_encode_array_size(enc, 0); + } +} + +static inline struct vn_cs_encoder * +vn_ring_submit_command_init(struct vn_ring *ring, + struct vn_ring_submit_command *submit, + void *cmd_data, + size_t cmd_size, + size_t reply_size) +{ + submit->buffer = VN_CS_ENCODER_BUFFER_INITIALIZER(cmd_data); + submit->command = VN_CS_ENCODER_INITIALIZER(&submit->buffer, cmd_size); + + submit->reply_size = reply_size; + submit->reply_shmem = NULL; + + submit->ring_seqno_valid = false; + + return &submit->command; +} + +static inline void vn_submit_vkEnumeratePhysicalDevices(struct vn_ring *vn_ring, VkCommandFlagsEXT cmd_flags, VkInstance instance, uint32_t* pPhysicalDeviceCount, VkPhysicalDevice* pPhysicalDevices, struct vn_ring_submit_command *submit) +{ + uint8_t local_cmd_data[VN_SUBMIT_LOCAL_CMD_SIZE]; + void *cmd_data = local_cmd_data; + size_t cmd_size = vn_sizeof_vkEnumeratePhysicalDevices(instance, pPhysicalDeviceCount, pPhysicalDevices); + if (cmd_size > sizeof(local_cmd_data)) { + cmd_data = malloc(cmd_size); + if (!cmd_data) + cmd_size = 0; + } + const size_t reply_size = cmd_flags & VK_COMMAND_GENERATE_REPLY_BIT_EXT ? 
vn_sizeof_vkEnumeratePhysicalDevices_reply(instance, pPhysicalDeviceCount, pPhysicalDevices) : 0; + + struct vn_cs_encoder *enc = vn_ring_submit_command_init(vn_ring, submit, cmd_data, cmd_size, reply_size); + if (cmd_size) { + vn_encode_vkEnumeratePhysicalDevices(enc, cmd_flags, instance, pPhysicalDeviceCount, pPhysicalDevices); + vn_ring_submit_command(vn_ring, submit); + if (cmd_data != local_cmd_data) + free(cmd_data); + } +} + +VkResult vn_call_vkEnumeratePhysicalDevices(struct vn_ring *vn_ring, VkInstance instance, uint32_t* pPhysicalDeviceCount, VkPhysicalDevice* pPhysicalDevices) +{ + VN_TRACE_FUNC(); + + struct vn_ring_submit_command submit; + vn_submit_vkEnumeratePhysicalDevices(vn_ring, VK_COMMAND_GENERATE_REPLY_BIT_EXT, instance, pPhysicalDeviceCount, pPhysicalDevices, &submit); + struct vn_cs_decoder *dec = vn_ring_get_command_reply(vn_ring, &submit); + if (dec) { + const VkResult ret = vn_decode_vkEnumeratePhysicalDevices_reply(dec, instance, pPhysicalDeviceCount, pPhysicalDevices); + vn_ring_free_command_reply(vn_ring, &submit); + return ret; + } else { + return VK_ERROR_OUT_OF_HOST_MEMORY; + } +} + +VkResult +vn_ring_submit_command_simple(struct vn_ring *ring, + const struct vn_cs_encoder *cs) +{ + mtx_lock(&ring->mutex); + VkResult result = vn_ring_submit_locked(ring, cs, NULL, NULL); + mtx_unlock(&ring->mutex); + + return result; +} + +static VkResult +vn_ring_submit_locked(struct vn_ring *ring, + const struct vn_cs_encoder *cs, + struct vn_renderer_shmem *extra_shmem, + uint32_t *ring_seqno) +{ + const bool direct = vn_ring_submission_can_direct(ring, cs); + if (!direct && cs->storage_type == VN_CS_ENCODER_STORAGE_POINTER) { + cs = vn_ring_cs_upload_locked(ring, cs); + if (!cs) + return VK_ERROR_OUT_OF_HOST_MEMORY; + assert(cs->storage_type != VN_CS_ENCODER_STORAGE_POINTER); + } + + struct vn_ring_submission submit; + VkResult result = + vn_ring_submission_prepare(ring, &submit, cs, extra_shmem, direct); + if (result != VK_SUCCESS) + return result; + + uint32_t seqno; + const bool notify = + vn_ring_submit_internal(ring, submit.submit, submit.cs, &seqno); + if (notify) { + uint32_t notify_ring_data[8]; + struct vn_cs_encoder local_enc = VN_CS_ENCODER_INITIALIZER_LOCAL( + notify_ring_data, sizeof(notify_ring_data)); + vn_encode_vkNotifyRingMESA(&local_enc, 0, ring->id, seqno, 0); + vn_renderer_submit_simple(ring->instance->renderer, notify_ring_data, + vn_cs_encoder_get_len(&local_enc)); + } + + vn_ring_submission_cleanup(&submit); + + if (ring_seqno) + *ring_seqno = seqno; + + return VK_SUCCESS; +} + +static VkResult +vn_ring_submission_prepare(struct vn_ring *ring, + struct vn_ring_submission *submit, + const struct vn_cs_encoder *cs, + struct vn_renderer_shmem *extra_shmem, + bool direct) +{ + submit->cs = vn_ring_submission_get_cs(submit, cs, direct); + if (!submit->cs) + return VK_ERROR_OUT_OF_HOST_MEMORY; + + submit->submit = + vn_ring_submission_get_ring_submit(ring, cs, extra_shmem, direct); + if (!submit->submit) { + vn_ring_submission_cleanup(submit); + return VK_ERROR_OUT_OF_HOST_MEMORY; + } + + return VK_SUCCESS; +} + +static bool +vn_ring_submit_internal(struct vn_ring *ring, + struct vn_ring_submit *submit, + const struct vn_cs_encoder *cs, + uint32_t *seqno) +{ + /* write cs to the ring */ + assert(!vn_cs_encoder_is_empty(cs)); + + /* avoid -Wmaybe-unitialized */ + uint32_t cur_seqno = 0; + + for (uint32_t i = 0; i < cs->buffer_count; i++) { + const struct vn_cs_encoder_buffer *buf = &cs->buffers[i]; + cur_seqno = vn_ring_wait_space(ring, 
buf->committed_size); + vn_ring_write_buffer(ring, buf->base, buf->committed_size); + } + + vn_ring_store_tail(ring); + const VkRingStatusFlagsMESA status = vn_ring_load_status(ring); + if (status & VK_RING_STATUS_FATAL_BIT_MESA) { + vn_log(NULL, "vn_ring_submit abort on fatal"); + abort(); + } + + vn_ring_retire_submits(ring, cur_seqno); + + submit->seqno = ring->cur; + list_addtail(&submit->head, &ring->submits); + + *seqno = submit->seqno; + + /* Notify renderer to wake up idle ring if at least VN_RING_IDLE_TIMEOUT_NS + * has passed since the last sent notification to avoid excessive wake up + * calls (non-trivial since submitted via virtio-gpu kernel). + */ + if (status & VK_RING_STATUS_IDLE_BIT_MESA) { + const int64_t now = os_time_get_nano(); + if (os_time_timeout(ring->last_notify, ring->next_notify, now)) { + ring->last_notify = now; + ring->next_notify = now + VN_RING_IDLE_TIMEOUT_NS; + return true; + } + } + return false; +} + +static void +vn_ring_write_buffer(struct vn_ring *ring, const void *data, uint32_t size) +{ + assert(ring->cur + size - vn_ring_load_head(ring) <= ring->buffer_size); + + const uint32_t offset = ring->cur & ring->buffer_mask; + if (offset + size <= ring->buffer_size) { + memcpy(ring->shared.buffer + offset, data, size); + } else { + const uint32_t s = ring->buffer_size - offset; + memcpy(ring->shared.buffer + offset, data, s); + memcpy(ring->shared.buffer, data + s, size - s); + } + + ring->cur += size; +} From a87e39888c4c58e2dc00e721bdb4acea2de20684 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Fri, 11 Apr 2025 09:19:24 +0200 Subject: [PATCH 011/117] run.remoting: cleanup the screen before running --- run.remoting.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run.remoting.sh b/run.remoting.sh index c6fbdaac435a5..b7175a78aab4c 100755 --- a/run.remoting.sh +++ b/run.remoting.sh @@ -1,5 +1,5 @@ #! /bin/bash - +clear if [[ ${1:-} == "gdb" ]]; then prefix="gdb --args" else From 3d7b19d64e12a0cc3c732904671a251aff1f8ac8 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Fri, 11 Apr 2025 09:19:40 +0200 Subject: [PATCH 012/117] Reduce the verbose logging --- src/llama-context.cpp | 4 ++-- src/llama-model-loader.cpp | 5 +++-- src/llama-model.cpp | 5 +++-- src/llama-vocab.cpp | 2 ++ 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 4735e98ea040f..8144ba4ebeae7 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -96,7 +96,7 @@ llama_context::llama_context( cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? 
params.n_batch : params.n_ubatch); const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; - +/* LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max); LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq); @@ -106,7 +106,7 @@ llama_context::llama_context( LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn); LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); - +*/ if (n_ctx_per_seq < hparams.n_ctx_train) { LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n", __func__, n_ctx_per_seq, hparams.n_ctx_train); diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 36f8d1cbf0323..bb8b090950072 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -651,7 +651,7 @@ llama_model_loader::llama_model_loader( } } - LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); + //LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); for (int i = 0; i < n_kv; i++) { const char * name = gguf_get_key(meta.get(), i); @@ -677,7 +677,7 @@ llama_model_loader::llama_model_loader( continue; } - LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); + //LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); } } @@ -1119,6 +1119,7 @@ std::string llama_model_loader::ftype_name() const { } void llama_model_loader::print_info() const { + return; LLAMA_LOG_INFO("%s: file format = %s\n", __func__, llama_file_version_name(fver)); LLAMA_LOG_INFO("%s: file type = %s\n", __func__, llama_model_ftype_name(ftype).c_str()); if (n_bytes < GiB) { diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 9e4166a71c641..a431c81996bc9 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1458,12 +1458,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) { auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev { const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il); if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) { - LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa); + //LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa); return {cpu_dev, &pimpl->cpu_buft_list}; } const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin(); auto * dev = devices.at(layer_gpu); - LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa); + //LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa); return {dev, &pimpl->gpu_buft_list.at(dev)}; }; @@ -4144,6 +4144,7 @@ uint64_t llama_model::n_elements() const { } void llama_model::print_info() const { + return; const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train); auto print_f = [](const std::function & f, uint32_t n) { diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index a9c24e78812ac..f454e2aa895b5 100644 --- a/src/llama-vocab.cpp +++ 
b/src/llama-vocab.cpp @@ -2731,6 +2731,7 @@ int32_t llama_vocab::impl::detokenize( } void llama_vocab::impl::print_info() const { + return; LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, type_name().c_str()); LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, vocab.n_tokens()); LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size()); @@ -3055,6 +3056,7 @@ std::string llama_vocab::detokenize(const std::vector & tokens, boo } void llama_vocab::print_info() const { + return; pimpl->print_info(); } From 4419c955deeb2708e988f93974d67b9b32779f1b Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Fri, 2 May 2025 09:27:46 +0200 Subject: [PATCH 013/117] Trace the executionpath --- ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp index f027860407a4e..bd1568add1752 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp @@ -53,7 +53,7 @@ virtgpu_ioctl_map(struct virtgpu *gpu, uint32_t gem_handle, size_t size) .handle = gem_handle, .pad = 0, }; - printf("Say hello world\n"); + printf("virtgpu_ioctl_map(%ld)\n", size); if (virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_MAP, &args)) return NULL; @@ -61,7 +61,7 @@ virtgpu_ioctl_map(struct virtgpu *gpu, uint32_t gem_handle, size_t size) args.offset); if (ptr == MAP_FAILED) return NULL; - + printf("virtgpu_ioctl_map(%ld) --> %p | %p\n", size, ptr, *(void **)ptr); return ptr; } From a25b672a5b737c43f7bda05865ca9fe560119ab4 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Fri, 2 May 2025 09:28:42 +0200 Subject: [PATCH 014/117] virtgpu: abort early --- ggml/src/ggml-remotingfrontend/virtgpu.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp index 408b34cba75e2..c8be37bc57301 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -3,6 +3,8 @@ #include #include +#include + #include "virtgpu.h" static inline void @@ -31,9 +33,13 @@ virtgpu_init_shmem_blob_mem(struct virtgpu *gpu) gpu->shmem_blob_mem = VIRTGPU_BLOB_MEM_HOST3D; } -void breakpoint() { +void *something = NULL; +void breakpoint () { // break here INFO("BREAKPOINT HERE"); + if (!something) { // avoid the [[noreturn]] detection mechanism + exit(0); + } } void From 5febf22ee49e55a2d56c43c3420e4fbdc231c6da Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Fri, 2 May 2025 09:29:10 +0200 Subject: [PATCH 015/117] virtgpu: add the virtgpu_submit to kick a command on the host --- .../src/ggml-remotingfrontend/virtgpu-utils.h | 12 +++ ggml/src/ggml-remotingfrontend/virtgpu.cpp | 86 +++++++++++++++++++ ggml/src/ggml-remotingfrontend/virtgpu.h | 1 + 3 files changed, 99 insertions(+) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h index b094b7b6347c6..7bea1798f0ebb 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h @@ -3,6 +3,9 @@ #include #include #include +#include +#include +#include #define unlikely(x) __builtin_expect(!!(x), 0) #define likely(x) __builtin_expect(!!(x), 1) @@ -48,3 +51,12 @@ struct util_sparse_array { void *util_sparse_array_get(struct util_sparse_array *arr, uint64_t idx); void util_sparse_array_init(struct util_sparse_array *arr, size_t elem_size, size_t node_size); + +inline 
void +os_time_sleep(int64_t usecs) +{ + struct timespec time; + time.tv_sec = usecs / 1000000; + time.tv_nsec = (usecs % 1000000) * 1000; + while (clock_nanosleep(CLOCK_MONOTONIC, 0, &time, &time) == EINTR); +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp index c8be37bc57301..55722e6eb8fa0 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -72,6 +72,8 @@ create_virtgpu() { INFO("Created shm at %p", shmem); } + virtgpu_submit(gpu, shmem); + breakpoint(); } @@ -347,3 +349,87 @@ virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param) const int ret = virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_GETPARAM, &args); return ret ? 0 : val; } + + + +#define PK_COMMAND_TYPE_pkCreateThread 255 + +static int virtgpu_submit(struct virtgpu *gpu, struct vn_renderer_shmem *shmem) +{ + + /* + * Data passed to the host + */ + int32_t command[3]; + // command identifier + command[0] = PK_COMMAND_TYPE_pkCreateThread; + command[1] = 0; // ? + // arguments + command[2] = shmem->res_id; + + /* + * Reply notification pointer + */ + + volatile std::atomic_uint *atomic_reply_notif = (volatile std::atomic_uint *) shmem->mmap_ptr; + *atomic_reply_notif = 0; + + /* + * Trigger the execbuf ioctl + */ + + struct drm_virtgpu_execbuffer args = { + .flags = VIRTGPU_EXECBUF_RING_IDX, + .size = sizeof(command), + .command = (uintptr_t) &command, + + .bo_handles = 0, + .num_bo_handles = 0, + + .fence_fd = 0, + .ring_idx = 0, + .syncobj_stride = 0, + .num_in_syncobjs = 0, + .num_out_syncobjs = 0, + .in_syncobjs = 0, + .out_syncobjs = 0, + }; + + int ret = drmIoctl(gpu->fd, DRM_IOCTL_VIRTGPU_EXECBUFFER, &args); + + /* + * Wait for the response notification + */ + + int resp = std::atomic_load_explicit(atomic_reply_notif, std::memory_order_acquire); + printf("waiting for the response ... 
| %d | %p\n", resp, (void*) atomic_reply_notif); + + while (std::atomic_load_explicit(atomic_reply_notif, std::memory_order_acquire) == 0) { + int64_t base_sleep_us = 160; + + os_time_sleep(base_sleep_us); + } + printf("got the response!\n"); + /* + * Read the reply + */ + + printf("virtgpu_submit() --> 0x%x\n", ((uint32_t *)shmem->mmap_ptr)[1]); + printf("virtgpu_submit() --> 0x%x\n", ((uint32_t *)shmem->mmap_ptr)[2]); + printf("virtgpu_submit() --> 0x%x\n", ((uint32_t *)shmem->mmap_ptr)[3]); + +#if 0 + VkCommandTypeEXT command_type; + vn_decode_VkCommandTypeEXT(dec, &command_type); + assert(command_type == VK_COMMAND_TYPE_vkEnumerateInstanceVersion_EXT); + VkResult ret; + vn_decode_VkResult(dec, &ret); + if (vn_decode_simple_pointer(dec)) { + vn_decode_uint32_t(dec, pApiVersion); + } else { + pApiVersion = NULL; + } +#endif + + return ret; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.h b/ggml/src/ggml-remotingfrontend/virtgpu.h index f7da4feaab08e..66c40a05b4909 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu.h @@ -175,3 +175,4 @@ virtgpu_ioctl_get_caps(struct virtgpu *gpu, size_t capset_size); static uint64_t virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param); static void virtgpu_init_renderer_info(struct virtgpu *gpu); +static int virtgpu_submit(struct virtgpu *gpu, struct vn_renderer_shmem *shmem); From 2ee2a4d4acf1b02e226e1c2dc01c91ce7860fd8f Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Fri, 2 May 2025 12:09:58 +0200 Subject: [PATCH 016/117] podman_compile.sh: add compile helper --- podman_compile.sh | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100755 podman_compile.sh diff --git a/podman_compile.sh b/podman_compile.sh new file mode 100755 index 0000000000000..47e4baee07037 --- /dev/null +++ b/podman_compile.sh @@ -0,0 +1,34 @@ +#! /bin/bash + + +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace + +opts="" +opts="$opts --device /dev/dri " +echo "Running with the GPU passthrough" + +image=localhost/pytorch:remoting + +what=${1:-} +if [[ -z "$what" ]]; then + what=remoting +fi + +cmd="bash ./build.$what.sh" + +set -x +podman run \ +--name mac_ai_compiling \ +--user root:root \ +--cgroupns host \ +--security-opt label=disable \ +--env HOME="$HOME" \ +-v "$HOME":"$HOME":Z \ +-w "$PWD" \ +-it --rm \ +$opts \ +$image \ +$cmd From 847a2adff36abbc6ecdabeff5ae01145c8a26985 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 5 May 2025 16:07:57 +0200 Subject: [PATCH 017/117] virtgpu: move the logging functions to virtgpu-utils --- .../src/ggml-remotingfrontend/virtgpu-utils.h | 23 +++++++++++++++++++ ggml/src/ggml-remotingfrontend/virtgpu.h | 9 -------- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h index 7bea1798f0ebb..9d1589c9128ab 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h @@ -6,6 +6,9 @@ #include #include #include +#include +#include +#include #define unlikely(x) __builtin_expect(!!(x), 0) #define likely(x) __builtin_expect(!!(x), 1) @@ -21,6 +24,26 @@ #define p_atomic_read(_v) __atomic_load_n((_v), __ATOMIC_ACQUIRE) +inline void +INFO(const char *format, ...) { + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); +} + +inline void +FATAL(const char *format, ...) 
{ + fprintf(stderr, "FATAL: "); + + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); + exit(1); +} static inline bool util_is_power_of_two_nonzero64(uint64_t v) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.h b/ggml/src/ggml-remotingfrontend/virtgpu.h index 66c40a05b4909..03e9b97b84173 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu.h @@ -101,15 +101,6 @@ vn_log(struct remoting_dev_instance *instance, const char *format, ...) PRINTFLIKE(2, 3); -inline void -INFO(const char *format, ...) { - va_list argptr; - va_start(argptr, format); - vfprintf(stderr, format, argptr); - fprintf(stderr, "\n"); - va_end(argptr); -} - struct virtgpu { struct remoting_dev_instance *instance; From 3270cf9c399f1f9dfdf4a0d1bbbb552f9bbb45ea Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 5 May 2025 16:08:31 +0200 Subject: [PATCH 018/117] virtgpu: use venus CS functions --- .../src/ggml-remotingfrontend/virtgpu-types.h | 298 ++++++++++++++++++ ggml/src/ggml-remotingfrontend/virtgpu.cpp | 74 +++-- 2 files changed, 343 insertions(+), 29 deletions(-) create mode 100644 ggml/src/ggml-remotingfrontend/virtgpu-types.h diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-types.h b/ggml/src/ggml-remotingfrontend/virtgpu-types.h new file mode 100644 index 0000000000000..b0802ad634bcb --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-types.h @@ -0,0 +1,298 @@ +#pragma once +#include "virtgpu.h" + +struct vn_cs_encoder { + char* cur; + const char* end; +}; + +struct vn_cs_decoder { + const char* cur; + const char* end; +}; + +/* + * encode peek + */ + +static inline bool +vn_cs_decoder_peek_internal(const struct vn_cs_decoder *dec, + size_t size, + void *val, + size_t val_size) +{ + assert(val_size <= size); + + if (unlikely(size > (size_t) (dec->end - dec->cur))) { + FATAL("DECODER IS FULL :/"); + //vn_cs_decoder_set_fatal(dec); + memset(val, 0, val_size); + return false; + } + + /* we should not rely on the compiler to optimize away memcpy... */ + memcpy(val, dec->cur, val_size); + return true; +} + +static inline void +vn_cs_decoder_peek(const struct vn_cs_decoder *dec, + size_t size, + void *val, + size_t val_size) +{ + vn_cs_decoder_peek_internal(dec, size, val, val_size); +} + +/* + * read/write + */ + +static inline void +vn_cs_decoder_read(struct vn_cs_decoder *dec, + size_t size, + void *val, + size_t val_size) +{ + if (vn_cs_decoder_peek_internal(dec, size, val, val_size)) + dec->cur += size; +} + +static inline void +vn_cs_encoder_write(struct vn_cs_encoder *enc, + size_t size, + const void *val, + size_t val_size) +{ + assert(val_size <= size); + assert(size <= ((size_t) (enc->end - enc->cur))); + + /* we should not rely on the compiler to optimize away memcpy... 
*/ + memcpy(enc->cur, val, val_size); + enc->cur += size; +} + +/* + * encode/decode + */ + +static inline void +vn_decode(struct vn_cs_decoder *dec, size_t size, void *data, size_t data_size) +{ + assert(size % 4 == 0); + vn_cs_decoder_read(dec, size, data, data_size); +} + +static inline void +vn_encode(struct vn_cs_encoder *enc, size_t size, const void *data, size_t data_size) +{ + assert(size % 4 == 0); + /* TODO check if the generated code is optimal */ + vn_cs_encoder_write(enc, size, data, data_size); +} + +/* + * typed encode/decode + */ + +/* uint64_t */ + +static inline size_t +vn_sizeof_uint64_t(const uint64_t *val) +{ + assert(sizeof(*val) == 8); + return 8; +} + +static inline void +vn_encode_uint64_t(struct vn_cs_encoder *enc, const uint64_t *val) +{ + vn_encode(enc, 8, val, sizeof(*val)); +} + +static inline void +vn_decode_uint64_t(struct vn_cs_decoder *dec, uint64_t *val) +{ + vn_decode(dec, 8, val, sizeof(*val)); +} + +static inline size_t +vn_sizeof_uint64_t_array(const uint64_t *val, uint32_t count) +{ + assert(sizeof(*val) == 8); + const size_t size = sizeof(*val) * count; + assert(size >= count); + return size; +} + +static inline void +vn_encode_uint64_t_array(struct vn_cs_encoder *enc, const uint64_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_encode(enc, size, val, size); +} + +static inline void +vn_decode_uint64_t_array(struct vn_cs_decoder *dec, uint64_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_decode(dec, size, val, size); +} + +/* int32_t */ + +static inline size_t +vn_sizeof_int32_t(const int32_t *val) +{ + assert(sizeof(*val) == 4); + return 4; +} + +static inline void +vn_encode_int32_t(struct vn_cs_encoder *enc, const int32_t *val) +{ + vn_encode(enc, 4, val, sizeof(*val)); +} + +static inline void +vn_decode_int32_t(struct vn_cs_decoder *dec, int32_t *val) +{ + vn_decode(dec, 4, val, sizeof(*val)); +} + +static inline size_t +vn_sizeof_int32_t_array(const int32_t *val, uint32_t count) +{ + assert(sizeof(*val) == 4); + const size_t size = sizeof(*val) * count; + assert(size >= count); + return size; +} + +static inline void +vn_encode_int32_t_array(struct vn_cs_encoder *enc, const int32_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_encode(enc, size, val, size); +} + +static inline void +vn_decode_int32_t_array(struct vn_cs_decoder *dec, int32_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_decode(dec, size, val, size); +} + +/* array size (uint64_t) */ + +static inline size_t +vn_sizeof_array_size(uint64_t size) +{ + return vn_sizeof_uint64_t(&size); +} + +static inline void +vn_encode_array_size(struct vn_cs_encoder *enc, uint64_t size) +{ + vn_encode_uint64_t(enc, &size); +} + +static inline uint64_t +vn_decode_array_size(struct vn_cs_decoder *dec, uint64_t expected_size) +{ + uint64_t size; + vn_decode_uint64_t(dec, &size); + if (size != expected_size) { + FATAL("ENCODER IS FULL :/"); + //vn_cs_decoder_set_fatal(dec); + size = 0; + } + return size; +} + +static inline uint64_t +vn_decode_array_size_unchecked(struct vn_cs_decoder *dec) +{ + uint64_t size; + vn_decode_uint64_t(dec, &size); + return size; +} + +static inline uint64_t +vn_peek_array_size(struct vn_cs_decoder *dec) +{ + uint64_t size; + vn_cs_decoder_peek(dec, sizeof(size), &size, sizeof(size)); + return size; +} + +/* non-array pointer */ + +static inline size_t 
+vn_sizeof_simple_pointer(const void *val) +{ + return vn_sizeof_array_size(val ? 1 : 0); +} + +static inline bool +vn_encode_simple_pointer(struct vn_cs_encoder *enc, const void *val) +{ + vn_encode_array_size(enc, val ? 1 : 0); + return val; +} + +static inline bool +vn_decode_simple_pointer(struct vn_cs_decoder *dec) +{ + return vn_decode_array_size_unchecked(dec); +} + +/* uint32_t */ + +static inline size_t +vn_sizeof_uint32_t(const uint32_t *val) +{ + assert(sizeof(*val) == 4); + return 4; +} + +static inline void +vn_encode_uint32_t(struct vn_cs_encoder *enc, const uint32_t *val) +{ + vn_encode(enc, 4, val, sizeof(*val)); +} + +static inline void +vn_decode_uint32_t(struct vn_cs_decoder *dec, uint32_t *val) +{ + vn_decode(dec, 4, val, sizeof(*val)); +} + +static inline size_t +vn_sizeof_uint32_t_array(const uint32_t *val, uint32_t count) +{ + assert(sizeof(*val) == 4); + const size_t size = sizeof(*val) * count; + assert(size >= count); + return size; +} + +static inline void +vn_encode_uint32_t_array(struct vn_cs_encoder *enc, const uint32_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_encode(enc, size, val, size); +} + +static inline void +vn_decode_uint32_t_array(struct vn_cs_decoder *dec, uint32_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_decode(dec, size, val, size); +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp index 55722e6eb8fa0..37bf98a6e8bb5 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -6,6 +6,7 @@ #include #include "virtgpu.h" +#include "virtgpu-types.h" static inline void virtgpu_init_shmem_blob_mem(struct virtgpu *gpu) @@ -358,14 +359,31 @@ static int virtgpu_submit(struct virtgpu *gpu, struct vn_renderer_shmem *shmem) { /* - * Data passed to the host + * Prepare the command encoder buffer */ - int32_t command[3]; - // command identifier - command[0] = PK_COMMAND_TYPE_pkCreateThread; - command[1] = 0; // ? - // arguments - command[2] = shmem->res_id; + + char encoder_buffer[4096]; + + struct vn_cs_encoder _encoder = { + encoder_buffer, + encoder_buffer + sizeof(encoder_buffer), + }; + struct vn_cs_encoder *encoder = &_encoder; + + /* + * Fill the command encoder buffer + */ + + /* VkCommandTypeEXT is int32_t */ + int32_t cmd_type = PK_COMMAND_TYPE_pkCreateThread; + vn_encode_int32_t(encoder, &cmd_type); + int32_t cmd_flags = 0x0; + vn_encode_int32_t(encoder, &cmd_flags); + + uint32_t reply_res_id = shmem->res_id; + vn_encode_uint32_t(encoder, &reply_res_id); + + printf("call pkCreateThread(flags=0x%x, reply_buf=%d)\n", cmd_flags, reply_res_id); /* * Reply notification pointer @@ -380,8 +398,8 @@ static int virtgpu_submit(struct virtgpu *gpu, struct vn_renderer_shmem *shmem) struct drm_virtgpu_execbuffer args = { .flags = VIRTGPU_EXECBUF_RING_IDX, - .size = sizeof(command), - .command = (uintptr_t) &command, + .size = sizeof(encoder_buffer), + .command = (uintptr_t) encoder_buffer, .bo_handles = 0, .num_bo_handles = 0, @@ -401,35 +419,33 @@ static int virtgpu_submit(struct virtgpu *gpu, struct vn_renderer_shmem *shmem) * Wait for the response notification */ - int resp = std::atomic_load_explicit(atomic_reply_notif, std::memory_order_acquire); - printf("waiting for the response ... 
| %d | %p\n", resp, (void*) atomic_reply_notif); - while (std::atomic_load_explicit(atomic_reply_notif, std::memory_order_acquire) == 0) { int64_t base_sleep_us = 160; os_time_sleep(base_sleep_us); } - printf("got the response!\n"); + /* * Read the reply */ - printf("virtgpu_submit() --> 0x%x\n", ((uint32_t *)shmem->mmap_ptr)[1]); - printf("virtgpu_submit() --> 0x%x\n", ((uint32_t *)shmem->mmap_ptr)[2]); - printf("virtgpu_submit() --> 0x%x\n", ((uint32_t *)shmem->mmap_ptr)[3]); - -#if 0 - VkCommandTypeEXT command_type; - vn_decode_VkCommandTypeEXT(dec, &command_type); - assert(command_type == VK_COMMAND_TYPE_vkEnumerateInstanceVersion_EXT); - VkResult ret; - vn_decode_VkResult(dec, &ret); - if (vn_decode_simple_pointer(dec)) { - vn_decode_uint32_t(dec, pApiVersion); - } else { - pApiVersion = NULL; - } -#endif + struct vn_cs_decoder _dec = { + .cur = (char *) shmem->mmap_ptr + sizeof(*atomic_reply_notif), + .end = (char *) shmem->mmap_ptr + shmem->mmap_size, + }; + struct vn_cs_decoder *dec = &_dec; + + uint32_t apiVersion; + vn_decode_uint32_t(dec, &apiVersion); + printf("pkCreateThread() --> 0x%x\n", apiVersion); + vn_decode_uint32_t(dec, &apiVersion); + printf("pkCreateThread() --> 0x%x\n", apiVersion); + vn_decode_uint32_t(dec, &apiVersion); + printf("pkCreateThread() --> 0x%x\n", apiVersion); + + int32_t vk_ret; + vn_decode_int32_t(dec, &vk_ret); + printf("pkCreateThread() --> ret=%d\n", vk_ret); return ret; } From 151c0ae893564b88a512ec21cfc6fd41cc7e3d46 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 6 May 2025 10:02:22 +0200 Subject: [PATCH 019/117] virtgpu: make more generic --- ggml/src/ggml-remotingfrontend/virtgpu.cpp | 63 +++++++++++----------- ggml/src/ggml-remotingfrontend/virtgpu.h | 19 ++++++- 2 files changed, 49 insertions(+), 33 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp index 37bf98a6e8bb5..e251b29577616 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -35,9 +35,9 @@ virtgpu_init_shmem_blob_mem(struct virtgpu *gpu) } void *something = NULL; -void breakpoint () { +void thks_bye () { // break here - INFO("BREAKPOINT HERE"); + INFO("thks bye, stopping early."); if (!something) { // avoid the [[noreturn]] detection mechanism exit(0); } @@ -64,18 +64,17 @@ create_virtgpu() { virtgpu_init_shmem_blob_mem(gpu); - struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, 16384); + gpu->reply_shmem = virtgpu_shmem_create(gpu, 16384); - if (!shmem) { - INFO("failed to enumerate DRM devices"); + if (!gpu->reply_shmem) { + FATAL("%s: failed to create the reply shared memory page :/", __func__); assert(false); - } else { - INFO("Created shm at %p", shmem); } - virtgpu_submit(gpu, shmem); + remote_call(gpu, PK_COMMAND_TYPE_LoadLibrary, 0); + remote_call(gpu, PK_COMMAND_TYPE_SayHello, 0); - breakpoint(); + thks_bye(); } static VkResult @@ -352,10 +351,11 @@ virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param) } - -#define PK_COMMAND_TYPE_pkCreateThread 255 - -static int virtgpu_submit(struct virtgpu *gpu, struct vn_renderer_shmem *shmem) +static int remote_call( + struct virtgpu *gpu, + int32_t cmd_type, + int32_t cmd_flags + ) { /* @@ -375,21 +375,25 @@ static int virtgpu_submit(struct virtgpu *gpu, struct vn_renderer_shmem *shmem) */ /* VkCommandTypeEXT is int32_t */ - int32_t cmd_type = PK_COMMAND_TYPE_pkCreateThread; vn_encode_int32_t(encoder, &cmd_type); - int32_t cmd_flags = 0x0; vn_encode_int32_t(encoder, &cmd_flags); - uint32_t 
reply_res_id = shmem->res_id;
+  if (!gpu->reply_shmem) {
+    FATAL("%s: the reply shmem page can't be null", __func__);
+  }
+
+  uint32_t reply_res_id = gpu->reply_shmem->res_id;
   vn_encode_uint32_t(encoder, &reply_res_id);
 
-  printf("call pkCreateThread(flags=0x%x, reply_buf=%d)\n", cmd_flags, reply_res_id);
+  printf("%s: call %s(flags=0x%x, reply_buf=%d)\n", __func__,
+	 command_name(cmd_type),
+	 cmd_flags, reply_res_id);
 
   /*
    * Reply notification pointer
    */
 
-  volatile std::atomic_uint *atomic_reply_notif = (volatile std::atomic_uint *) shmem->mmap_ptr;
+  volatile std::atomic_uint *atomic_reply_notif = (volatile std::atomic_uint *) gpu->reply_shmem->mmap_ptr;
   *atomic_reply_notif = 0;
 
   /*
@@ -415,6 +419,9 @@ static int remote_call(
 
   int ret = drmIoctl(gpu->fd, DRM_IOCTL_VIRTGPU_EXECBUFFER, &args);
 
+  if (ret != 0) {
+    FATAL("%s: the virtgpu EXECBUFFER ioctl failed (%d) :/ \n", __func__, ret);
+  }
   /*
    * Wait for the response notification
    */
@@ -430,22 +437,16 @@ static int remote_call(
    */
 
   struct vn_cs_decoder _dec = {
-    .cur = (char *) shmem->mmap_ptr + sizeof(*atomic_reply_notif),
-    .end = (char *) shmem->mmap_ptr + shmem->mmap_size,
+    .cur = (char *) gpu->reply_shmem->mmap_ptr + sizeof(*atomic_reply_notif),
+    .end = (char *) gpu->reply_shmem->mmap_ptr + gpu->reply_shmem->mmap_size,
   };
   struct vn_cs_decoder *dec = &_dec;
 
-  uint32_t apiVersion;
-  vn_decode_uint32_t(dec, &apiVersion);
-  printf("pkCreateThread() --> 0x%x\n", apiVersion);
-  vn_decode_uint32_t(dec, &apiVersion);
-  printf("pkCreateThread() --> 0x%x\n", apiVersion);
-  vn_decode_uint32_t(dec, &apiVersion);
-  printf("pkCreateThread() --> 0x%x\n", apiVersion);
+  int32_t rmt_call_ret;
+  vn_decode_int32_t(dec, &rmt_call_ret);
 
-  int32_t vk_ret;
-  vn_decode_int32_t(dec, &vk_ret);
-  printf("pkCreateThread() --> ret=%d\n", vk_ret);
+  printf("%s: call %s() --> %d\n", __func__,
+	 command_name(cmd_type), rmt_call_ret);
 
-  return ret;
+  return rmt_call_ret;
 }
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.h b/ggml/src/ggml-remotingfrontend/virtgpu.h
index 03e9b97b84173..f3249207d85ad 100644
--- a/ggml/src/ggml-remotingfrontend/virtgpu.h
+++ b/ggml/src/ggml-remotingfrontend/virtgpu.h
@@ -9,7 +9,7 @@
 #include 
 #include 
 
-void breakpoint();
+void thks_bye();
 
 #include "virtgpu-shm.h"
 #include "virtgpu-utils.h"
@@ -138,6 +138,9 @@ struct virtgpu {
   // struct virtgpu_shmem_cache shmem_cache;
 
   bool supports_cross_device;
+
+  /* KP */
+  struct vn_renderer_shmem *reply_shmem;
 };
 
 
@@ -166,4 +169,16 @@ virtgpu_ioctl_get_caps(struct virtgpu *gpu,
 		       size_t capset_size);
 static uint64_t virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param);
 static void virtgpu_init_renderer_info(struct virtgpu *gpu);
-static int virtgpu_submit(struct virtgpu *gpu, struct vn_renderer_shmem *shmem);
+static int remote_call(struct virtgpu *gpu, int32_t cmd_type, int32_t cmd_flags);
+
+#define PK_COMMAND_TYPE_LoadLibrary 255
+#define PK_COMMAND_TYPE_SayHello 256
+
+static inline const char *command_name(int32_t type)
+{
+  switch (type) {
+  case PK_COMMAND_TYPE_LoadLibrary: return "LoadLibrary";
+  case PK_COMMAND_TYPE_SayHello: return "SayHello";
+  default: return "unknown";
+  }
+}

From 52d8e4220cf5b2d2489e55b1ba3bf79efd03a063 Mon Sep 17 00:00:00 2001
From: Kevin Pouget 
Date: Tue, 6 May 2025 10:28:28 +0200
Subject: [PATCH 020/117] ggml-remotingfrontend: fix and make more generic

---
 ggml/CMakeLists.txt                                 |  2 +-
 ggml/include/ggml-remoting-frontend.h               |  4 ++--
 ggml/src/ggml-backend-reg.cpp                       | 
2 +- ggml/src/ggml-remotingfrontend/CMakeLists.txt | 10 +--------- ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp | 6 +++--- ggml/src/ggml-remotingfrontend/ggml-backend.cpp | 2 +- .../ggml-remotingfrontend/ggml-host-buffer-type.cpp | 4 ++-- 7 files changed, 11 insertions(+), 19 deletions(-) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 24c47aea122a2..6db2c2ee3f2f5 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -270,7 +270,7 @@ set(GGML_PUBLIC_HEADERS include/ggml-rpc.h include/ggml-sycl.h include/ggml-vulkan.h - include/ggml-remoting-frontend.h + ggml/include/ggml-remoting-frontend.h include/gguf.h) set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") diff --git a/ggml/include/ggml-remoting-frontend.h b/ggml/include/ggml-remoting-frontend.h index c32c283820dea..4c7cd585ea4af 100644 --- a/ggml/include/ggml-remoting-frontend.h +++ b/ggml/include/ggml-remoting-frontend.h @@ -7,9 +7,9 @@ extern "C" { #endif -#define GGML_REMOTING_NAME "RemotingFrontend" +#define GGML_REMOTING_FRONTEND_NAME "RemotingFrontend" -GGML_BACKEND_API ggml_backend_reg_t ggml_backend_remoting_reg(); +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_remoting_frontend_reg(); #ifdef __cplusplus } diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 8ed3c36362bcd..45843e5ad190a 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -177,7 +177,7 @@ struct ggml_backend_registry { register_backend(ggml_backend_vk_reg()); #endif #ifdef GGML_USE_REMOTINGFRONTEND - register_backend(ggml_backend_remoting_reg()); + register_backend(ggml_backend_remoting_frontend_reg()); #endif #ifdef GGML_USE_OPENCL register_backend(ggml_backend_opencl_reg()); diff --git a/ggml/src/ggml-remotingfrontend/CMakeLists.txt b/ggml/src/ggml-remotingfrontend/CMakeLists.txt index 778fddd89a164..678623f972fc1 100644 --- a/ggml/src/ggml-remotingfrontend/CMakeLists.txt +++ b/ggml/src/ggml-remotingfrontend/CMakeLists.txt @@ -1,15 +1,7 @@ cmake_minimum_required(VERSION 3.19) cmake_policy(SET CMP0114 NEW) -# function(detect_host_compiler) -# find_program(HOST_C_COMPILER NAMES gcc clang NO_CMAKE_FIND_ROOT_PATH) -# find_program(HOST_CXX_COMPILER NAMES g++ clang++ NO_CMAKE_FIND_ROOT_PATH) - -# set(HOST_C_COMPILER "${HOST_C_COMPILER}" PARENT_SCOPE) -# set(HOST_CXX_COMPILER "${HOST_CXX_COMPILER}" PARENT_SCOPE) -# endfunction() - -message(STATUS "Enable API Remoting frontend found") +message(STATUS "Enable API Remoting frontend") ggml_add_backend_library(ggml-remotingfrontend ggml-backend-buffer.cpp diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp index 00dddf23f2898..cb77a31a037c8 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp @@ -29,7 +29,7 @@ static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_ char desc[256] = "API Remoting device"; ctx->device = i; - ctx->name = GGML_REMOTING_NAME + std::to_string(i); + ctx->name = GGML_REMOTING_FRONTEND_NAME + std::to_string(i); ctx->description = desc; devices.push_back(new ggml_backend_device { /* .iface = */ ggml_backend_remoting_device_i, @@ -47,7 +47,7 @@ static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_ static const char * ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) { UNUSED(reg); - return GGML_REMOTING_NAME; + return GGML_REMOTING_FRONTEND_NAME; } static const struct ggml_backend_reg_i 
ggml_backend_remoting_reg_i = { @@ -57,7 +57,7 @@ static const struct ggml_backend_reg_i ggml_backend_remoting_reg_i = { /* .get_proc_address = */ NULL, }; -ggml_backend_reg_t ggml_backend_remoting_reg() { +ggml_backend_reg_t ggml_backend_remoting_frontend_reg() { static ggml_backend_reg reg = { /* .api_version = */ GGML_BACKEND_API_VERSION, /* .iface = */ ggml_backend_remoting_reg_i, diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp index 2618e48929cba..aac17a762ff9b 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp @@ -46,7 +46,7 @@ ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const c ggml_backend_t remoting_backend = new ggml_backend { /* .guid = */ ggml_backend_remoting_guid(), /* .interface = */ ggml_backend_remoting_interface, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_remoting_reg(), ctx->device), + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_remoting_frontend_reg(), ctx->device), /* .context = */ ctx, }; diff --git a/ggml/src/ggml-remotingfrontend/ggml-host-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-host-buffer-type.cpp index b40c72b8d1e8b..fbf5569788c40 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-host-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-host-buffer-type.cpp @@ -3,7 +3,7 @@ // host buffer type static const char * ggml_backend_remoting_host_buffer_type_name(ggml_backend_buffer_type_t buft) { - return GGML_REMOTING_NAME "_Host"; + return GGML_REMOTING_FRONTEND_NAME "_Host"; UNUSED(buft); } @@ -43,7 +43,7 @@ ggml_backend_buffer_type_t ggml_backend_remoting_host_buffer_type() { /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, }, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_remoting_reg(), 0), + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_remoting_frontend_reg(), 0), /* .context = */ nullptr, }; From d118515e0600a4cbb38d61cc6201d59e7d3f933f Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 6 May 2025 10:29:30 +0200 Subject: [PATCH 021/117] prepare.backend.sh: helper script --- prepare.backend.sh | 5 +++++ 1 file changed, 5 insertions(+) create mode 100755 prepare.backend.sh diff --git a/prepare.backend.sh b/prepare.backend.sh new file mode 100755 index 0000000000000..a51f2465b6733 --- /dev/null +++ b/prepare.backend.sh @@ -0,0 +1,5 @@ +cmake -S . -B ../build.remoting-backend \ + -DGGML_REMOTINGBACKEND=ON \ + -DGGML_NATIVE=OFF \ + -DCMAKE_BUILD_TYPE=Debug \ + "$@" From a54229d40b8f675f03a639036447f01adf1a2796 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 6 May 2025 10:30:00 +0200 Subject: [PATCH 022/117] build.backend.sh: helper script --- build.backend.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100755 build.backend.sh diff --git a/build.backend.sh b/build.backend.sh new file mode 100755 index 0000000000000..b32c24b9ba035 --- /dev/null +++ b/build.backend.sh @@ -0,0 +1,13 @@ +# force isatty-->true, so that $0 |& head -50 has colors ... +rm -f READY_backend FAILED_backend + +echo "int isatty(int fd) { return 1; }" | gcc -O2 -fpic -shared -ldl -o /tmp/isatty.so -xc - +export LD_PRELOAD=/tmp/isatty.so + +cmake --build ../build.remoting-backend --parallel 8 --target llama-cli "$@" + +if [[ $? 
== 0 ]]; then + touch READY_backend +else + touch FAILED_backend +fi From 78d16d047b985c3062e4fe045b2491bd695b5e0b Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 6 May 2025 10:28:10 +0200 Subject: [PATCH 023/117] build: integrate the remoting-backend skeleton --- CMakePresets.json | 1 + Makefile | 8 ++ ggml/CMakeLists.txt | 2 + ggml/include/ggml-remoting-backend.h | 16 ++++ ggml/src/CMakeLists.txt | 1 + ggml/src/ggml-backend-reg.cpp | 7 ++ ggml/src/ggml-remotingbackend/CMakeLists.txt | 11 +++ .../ggml-remotingbackend/backend-internal.h | 30 +++++++ ggml/src/ggml-remotingbackend/backend.cpp | 78 +++++++++++++++++++ 9 files changed, 154 insertions(+) create mode 100644 ggml/include/ggml-remoting-backend.h create mode 100644 ggml/src/ggml-remotingbackend/CMakeLists.txt create mode 100644 ggml/src/ggml-remotingbackend/backend-internal.h create mode 100644 ggml/src/ggml-remotingbackend/backend.cpp diff --git a/CMakePresets.json b/CMakePresets.json index c5369a47f6bf9..5296aae76e74a 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -31,6 +31,7 @@ { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } }, { "name": "vulkan", "hidden": true, "cacheVariables": { "GGML_VULKAN": "ON" } }, { "name": "remoting_frontend", "hidden": true, "cacheVariables": { "GGML_REMOTING_FRONTEND": "ON" } }, + { "name": "remoting_backend", "hidden": true, "cacheVariables": { "GGML_REMOTING_BACKEND": "ON" } }, { "name": "x64-windows-llvm", "hidden": true, diff --git a/Makefile b/Makefile index ebf9f79ed5598..18d73ae9de685 100644 --- a/Makefile +++ b/Makefile @@ -721,6 +721,11 @@ ifdef GGML_REMOTING_FRONTEND OBJ_GGML_EXT += ggml/src/ggml-remotingfrontend/ggml-remoting-frontend.o endif +ifdef GGML_REMOTING_BACKEND + MK_CPPFLAGS += -DGGML_USE_REMOTINGBACKEND + OBJ_GGML_EXT += ggml/src/ggml-remotingbackend/ggml-remoting-backend.o +endif + ifdef GGML_VULKAN MK_CPPFLAGS += -DGGML_USE_VULKAN MK_LDFLAGS += $(shell pkg-config --libs vulkan) @@ -763,6 +768,9 @@ ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan/ggml-vulkan.cpp ggml/include/ggml-v ggml/src/ggml-remotingfrontend/frontend.o: ggml/src/ggml-remotingfrontend/frontend.cpp $(CXX) $(CXXFLAGS) -c $< -o $@ +ggml/src/ggml-remotingbackend/backend.o: ggml/src/ggml-remotingbackend/backend.cpp + $(CXX) $(CXXFLAGS) -c $< -o $@ + $(_ggml_vk_header): $(_ggml_vk_source) $(_ggml_vk_source): $(_ggml_vk_shader_deps) vulkan-shaders-gen diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 6db2c2ee3f2f5..9d7576c911635 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -180,6 +180,7 @@ option(GGML_VULKAN_PERF "ggml: enable Vulkan perf output" option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF) option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF) option(GGML_REMOTING_FRONTEND "ggml: use the API Remoting frontend" OFF) +option(GGML_REMOTING_BACKEND "ggml: use the API Remoting backend" OFF) option(GGML_KOMPUTE "ggml: use Kompute" OFF) option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT}) option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF) @@ -271,6 +272,7 @@ set(GGML_PUBLIC_HEADERS include/ggml-sycl.h include/ggml-vulkan.h ggml/include/ggml-remoting-frontend.h + ggml/include/ggml-remoting-backend.h include/gguf.h) set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") diff --git a/ggml/include/ggml-remoting-backend.h b/ggml/include/ggml-remoting-backend.h new file mode 100644 index 0000000000000..25a9dc269c957 --- /dev/null +++ b/ggml/include/ggml-remoting-backend.h @@ -0,0 +1,16 
@@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define GGML_REMOTING_BACKEND_NAME "RemotingBackend" + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_remoting_backend_reg(); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 76c3f3d27fc16..63f36e67a00bb 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -310,6 +310,7 @@ ggml_add_backend(RPC) ggml_add_backend(SYCL) ggml_add_backend(Vulkan) ggml_add_backend(RemotingFrontend) +ggml_add_backend(RemotingBackend) ggml_add_backend(OpenCL) foreach (target ggml-base ggml) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 45843e5ad190a..7e6d4f8c36f67 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -49,6 +49,10 @@ #include "ggml-remoting-frontend.h" #endif +#ifdef GGML_USE_REMOTINGBACKEND +#include "ggml-remoting-backend.h" +#endif + #ifdef GGML_USE_OPENCL #include "ggml-opencl.h" #endif @@ -179,6 +183,9 @@ struct ggml_backend_registry { #ifdef GGML_USE_REMOTINGFRONTEND register_backend(ggml_backend_remoting_frontend_reg()); #endif +#ifdef GGML_USE_REMOTINGBACKEND + register_backend(ggml_backend_remoting_backend_reg()); +#endif #ifdef GGML_USE_OPENCL register_backend(ggml_backend_opencl_reg()); #endif diff --git a/ggml/src/ggml-remotingbackend/CMakeLists.txt b/ggml/src/ggml-remotingbackend/CMakeLists.txt new file mode 100644 index 0000000000000..70b8d3a1b7fef --- /dev/null +++ b/ggml/src/ggml-remotingbackend/CMakeLists.txt @@ -0,0 +1,11 @@ +cmake_minimum_required(VERSION 3.19) +cmake_policy(SET CMP0114 NEW) + +message(STATUS "Enable API Remoting backend") + +ggml_add_backend_library(ggml-remotingbackend + backend.cpp + ../../include/ggml-remoting-backend.h + ) + +target_compile_options(ggml-remotingbackend PRIVATE -std=c++20) diff --git a/ggml/src/ggml-remotingbackend/backend-internal.h b/ggml/src/ggml-remotingbackend/backend-internal.h new file mode 100644 index 0000000000000..97e9605b0dadb --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-internal.h @@ -0,0 +1,30 @@ +#include +#include + +static inline void LOG(const char* fmt, ...) { + va_list args; + va_start(args, fmt); + vprintf(fmt, args); + va_end(args); + + printf("\n"); +} + +static inline void FATAL(const char* fmt, ...) 
{ + printf("FATAL: "); + va_list args; + va_start(args, fmt); + vprintf(fmt, args); + va_end(args); + + printf("\n"); + + if (!fmt) + return; // avoid the noreturn attribute + + exit(1); +} + +extern "C" { + void ggml_backend_remoting_backend_say_hello(); +} diff --git a/ggml/src/ggml-remotingbackend/backend.cpp b/ggml/src/ggml-remotingbackend/backend.cpp new file mode 100644 index 0000000000000..ccc3b3a3aa136 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend.cpp @@ -0,0 +1,78 @@ +#include +#include + +#include "ggml-remoting-backend.h" + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" + +#include "backend-internal.h" + +#define UNUSED GGML_UNUSED + +static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) { + UNUSED(reg); + return 0; +} + +static const char * ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) { + UNUSED(reg); + return GGML_REMOTING_BACKEND_NAME; +} + +static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) { + UNUSED(reg); + UNUSED(device); + + return NULL; +} + +static const struct ggml_backend_reg_i ggml_backend_remoting_reg_i = { + /* .get_name = */ ggml_backend_remoting_reg_get_name, + /* .get_device_count = */ ggml_backend_remoting_reg_get_device_count, + /* .get_device = */ ggml_backend_remoting_reg_get_device, + /* .get_proc_address = */ NULL, +}; + +ggml_backend_reg_t ggml_backend_remoting_backend_reg() { + static ggml_backend_reg reg = { + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_remoting_reg_i, + /* .context = */ nullptr, + }; + + LOG("%s, hello :wave:", __func__); + + return ® +} + +typedef ggml_backend_reg_t (*backend_reg_fct_t)(void); + +#define METAL_LIBRARY_PATH "/Users/kevinpouget/remoting/llama_cpp/build.remoting-backend/bin/libggml-metal.dylib" +#define ENTRYPOINT_FCT_NAME "ggml_backend_metal_reg" + +extern "C" { + void ggml_backend_remoting_backend_say_hello() { + LOG("%s: hello :wave: \\o/", __func__); + + void * library_handle = dlopen(METAL_LIBRARY_PATH, RTLD_LAZY); + + if (!library_handle) { + FATAL("Cannot open library: %s\n", dlerror()); + return; + } + + backend_reg_fct_t entrypoint_fct = (backend_reg_fct_t) dlsym(library_handle, ENTRYPOINT_FCT_NAME); + const char* dlsym_error = dlerror(); + if (dlsym_error) { + FATAL("Cannot load symbol: %s\n", dlsym_error); + return; + } + + ggml_backend_reg_t reg = entrypoint_fct(); + LOG("%s: --> %s", __func__, reg->iface.get_name(reg)); + + dlclose(library_handle); + } +} From 022ddceaf7ded722b6c2d3afadf3163abbcb9ff8 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 6 May 2025 15:07:22 +0200 Subject: [PATCH 024/117] remoting: start using shared header files --- .../src/ggml-remotingbackend/shared/api_remoting.h | 13 +++++++++++++ ggml/src/ggml-remotingfrontend/virtgpu.cpp | 8 ++++---- ggml/src/ggml-remotingfrontend/virtgpu.h | 14 ++------------ 3 files changed, 19 insertions(+), 16 deletions(-) create mode 100644 ggml/src/ggml-remotingbackend/shared/api_remoting.h diff --git a/ggml/src/ggml-remotingbackend/shared/api_remoting.h b/ggml/src/ggml-remotingbackend/shared/api_remoting.h new file mode 100644 index 0000000000000..0cac78cccdfda --- /dev/null +++ b/ggml/src/ggml-remotingbackend/shared/api_remoting.h @@ -0,0 +1,13 @@ + +#define VIRGL_VK_COMMAND_TYPE_LoadLibrary 255 +#define VIRGL_VK_COMMAND_TYPE_Forward 256 + + +static inline const char *api_remoting_command_name(int32_t type) +{ + switch (type) { + case VIRGL_VK_COMMAND_TYPE_LoadLibrary: 
return "LoadLibrary"; + case VIRGL_VK_COMMAND_TYPE_Forward: return "Forward"; + default: return "unknown"; + } +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp index e251b29577616..bc20c90cb36c2 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -71,8 +71,8 @@ create_virtgpu() { assert(false); } - remote_call(gpu, PK_COMMAND_TYPE_LoadLibrary, 0); - remote_call(gpu, PK_COMMAND_TYPE_SayHello, 0); + remote_call(gpu, VIRGL_VK_COMMAND_TYPE_LoadLibrary, 0); + remote_call(gpu, VIRGL_VK_COMMAND_TYPE_Forward, 12346); thks_bye(); } @@ -386,7 +386,7 @@ static int remote_call( vn_encode_uint32_t(encoder, &reply_res_id); printf("%s: call %s(flags=0x%x, reply_buf=%d)\n", __func__, - command_name(cmd_type), + api_remoting_command_name(cmd_type), cmd_flags, reply_res_id); /* @@ -446,7 +446,7 @@ static int remote_call( vn_decode_int32_t(dec, &rmt_call_ret); printf("%s: call %s() --> %d\n", __func__, - command_name(cmd_type), rmt_call_ret); + api_remoting_command_name(cmd_type), rmt_call_ret); return rmt_call_ret; } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.h b/ggml/src/ggml-remotingfrontend/virtgpu.h index f3249207d85ad..bfd0dc9c82b15 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu.h @@ -9,6 +9,8 @@ #include #include +#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/api_remoting.h" + void thks_bye(); #include "virtgpu-shm.h" @@ -170,15 +172,3 @@ virtgpu_ioctl_get_caps(struct virtgpu *gpu, static uint64_t virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param); static void virtgpu_init_renderer_info(struct virtgpu *gpu); static int remote_call(struct virtgpu *gpu, int32_t cmd_type, int32_t cmd_flags); - -#define PK_COMMAND_TYPE_LoadLibrary 255 -#define PK_COMMAND_TYPE_SayHello 256 - -static inline const char *command_name(int32_t type) -{ - switch (type) { - case PK_COMMAND_TYPE_LoadLibrary: return "LoadLibrary"; - case PK_COMMAND_TYPE_SayHello: return "SayHello"; - default: return "unknown"; - } -} From 9bde80bab7e1d19b61cce3646c1cf0bb779a2146 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 6 May 2025 17:54:58 +0200 Subject: [PATCH 025/117] remotingbackend/CMakeLists: add header dependencies --- ggml/src/ggml-remotingbackend/CMakeLists.txt | 3 + .../shared/apir_backend.h | 21 ++ .../ggml-remotingbackend/shared/venus_cs.h | 301 ++++++++++++++++++ 3 files changed, 325 insertions(+) create mode 100644 ggml/src/ggml-remotingbackend/shared/apir_backend.h create mode 100644 ggml/src/ggml-remotingbackend/shared/venus_cs.h diff --git a/ggml/src/ggml-remotingbackend/CMakeLists.txt b/ggml/src/ggml-remotingbackend/CMakeLists.txt index 70b8d3a1b7fef..420e283fc8359 100644 --- a/ggml/src/ggml-remotingbackend/CMakeLists.txt +++ b/ggml/src/ggml-remotingbackend/CMakeLists.txt @@ -5,6 +5,9 @@ message(STATUS "Enable API Remoting backend") ggml_add_backend_library(ggml-remotingbackend backend.cpp + shared/api_remoting.h + shared/apir_backend.h + shared/venus_cs.h ../../include/ggml-remoting-backend.h ) diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h new file mode 100644 index 0000000000000..8506ffa46b759 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -0,0 +1,21 @@ +#pragma once + +#define APIR_LIBRARY_PATH "/Users/kevinpouget/remoting/llama_cpp/build.remoting-backend/bin/libggml-remotingbackend.dylib" +#define 
APIR_INITIALIZE_FCT_NAME "apir_backend_initialize" +#define APIR_DEINIT_FCT_NAME "apir_backend_deinit" +#define APIR_DISPATCH_FCT_NAME "apir_backend_dispatcher" + +#define APIR_BACKEND_INITIALIZE_SUCCESSS 0 +#define APIR_BACKEND_INITIALIZE_CANNOT_OPEN_BACKEND_LIBRARY 1 +#define APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY 2 +#define APIR_BACKEND_INITIALIZE_MISSING_BACKEND_SYMBOLS 3 +#define APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS 4 + +typedef uint32_t (*apir_backend_initialize_t)(void); +typedef void (*apir_backend_deinit_t)(void); + +typedef uint32_t (*apir_backend_dispatch_t)(uint32_t cmd_type, + char *dec_cur, const char *dec_end, + char *enc_cur, const char *enc_end, + char **enc_cur_after + ); diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h new file mode 100644 index 0000000000000..d9397c6d5d647 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -0,0 +1,301 @@ +#pragma once + +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +struct vn_cs_encoder { + char* cur; + const char *start; + const char* end; +}; + +struct vn_cs_decoder { + const char* cur; + const char* end; +}; + +/* + * encode peek + */ + +static inline bool +vn_cs_decoder_peek_internal(const struct vn_cs_decoder *dec, + size_t size, + void *val, + size_t val_size) +{ + assert(val_size <= size); + + if (unlikely(size > (size_t) (dec->end - dec->cur))) { + FATAL("DECODER IS FULL :/"); + //vn_cs_decoder_set_fatal(dec); + memset(val, 0, val_size); + return false; + } + + /* we should not rely on the compiler to optimize away memcpy... */ + memcpy(val, dec->cur, val_size); + return true; +} + +static inline void +vn_cs_decoder_peek(const struct vn_cs_decoder *dec, + size_t size, + void *val, + size_t val_size) +{ + vn_cs_decoder_peek_internal(dec, size, val, val_size); +} + +/* + * read/write + */ + +static inline void +vn_cs_decoder_read(struct vn_cs_decoder *dec, + size_t size, + void *val, + size_t val_size) +{ + if (vn_cs_decoder_peek_internal(dec, size, val, val_size)) + dec->cur += size; +} + +static inline void +vn_cs_encoder_write(struct vn_cs_encoder *enc, + size_t size, + const void *val, + size_t val_size) +{ + assert(val_size <= size); + assert(size <= ((size_t) (enc->end - enc->cur))); + + /* we should not rely on the compiler to optimize away memcpy... 
*/ + memcpy(enc->cur, val, val_size); + enc->cur += size; +} + +/* + * encode/decode + */ + +static inline void +vn_decode(struct vn_cs_decoder *dec, size_t size, void *data, size_t data_size) +{ + assert(size % 4 == 0); + vn_cs_decoder_read(dec, size, data, data_size); +} + +static inline void +vn_encode(struct vn_cs_encoder *enc, size_t size, const void *data, size_t data_size) +{ + assert(size % 4 == 0); + /* TODO check if the generated code is optimal */ + vn_cs_encoder_write(enc, size, data, data_size); +} + +/* + * typed encode/decode + */ + +/* uint64_t */ + +static inline size_t +vn_sizeof_uint64_t(const uint64_t *val) +{ + assert(sizeof(*val) == 8); + return 8; +} + +static inline void +vn_encode_uint64_t(struct vn_cs_encoder *enc, const uint64_t *val) +{ + vn_encode(enc, 8, val, sizeof(*val)); +} + +static inline void +vn_decode_uint64_t(struct vn_cs_decoder *dec, uint64_t *val) +{ + vn_decode(dec, 8, val, sizeof(*val)); +} + +static inline size_t +vn_sizeof_uint64_t_array(const uint64_t *val, uint32_t count) +{ + assert(sizeof(*val) == 8); + const size_t size = sizeof(*val) * count; + assert(size >= count); + return size; +} + +static inline void +vn_encode_uint64_t_array(struct vn_cs_encoder *enc, const uint64_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_encode(enc, size, val, size); +} + +static inline void +vn_decode_uint64_t_array(struct vn_cs_decoder *dec, uint64_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_decode(dec, size, val, size); +} + +/* int32_t */ + +static inline size_t +vn_sizeof_int32_t(const int32_t *val) +{ + assert(sizeof(*val) == 4); + return 4; +} + +static inline void +vn_encode_int32_t(struct vn_cs_encoder *enc, const int32_t *val) +{ + vn_encode(enc, 4, val, sizeof(*val)); +} + +static inline void +vn_decode_int32_t(struct vn_cs_decoder *dec, int32_t *val) +{ + vn_decode(dec, 4, val, sizeof(*val)); +} + +static inline size_t +vn_sizeof_int32_t_array(const int32_t *val, uint32_t count) +{ + assert(sizeof(*val) == 4); + const size_t size = sizeof(*val) * count; + assert(size >= count); + return size; +} + +static inline void +vn_encode_int32_t_array(struct vn_cs_encoder *enc, const int32_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_encode(enc, size, val, size); +} + +static inline void +vn_decode_int32_t_array(struct vn_cs_decoder *dec, int32_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_decode(dec, size, val, size); +} + +/* array size (uint64_t) */ + +static inline size_t +vn_sizeof_array_size(uint64_t size) +{ + return vn_sizeof_uint64_t(&size); +} + +static inline void +vn_encode_array_size(struct vn_cs_encoder *enc, uint64_t size) +{ + vn_encode_uint64_t(enc, &size); +} + +static inline uint64_t +vn_decode_array_size(struct vn_cs_decoder *dec, uint64_t expected_size) +{ + uint64_t size; + vn_decode_uint64_t(dec, &size); + if (size != expected_size) { + FATAL("ENCODER IS FULL :/"); + //vn_cs_decoder_set_fatal(dec); + size = 0; + } + return size; +} + +static inline uint64_t +vn_decode_array_size_unchecked(struct vn_cs_decoder *dec) +{ + uint64_t size; + vn_decode_uint64_t(dec, &size); + return size; +} + +static inline uint64_t +vn_peek_array_size(struct vn_cs_decoder *dec) +{ + uint64_t size; + vn_cs_decoder_peek(dec, sizeof(size), &size, sizeof(size)); + return size; +} + +/* non-array pointer */ + +static inline size_t 
+vn_sizeof_simple_pointer(const void *val) +{ + return vn_sizeof_array_size(val ? 1 : 0); +} + +static inline bool +vn_encode_simple_pointer(struct vn_cs_encoder *enc, const void *val) +{ + vn_encode_array_size(enc, val ? 1 : 0); + return val; +} + +static inline bool +vn_decode_simple_pointer(struct vn_cs_decoder *dec) +{ + return vn_decode_array_size_unchecked(dec); +} + +/* uint32_t */ + +static inline size_t +vn_sizeof_uint32_t(const uint32_t *val) +{ + assert(sizeof(*val) == 4); + return 4; +} + +static inline void +vn_encode_uint32_t(struct vn_cs_encoder *enc, const uint32_t *val) +{ + vn_encode(enc, 4, val, sizeof(*val)); +} + +static inline void +vn_decode_uint32_t(struct vn_cs_decoder *dec, uint32_t *val) +{ + vn_decode(dec, 4, val, sizeof(*val)); +} + +static inline size_t +vn_sizeof_uint32_t_array(const uint32_t *val, uint32_t count) +{ + assert(sizeof(*val) == 4); + const size_t size = sizeof(*val) * count; + assert(size >= count); + return size; +} + +static inline void +vn_encode_uint32_t_array(struct vn_cs_encoder *enc, const uint32_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_encode(enc, size, val, size); +} + +static inline void +vn_decode_uint32_t_array(struct vn_cs_decoder *dec, uint32_t *val, uint32_t count) +{ + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_decode(dec, size, val, size); +} From b5ac3985205af7fa1dbc6648c3e1826a01019502 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 6 May 2025 17:55:35 +0200 Subject: [PATCH 026/117] ggml-remotingbackend: add skeleton of argument passing --- .../ggml-remotingbackend/backend-internal.h | 30 +++++-- ggml/src/ggml-remotingbackend/backend.cpp | 81 +++++++++++++++---- 2 files changed, 90 insertions(+), 21 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-internal.h b/ggml/src/ggml-remotingbackend/backend-internal.h index 97e9605b0dadb..e6c098ed95175 100644 --- a/ggml/src/ggml-remotingbackend/backend-internal.h +++ b/ggml/src/ggml-remotingbackend/backend-internal.h @@ -1,13 +1,24 @@ #include #include -static inline void LOG(const char* fmt, ...) { - va_list args; - va_start(args, fmt); - vprintf(fmt, args); - va_end(args); +static inline void INFO(const char* fmt, ...) { + printf("INFO: "); + va_list args; + va_start(args, fmt); + vprintf(fmt, args); + va_end(args); + + printf("\n"); +} - printf("\n"); +static inline void ERROR(const char* fmt, ...) { + printf("ERROR: "); + va_list args; + va_start(args, fmt); + vprintf(fmt, args); + va_end(args); + + printf("\n"); } static inline void FATAL(const char* fmt, ...) { @@ -26,5 +37,10 @@ static inline void FATAL(const char* fmt, ...) 
{ } extern "C" { - void ggml_backend_remoting_backend_say_hello(); + uint32_t apir_backend_initialize(); + void apir_backend_deinit(void); + uint32_t apir_backend_dispatcher(uint32_t cmd_type, + char *dec_cur, const char *dec_end, + char *enc_cur, const char *enc_end, + char **enc_cur_after); } diff --git a/ggml/src/ggml-remotingbackend/backend.cpp b/ggml/src/ggml-remotingbackend/backend.cpp index ccc3b3a3aa136..d858b033e3c9d 100644 --- a/ggml/src/ggml-remotingbackend/backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend.cpp @@ -8,6 +8,8 @@ #include "ggml-backend.h" #include "backend-internal.h" +#include "shared/apir_backend.h" +#include "shared/venus_cs.h" #define UNUSED GGML_UNUSED @@ -42,37 +44,88 @@ ggml_backend_reg_t ggml_backend_remoting_backend_reg() { /* .context = */ nullptr, }; - LOG("%s, hello :wave:", __func__); + INFO("%s, hello :wave:", __func__); return ® } typedef ggml_backend_reg_t (*backend_reg_fct_t)(void); -#define METAL_LIBRARY_PATH "/Users/kevinpouget/remoting/llama_cpp/build.remoting-backend/bin/libggml-metal.dylib" -#define ENTRYPOINT_FCT_NAME "ggml_backend_metal_reg" +#define GGML_BACKEND_METAL_LIBRARY_PATH "/Users/kevinpouget/remoting/llama_cpp/build.remoting-backend/bin/libggml-metal.dylib" +#define GGML_BACKEND_METAL_REG_FCT_NAME "ggml_backend_metal_reg" + +static void *backend_library_handle = NULL; extern "C" { - void ggml_backend_remoting_backend_say_hello() { - LOG("%s: hello :wave: \\o/", __func__); + void apir_backend_deinit(void) { + if (backend_library_handle) { + INFO("%s: The GGML backend library was loaded. Unloading it.", __func__); + dlclose(backend_library_handle); + } + + INFO("%s: bye-bye", __func__); + } - void * library_handle = dlopen(METAL_LIBRARY_PATH, RTLD_LAZY); + uint32_t apir_backend_initialize() { + INFO("%s: hello :wave: \\o/", __func__); - if (!library_handle) { - FATAL("Cannot open library: %s\n", dlerror()); - return; + backend_library_handle = dlopen(GGML_BACKEND_METAL_LIBRARY_PATH, RTLD_LAZY); + + if (!backend_library_handle) { + ERROR("Cannot open library: %s\n", dlerror()); + + return APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY; } - backend_reg_fct_t entrypoint_fct = (backend_reg_fct_t) dlsym(library_handle, ENTRYPOINT_FCT_NAME); + backend_reg_fct_t entrypoint_fct = (backend_reg_fct_t) dlsym(backend_library_handle, GGML_BACKEND_METAL_REG_FCT_NAME); const char* dlsym_error = dlerror(); if (dlsym_error) { - FATAL("Cannot load symbol: %s\n", dlsym_error); - return; + ERROR("Cannot load symbol: %s\n", dlsym_error); + + return APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS; } ggml_backend_reg_t reg = entrypoint_fct(); - LOG("%s: --> %s", __func__, reg->iface.get_name(reg)); + INFO("%s: --> %s", __func__, reg->iface.get_name(reg)); - dlclose(library_handle); + return APIR_BACKEND_INITIALIZE_SUCCESSS; + } + + uint32_t apir_backend_dispatcher(uint32_t cmd_type, + char *dec_cur, const char *dec_end, + char *enc_cur, const char *enc_end, + char **enc_cur_after) { + INFO("%s: --> %d | %p | %p ", __func__, cmd_type, dec_cur, enc_cur); + + struct vn_cs_encoder _enc = { + .cur = enc_cur, + .end = enc_end, + }; + struct vn_cs_encoder *enc = &_enc; + + struct vn_cs_decoder _dec = { + .cur = dec_cur, + .end = dec_end, + }; + struct vn_cs_decoder *dec = &_dec; + + int32_t arg1, arg2, arg3; + vn_decode_int32_t(dec, &arg1); + vn_decode_int32_t(dec, &arg2); + vn_decode_int32_t(dec, &arg3); + + INFO("%s: ARGS %d %d %d\n", __func__, arg1, arg2, arg3); + + int32_t resp1 = 1; + int32_t resp2 = 2; + int32_t resp3 = 3; + int32_t resp4 = 4; + 
vn_encode_int32_t(enc, &resp1); + vn_encode_int32_t(enc, &resp2); + vn_encode_int32_t(enc, &resp3); + vn_encode_int32_t(enc, &resp4); + *enc_cur_after = enc->cur; + + return 0; } } From 0cdcdd269906b17b1c881a79813e05897485935a Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 6 May 2025 17:56:02 +0200 Subject: [PATCH 027/117] remotingfrontend: improve the typing --- .../src/ggml-remotingfrontend/virtgpu-types.h | 298 ------------------ ggml/src/ggml-remotingfrontend/virtgpu.cpp | 85 +++-- ggml/src/ggml-remotingfrontend/virtgpu.h | 80 +---- .../src/ggml-remotingfrontend/virtgpu_venus.c | 209 ------------ 4 files changed, 65 insertions(+), 607 deletions(-) delete mode 100644 ggml/src/ggml-remotingfrontend/virtgpu-types.h delete mode 100644 ggml/src/ggml-remotingfrontend/virtgpu_venus.c diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-types.h b/ggml/src/ggml-remotingfrontend/virtgpu-types.h deleted file mode 100644 index b0802ad634bcb..0000000000000 --- a/ggml/src/ggml-remotingfrontend/virtgpu-types.h +++ /dev/null @@ -1,298 +0,0 @@ -#pragma once -#include "virtgpu.h" - -struct vn_cs_encoder { - char* cur; - const char* end; -}; - -struct vn_cs_decoder { - const char* cur; - const char* end; -}; - -/* - * encode peek - */ - -static inline bool -vn_cs_decoder_peek_internal(const struct vn_cs_decoder *dec, - size_t size, - void *val, - size_t val_size) -{ - assert(val_size <= size); - - if (unlikely(size > (size_t) (dec->end - dec->cur))) { - FATAL("DECODER IS FULL :/"); - //vn_cs_decoder_set_fatal(dec); - memset(val, 0, val_size); - return false; - } - - /* we should not rely on the compiler to optimize away memcpy... */ - memcpy(val, dec->cur, val_size); - return true; -} - -static inline void -vn_cs_decoder_peek(const struct vn_cs_decoder *dec, - size_t size, - void *val, - size_t val_size) -{ - vn_cs_decoder_peek_internal(dec, size, val, val_size); -} - -/* - * read/write - */ - -static inline void -vn_cs_decoder_read(struct vn_cs_decoder *dec, - size_t size, - void *val, - size_t val_size) -{ - if (vn_cs_decoder_peek_internal(dec, size, val, val_size)) - dec->cur += size; -} - -static inline void -vn_cs_encoder_write(struct vn_cs_encoder *enc, - size_t size, - const void *val, - size_t val_size) -{ - assert(val_size <= size); - assert(size <= ((size_t) (enc->end - enc->cur))); - - /* we should not rely on the compiler to optimize away memcpy... 
*/ - memcpy(enc->cur, val, val_size); - enc->cur += size; -} - -/* - * encode/decode - */ - -static inline void -vn_decode(struct vn_cs_decoder *dec, size_t size, void *data, size_t data_size) -{ - assert(size % 4 == 0); - vn_cs_decoder_read(dec, size, data, data_size); -} - -static inline void -vn_encode(struct vn_cs_encoder *enc, size_t size, const void *data, size_t data_size) -{ - assert(size % 4 == 0); - /* TODO check if the generated code is optimal */ - vn_cs_encoder_write(enc, size, data, data_size); -} - -/* - * typed encode/decode - */ - -/* uint64_t */ - -static inline size_t -vn_sizeof_uint64_t(const uint64_t *val) -{ - assert(sizeof(*val) == 8); - return 8; -} - -static inline void -vn_encode_uint64_t(struct vn_cs_encoder *enc, const uint64_t *val) -{ - vn_encode(enc, 8, val, sizeof(*val)); -} - -static inline void -vn_decode_uint64_t(struct vn_cs_decoder *dec, uint64_t *val) -{ - vn_decode(dec, 8, val, sizeof(*val)); -} - -static inline size_t -vn_sizeof_uint64_t_array(const uint64_t *val, uint32_t count) -{ - assert(sizeof(*val) == 8); - const size_t size = sizeof(*val) * count; - assert(size >= count); - return size; -} - -static inline void -vn_encode_uint64_t_array(struct vn_cs_encoder *enc, const uint64_t *val, uint32_t count) -{ - const size_t size = sizeof(*val) * count; - assert(size >= count); - vn_encode(enc, size, val, size); -} - -static inline void -vn_decode_uint64_t_array(struct vn_cs_decoder *dec, uint64_t *val, uint32_t count) -{ - const size_t size = sizeof(*val) * count; - assert(size >= count); - vn_decode(dec, size, val, size); -} - -/* int32_t */ - -static inline size_t -vn_sizeof_int32_t(const int32_t *val) -{ - assert(sizeof(*val) == 4); - return 4; -} - -static inline void -vn_encode_int32_t(struct vn_cs_encoder *enc, const int32_t *val) -{ - vn_encode(enc, 4, val, sizeof(*val)); -} - -static inline void -vn_decode_int32_t(struct vn_cs_decoder *dec, int32_t *val) -{ - vn_decode(dec, 4, val, sizeof(*val)); -} - -static inline size_t -vn_sizeof_int32_t_array(const int32_t *val, uint32_t count) -{ - assert(sizeof(*val) == 4); - const size_t size = sizeof(*val) * count; - assert(size >= count); - return size; -} - -static inline void -vn_encode_int32_t_array(struct vn_cs_encoder *enc, const int32_t *val, uint32_t count) -{ - const size_t size = sizeof(*val) * count; - assert(size >= count); - vn_encode(enc, size, val, size); -} - -static inline void -vn_decode_int32_t_array(struct vn_cs_decoder *dec, int32_t *val, uint32_t count) -{ - const size_t size = sizeof(*val) * count; - assert(size >= count); - vn_decode(dec, size, val, size); -} - -/* array size (uint64_t) */ - -static inline size_t -vn_sizeof_array_size(uint64_t size) -{ - return vn_sizeof_uint64_t(&size); -} - -static inline void -vn_encode_array_size(struct vn_cs_encoder *enc, uint64_t size) -{ - vn_encode_uint64_t(enc, &size); -} - -static inline uint64_t -vn_decode_array_size(struct vn_cs_decoder *dec, uint64_t expected_size) -{ - uint64_t size; - vn_decode_uint64_t(dec, &size); - if (size != expected_size) { - FATAL("ENCODER IS FULL :/"); - //vn_cs_decoder_set_fatal(dec); - size = 0; - } - return size; -} - -static inline uint64_t -vn_decode_array_size_unchecked(struct vn_cs_decoder *dec) -{ - uint64_t size; - vn_decode_uint64_t(dec, &size); - return size; -} - -static inline uint64_t -vn_peek_array_size(struct vn_cs_decoder *dec) -{ - uint64_t size; - vn_cs_decoder_peek(dec, sizeof(size), &size, sizeof(size)); - return size; -} - -/* non-array pointer */ - -static inline size_t 
-vn_sizeof_simple_pointer(const void *val) -{ - return vn_sizeof_array_size(val ? 1 : 0); -} - -static inline bool -vn_encode_simple_pointer(struct vn_cs_encoder *enc, const void *val) -{ - vn_encode_array_size(enc, val ? 1 : 0); - return val; -} - -static inline bool -vn_decode_simple_pointer(struct vn_cs_decoder *dec) -{ - return vn_decode_array_size_unchecked(dec); -} - -/* uint32_t */ - -static inline size_t -vn_sizeof_uint32_t(const uint32_t *val) -{ - assert(sizeof(*val) == 4); - return 4; -} - -static inline void -vn_encode_uint32_t(struct vn_cs_encoder *enc, const uint32_t *val) -{ - vn_encode(enc, 4, val, sizeof(*val)); -} - -static inline void -vn_decode_uint32_t(struct vn_cs_decoder *dec, uint32_t *val) -{ - vn_decode(dec, 4, val, sizeof(*val)); -} - -static inline size_t -vn_sizeof_uint32_t_array(const uint32_t *val, uint32_t count) -{ - assert(sizeof(*val) == 4); - const size_t size = sizeof(*val) * count; - assert(size >= count); - return size; -} - -static inline void -vn_encode_uint32_t_array(struct vn_cs_encoder *enc, const uint32_t *val, uint32_t count) -{ - const size_t size = sizeof(*val) * count; - assert(size >= count); - vn_encode(enc, size, val, size); -} - -static inline void -vn_decode_uint32_t_array(struct vn_cs_decoder *dec, uint32_t *val, uint32_t count) -{ - const size_t size = sizeof(*val) * count; - assert(size >= count); - vn_decode(dec, size, val, size); -} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp index bc20c90cb36c2..a88d07c8198fd 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -6,7 +6,6 @@ #include #include "virtgpu.h" -#include "virtgpu-types.h" static inline void virtgpu_init_shmem_blob_mem(struct virtgpu *gpu) @@ -37,7 +36,7 @@ virtgpu_init_shmem_blob_mem(struct virtgpu *gpu) void *something = NULL; void thks_bye () { // break here - INFO("thks bye, stopping early."); + INFO("thks bye, stopping early and happilly :)"); if (!something) { // avoid the [[noreturn]] detection mechanism exit(0); } @@ -50,17 +49,17 @@ create_virtgpu() { util_sparse_array_init(&gpu->shmem_array, sizeof(struct virtgpu_shmem), 1024); - VkResult result = virtgpu_open(gpu); - assert(result == VK_SUCCESS); + virt_gpu_result_t result = virtgpu_open(gpu); + assert(result == APIR_SUCCESS); result = virtgpu_init_params(gpu); - assert(result == VK_SUCCESS); + assert(result == APIR_SUCCESS); result = virtgpu_init_capset(gpu); - assert(result == VK_SUCCESS); + assert(result == APIR_SUCCESS); result = virtgpu_init_context(gpu); - assert(result == VK_SUCCESS); + assert(result == APIR_SUCCESS); virtgpu_init_shmem_blob_mem(gpu); @@ -71,26 +70,33 @@ create_virtgpu() { assert(false); } - remote_call(gpu, VIRGL_VK_COMMAND_TYPE_LoadLibrary, 0); - remote_call(gpu, VIRGL_VK_COMMAND_TYPE_Forward, 12346); - + uint32_t ret = remote_call(gpu, VIRGL_VK_COMMAND_TYPE_LoadLibrary, 0, 0, 0, 0); + if (ret != 0) { + FATAL("%s: failed to load the APIR backend libraries (code=%d):/", __func__, ret); + assert(false); + } + ret = remote_call(gpu, VIRGL_VK_COMMAND_TYPE_Forward, 0, 111, 555, 999); + if (ret != 0) { + FATAL("%s: failed to forard the API call (code=%d):/", __func__, ret); + assert(false); + } thks_bye(); } -static VkResult +static virt_gpu_result_t virtgpu_open(struct virtgpu *gpu) { drmDevicePtr devs[8]; int count = drmGetDevices2(0, devs, ARRAY_SIZE(devs)); if (count < 0) { INFO("failed to enumerate DRM devices"); - return VK_ERROR_INITIALIZATION_FAILED; + return 
APIR_ERROR_INITIALIZATION_FAILED; } - VkResult result = VK_ERROR_INITIALIZATION_FAILED; + virt_gpu_result_t result = APIR_ERROR_INITIALIZATION_FAILED; for (int i = 0; i < count; i++) { result = virtgpu_open_device(gpu, devs[i]); - if (result == VK_SUCCESS) + if (result == APIR_SUCCESS) break; } @@ -99,7 +105,7 @@ virtgpu_open(struct virtgpu *gpu) return result; } -static VkResult +static virt_gpu_result_t virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev) { bool supported_bus = false; @@ -128,7 +134,7 @@ virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev) } vn_log(gpu->instance, "skipping DRM device %s", name); } - return VK_ERROR_INITIALIZATION_FAILED; + return APIR_ERROR_INITIALIZATION_FAILED; } const char *primary_path = dev->nodes[DRM_NODE_PRIMARY]; @@ -138,7 +144,7 @@ virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev) if (fd < 0) { if (VN_DEBUG(INIT)) vn_log(gpu->instance, "failed to open %s", node_path); - return VK_ERROR_INITIALIZATION_FAILED; + return APIR_ERROR_INITIALIZATION_FAILED; } drmVersionPtr version = drmGetVersion(fd); @@ -155,7 +161,7 @@ virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev) if (version) drmFreeVersion(version); close(fd); - return VK_ERROR_INITIALIZATION_FAILED; + return APIR_ERROR_INITIALIZATION_FAILED; } gpu->fd = fd; @@ -183,7 +189,7 @@ virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev) if (VN_DEBUG(INIT)) vn_log(gpu->instance, "using DRM device %s", node_path); - return VK_SUCCESS; + return APIR_SUCCESS; } void @@ -202,9 +208,7 @@ vn_log(struct remoting_dev_instance *instance, const char *format, ...) /* instance may be NULL or partially initialized */ } - - -static VkResult +static virt_gpu_result_t virtgpu_init_context(struct virtgpu *gpu) { assert(!gpu->capset.version); @@ -214,13 +218,13 @@ virtgpu_init_context(struct virtgpu *gpu) vn_log(gpu->instance, "failed to initialize context: %s", strerror(errno)); } - return VK_ERROR_INITIALIZATION_FAILED; + return APIR_ERROR_INITIALIZATION_FAILED; } - return VK_SUCCESS; + return APIR_SUCCESS; } -static VkResult +static virt_gpu_result_t virtgpu_init_capset(struct virtgpu *gpu) { gpu->capset.id = VIRGL_RENDERER_CAPSET_VENUS; @@ -234,13 +238,13 @@ virtgpu_init_capset(struct virtgpu *gpu) vn_log(gpu->instance, "failed to get venus v%d capset: %s", gpu->capset.version, strerror(errno)); } - return VK_ERROR_INITIALIZATION_FAILED; + return APIR_ERROR_INITIALIZATION_FAILED; } - return VK_SUCCESS; + return APIR_SUCCESS; } -static VkResult +static virt_gpu_result_t virtgpu_init_params(struct virtgpu *gpu) { const uint64_t required_params[] = { @@ -255,7 +259,7 @@ virtgpu_init_params(struct virtgpu *gpu) vn_log(gpu->instance, "required kernel param %d is missing", (int)required_params[i]); } - return VK_ERROR_INITIALIZATION_FAILED; + return APIR_ERROR_INITIALIZATION_FAILED; } } @@ -273,7 +277,7 @@ virtgpu_init_params(struct virtgpu *gpu) vn_log(gpu->instance, "one of required kernel params (%d or %d) is missing", (int)VIRTGPU_PARAM_HOST_VISIBLE, (int)VIRTGPU_PARAM_GUEST_VRAM); - return VK_ERROR_INITIALIZATION_FAILED; + return APIR_ERROR_INITIALIZATION_FAILED; } /* Cross-device feature is optional. 
It enables sharing dma-bufs @@ -287,7 +291,7 @@ virtgpu_init_params(struct virtgpu *gpu) /* implied by CONTEXT_INIT uapi */ gpu->max_timeline_count = 64; - return VK_SUCCESS; + return APIR_SUCCESS; } static int @@ -354,7 +358,8 @@ virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param) static int remote_call( struct virtgpu *gpu, int32_t cmd_type, - int32_t cmd_flags + int32_t cmd_flags, + int32_t arg1, int32_t arg2, int32_t arg3 ) { @@ -374,7 +379,6 @@ static int remote_call( * Fill the command encoder buffer */ - /* VkCommandTypeEXT is int32_t */ vn_encode_int32_t(encoder, &cmd_type); vn_encode_int32_t(encoder, &cmd_flags); @@ -389,6 +393,10 @@ static int remote_call( api_remoting_command_name(cmd_type), cmd_flags, reply_res_id); + vn_encode_int32_t(encoder, &arg1); + vn_encode_int32_t(encoder, &arg2); + vn_encode_int32_t(encoder, &arg3); + /* * Reply notification pointer */ @@ -442,9 +450,20 @@ static int remote_call( }; struct vn_cs_decoder *dec = &_dec; + int32_t resp1; + int32_t resp2; + int32_t resp3; + int32_t resp4; + vn_decode_int32_t(dec, &resp1); + vn_decode_int32_t(dec, &resp2); + vn_decode_int32_t(dec, &resp3); + vn_decode_int32_t(dec, &resp4); + int32_t rmt_call_ret; vn_decode_int32_t(dec, &rmt_call_ret); + printf("%s: RESP %d %d %d %d\n", __func__, resp1, resp2, resp3, resp4); + printf("%s: call %s() --> %d\n", __func__, api_remoting_command_name(cmd_type), rmt_call_ret); diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.h b/ggml/src/ggml-remotingfrontend/virtgpu.h index bfd0dc9c82b15..379a2174fc3db 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu.h @@ -9,12 +9,13 @@ #include #include -#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/api_remoting.h" +#include "virtgpu-utils.h" +#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/api_remoting.h" +#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/venus_cs.h" void thks_bye(); #include "virtgpu-shm.h" -#include "virtgpu-utils.h" #define VIRGL_RENDERER_UNSTABLE_APIS 1 #include "drm-uapi/virtgpu_drm.h" @@ -31,65 +32,10 @@ void thks_bye(); #define VN_DEBUG(what) true -typedef enum VkResult { - VK_SUCCESS = 0, - VK_NOT_READY = 1, - VK_TIMEOUT = 2, - VK_EVENT_SET = 3, - VK_EVENT_RESET = 4, - VK_INCOMPLETE = 5, - VK_ERROR_OUT_OF_HOST_MEMORY = -1, - VK_ERROR_OUT_OF_DEVICE_MEMORY = -2, - VK_ERROR_INITIALIZATION_FAILED = -3, - VK_ERROR_DEVICE_LOST = -4, - VK_ERROR_MEMORY_MAP_FAILED = -5, - VK_ERROR_LAYER_NOT_PRESENT = -6, - VK_ERROR_EXTENSION_NOT_PRESENT = -7, - VK_ERROR_FEATURE_NOT_PRESENT = -8, - VK_ERROR_INCOMPATIBLE_DRIVER = -9, - VK_ERROR_TOO_MANY_OBJECTS = -10, - VK_ERROR_FORMAT_NOT_SUPPORTED = -11, - VK_ERROR_FRAGMENTED_POOL = -12, - VK_ERROR_UNKNOWN = -13, - VK_ERROR_OUT_OF_POOL_MEMORY = -1000069000, - VK_ERROR_INVALID_EXTERNAL_HANDLE = -1000072003, - VK_ERROR_FRAGMENTATION = -1000161000, - VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS = -1000257000, - VK_PIPELINE_COMPILE_REQUIRED = 1000297000, - VK_ERROR_SURFACE_LOST_KHR = -1000000000, - VK_ERROR_NATIVE_WINDOW_IN_USE_KHR = -1000000001, - VK_SUBOPTIMAL_KHR = 1000001003, - VK_ERROR_OUT_OF_DATE_KHR = -1000001004, - VK_ERROR_INCOMPATIBLE_DISPLAY_KHR = -1000003001, - VK_ERROR_VALIDATION_FAILED_EXT = -1000011001, - VK_ERROR_INVALID_SHADER_NV = -1000012000, - VK_ERROR_IMAGE_USAGE_NOT_SUPPORTED_KHR = -1000023000, - VK_ERROR_VIDEO_PICTURE_LAYOUT_NOT_SUPPORTED_KHR = -1000023001, - VK_ERROR_VIDEO_PROFILE_OPERATION_NOT_SUPPORTED_KHR 
= -1000023002, - VK_ERROR_VIDEO_PROFILE_FORMAT_NOT_SUPPORTED_KHR = -1000023003, - VK_ERROR_VIDEO_PROFILE_CODEC_NOT_SUPPORTED_KHR = -1000023004, - VK_ERROR_VIDEO_STD_VERSION_NOT_SUPPORTED_KHR = -1000023005, - VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT = -1000158000, - VK_ERROR_NOT_PERMITTED_KHR = -1000174001, - VK_ERROR_FULL_SCREEN_EXCLUSIVE_MODE_LOST_EXT = -1000255000, - VK_THREAD_IDLE_KHR = 1000268000, - VK_THREAD_DONE_KHR = 1000268001, - VK_OPERATION_DEFERRED_KHR = 1000268002, - VK_OPERATION_NOT_DEFERRED_KHR = 1000268003, - VK_ERROR_INVALID_VIDEO_STD_PARAMETERS_KHR = -1000299000, - VK_ERROR_COMPRESSION_EXHAUSTED_EXT = -1000338000, - VK_INCOMPATIBLE_SHADER_BINARY_EXT = 1000482000, - VK_ERROR_OUT_OF_POOL_MEMORY_KHR = VK_ERROR_OUT_OF_POOL_MEMORY, - VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR = VK_ERROR_INVALID_EXTERNAL_HANDLE, - VK_ERROR_FRAGMENTATION_EXT = VK_ERROR_FRAGMENTATION, - VK_ERROR_NOT_PERMITTED_EXT = VK_ERROR_NOT_PERMITTED_KHR, - VK_ERROR_INVALID_DEVICE_ADDRESS_EXT = VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS, - VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS_KHR = VK_ERROR_INVALID_OPAQUE_CAPTURE_ADDRESS, - VK_PIPELINE_COMPILE_REQUIRED_EXT = VK_PIPELINE_COMPILE_REQUIRED, - VK_ERROR_PIPELINE_COMPILE_REQUIRED_EXT = VK_PIPELINE_COMPILE_REQUIRED, - VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT = VK_INCOMPATIBLE_SHADER_BINARY_EXT, - VK_RESULT_MAX_ENUM = 0x7FFFFFFF -} VkResult; +typedef enum virt_gpu_result_t { + APIR_SUCCESS = 0, + APIR_ERROR_INITIALIZATION_FAILED = -1, +} virt_gpu_result_t; struct remoting_dev_instance { @@ -153,13 +99,13 @@ virtgpu_ioctl(struct virtgpu *gpu, unsigned long request, void *args) } void create_virtgpu(); -static VkResult virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev); -static VkResult virtgpu_open(struct virtgpu *gpu); +static virt_gpu_result_t virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev); +static virt_gpu_result_t virtgpu_open(struct virtgpu *gpu); -static VkResult virtgpu_init_params(struct virtgpu *gpu); -static VkResult virtgpu_init_capset(struct virtgpu *gpu); -static VkResult virtgpu_init_context(struct virtgpu *gpu); +static virt_gpu_result_t virtgpu_init_params(struct virtgpu *gpu); +static virt_gpu_result_t virtgpu_init_capset(struct virtgpu *gpu); +static virt_gpu_result_t virtgpu_init_context(struct virtgpu *gpu); static int virtgpu_ioctl_context_init(struct virtgpu *gpu, enum virgl_renderer_capset capset_id); @@ -171,4 +117,4 @@ virtgpu_ioctl_get_caps(struct virtgpu *gpu, size_t capset_size); static uint64_t virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param); static void virtgpu_init_renderer_info(struct virtgpu *gpu); -static int remote_call(struct virtgpu *gpu, int32_t cmd_type, int32_t cmd_flags); +static int remote_call(struct virtgpu *gpu, int32_t cmd_type, int32_t cmd_flags, int32_t arg1, int32_t arg2, int32_t arg3); diff --git a/ggml/src/ggml-remotingfrontend/virtgpu_venus.c b/ggml/src/ggml-remotingfrontend/virtgpu_venus.c deleted file mode 100644 index fc401c13d3003..0000000000000 --- a/ggml/src/ggml-remotingfrontend/virtgpu_venus.c +++ /dev/null @@ -1,209 +0,0 @@ -static inline void vn_encode_vkEnumeratePhysicalDevices(struct vn_cs_encoder *enc, VkCommandFlagsEXT cmd_flags, VkInstance instance, uint32_t* pPhysicalDeviceCount, VkPhysicalDevice* pPhysicalDevices) -{ - const VkCommandTypeEXT cmd_type = VK_COMMAND_TYPE_vkEnumeratePhysicalDevices_EXT; - - vn_encode_VkCommandTypeEXT(enc, &cmd_type); - vn_encode_VkFlags(enc, &cmd_flags); - - vn_encode_VkInstance(enc, &instance); - if 
(vn_encode_simple_pointer(enc, pPhysicalDeviceCount)) - vn_encode_uint32_t(enc, pPhysicalDeviceCount); - if (pPhysicalDevices) { - vn_encode_array_size(enc, (pPhysicalDeviceCount ? *pPhysicalDeviceCount : 0)); - for (uint32_t i = 0; i < (pPhysicalDeviceCount ? *pPhysicalDeviceCount : 0); i++) - vn_encode_VkPhysicalDevice(enc, &pPhysicalDevices[i]); - } else { - vn_encode_array_size(enc, 0); - } -} - -static inline struct vn_cs_encoder * -vn_ring_submit_command_init(struct vn_ring *ring, - struct vn_ring_submit_command *submit, - void *cmd_data, - size_t cmd_size, - size_t reply_size) -{ - submit->buffer = VN_CS_ENCODER_BUFFER_INITIALIZER(cmd_data); - submit->command = VN_CS_ENCODER_INITIALIZER(&submit->buffer, cmd_size); - - submit->reply_size = reply_size; - submit->reply_shmem = NULL; - - submit->ring_seqno_valid = false; - - return &submit->command; -} - -static inline void vn_submit_vkEnumeratePhysicalDevices(struct vn_ring *vn_ring, VkCommandFlagsEXT cmd_flags, VkInstance instance, uint32_t* pPhysicalDeviceCount, VkPhysicalDevice* pPhysicalDevices, struct vn_ring_submit_command *submit) -{ - uint8_t local_cmd_data[VN_SUBMIT_LOCAL_CMD_SIZE]; - void *cmd_data = local_cmd_data; - size_t cmd_size = vn_sizeof_vkEnumeratePhysicalDevices(instance, pPhysicalDeviceCount, pPhysicalDevices); - if (cmd_size > sizeof(local_cmd_data)) { - cmd_data = malloc(cmd_size); - if (!cmd_data) - cmd_size = 0; - } - const size_t reply_size = cmd_flags & VK_COMMAND_GENERATE_REPLY_BIT_EXT ? vn_sizeof_vkEnumeratePhysicalDevices_reply(instance, pPhysicalDeviceCount, pPhysicalDevices) : 0; - - struct vn_cs_encoder *enc = vn_ring_submit_command_init(vn_ring, submit, cmd_data, cmd_size, reply_size); - if (cmd_size) { - vn_encode_vkEnumeratePhysicalDevices(enc, cmd_flags, instance, pPhysicalDeviceCount, pPhysicalDevices); - vn_ring_submit_command(vn_ring, submit); - if (cmd_data != local_cmd_data) - free(cmd_data); - } -} - -VkResult vn_call_vkEnumeratePhysicalDevices(struct vn_ring *vn_ring, VkInstance instance, uint32_t* pPhysicalDeviceCount, VkPhysicalDevice* pPhysicalDevices) -{ - VN_TRACE_FUNC(); - - struct vn_ring_submit_command submit; - vn_submit_vkEnumeratePhysicalDevices(vn_ring, VK_COMMAND_GENERATE_REPLY_BIT_EXT, instance, pPhysicalDeviceCount, pPhysicalDevices, &submit); - struct vn_cs_decoder *dec = vn_ring_get_command_reply(vn_ring, &submit); - if (dec) { - const VkResult ret = vn_decode_vkEnumeratePhysicalDevices_reply(dec, instance, pPhysicalDeviceCount, pPhysicalDevices); - vn_ring_free_command_reply(vn_ring, &submit); - return ret; - } else { - return VK_ERROR_OUT_OF_HOST_MEMORY; - } -} - -VkResult -vn_ring_submit_command_simple(struct vn_ring *ring, - const struct vn_cs_encoder *cs) -{ - mtx_lock(&ring->mutex); - VkResult result = vn_ring_submit_locked(ring, cs, NULL, NULL); - mtx_unlock(&ring->mutex); - - return result; -} - -static VkResult -vn_ring_submit_locked(struct vn_ring *ring, - const struct vn_cs_encoder *cs, - struct vn_renderer_shmem *extra_shmem, - uint32_t *ring_seqno) -{ - const bool direct = vn_ring_submission_can_direct(ring, cs); - if (!direct && cs->storage_type == VN_CS_ENCODER_STORAGE_POINTER) { - cs = vn_ring_cs_upload_locked(ring, cs); - if (!cs) - return VK_ERROR_OUT_OF_HOST_MEMORY; - assert(cs->storage_type != VN_CS_ENCODER_STORAGE_POINTER); - } - - struct vn_ring_submission submit; - VkResult result = - vn_ring_submission_prepare(ring, &submit, cs, extra_shmem, direct); - if (result != VK_SUCCESS) - return result; - - uint32_t seqno; - const bool notify = - 
vn_ring_submit_internal(ring, submit.submit, submit.cs, &seqno); - if (notify) { - uint32_t notify_ring_data[8]; - struct vn_cs_encoder local_enc = VN_CS_ENCODER_INITIALIZER_LOCAL( - notify_ring_data, sizeof(notify_ring_data)); - vn_encode_vkNotifyRingMESA(&local_enc, 0, ring->id, seqno, 0); - vn_renderer_submit_simple(ring->instance->renderer, notify_ring_data, - vn_cs_encoder_get_len(&local_enc)); - } - - vn_ring_submission_cleanup(&submit); - - if (ring_seqno) - *ring_seqno = seqno; - - return VK_SUCCESS; -} - -static VkResult -vn_ring_submission_prepare(struct vn_ring *ring, - struct vn_ring_submission *submit, - const struct vn_cs_encoder *cs, - struct vn_renderer_shmem *extra_shmem, - bool direct) -{ - submit->cs = vn_ring_submission_get_cs(submit, cs, direct); - if (!submit->cs) - return VK_ERROR_OUT_OF_HOST_MEMORY; - - submit->submit = - vn_ring_submission_get_ring_submit(ring, cs, extra_shmem, direct); - if (!submit->submit) { - vn_ring_submission_cleanup(submit); - return VK_ERROR_OUT_OF_HOST_MEMORY; - } - - return VK_SUCCESS; -} - -static bool -vn_ring_submit_internal(struct vn_ring *ring, - struct vn_ring_submit *submit, - const struct vn_cs_encoder *cs, - uint32_t *seqno) -{ - /* write cs to the ring */ - assert(!vn_cs_encoder_is_empty(cs)); - - /* avoid -Wmaybe-unitialized */ - uint32_t cur_seqno = 0; - - for (uint32_t i = 0; i < cs->buffer_count; i++) { - const struct vn_cs_encoder_buffer *buf = &cs->buffers[i]; - cur_seqno = vn_ring_wait_space(ring, buf->committed_size); - vn_ring_write_buffer(ring, buf->base, buf->committed_size); - } - - vn_ring_store_tail(ring); - const VkRingStatusFlagsMESA status = vn_ring_load_status(ring); - if (status & VK_RING_STATUS_FATAL_BIT_MESA) { - vn_log(NULL, "vn_ring_submit abort on fatal"); - abort(); - } - - vn_ring_retire_submits(ring, cur_seqno); - - submit->seqno = ring->cur; - list_addtail(&submit->head, &ring->submits); - - *seqno = submit->seqno; - - /* Notify renderer to wake up idle ring if at least VN_RING_IDLE_TIMEOUT_NS - * has passed since the last sent notification to avoid excessive wake up - * calls (non-trivial since submitted via virtio-gpu kernel). 
- */ - if (status & VK_RING_STATUS_IDLE_BIT_MESA) { - const int64_t now = os_time_get_nano(); - if (os_time_timeout(ring->last_notify, ring->next_notify, now)) { - ring->last_notify = now; - ring->next_notify = now + VN_RING_IDLE_TIMEOUT_NS; - return true; - } - } - return false; -} - -static void -vn_ring_write_buffer(struct vn_ring *ring, const void *data, uint32_t size) -{ - assert(ring->cur + size - vn_ring_load_head(ring) <= ring->buffer_size); - - const uint32_t offset = ring->cur & ring->buffer_mask; - if (offset + size <= ring->buffer_size) { - memcpy(ring->shared.buffer + offset, data, size); - } else { - const uint32_t s = ring->buffer_size - offset; - memcpy(ring->shared.buffer + offset, data, s); - memcpy(ring->shared.buffer, data + s, size - s); - } - - ring->cur += size; -} From f15fedf17ca16e4863508298794e1ce87e83a123 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 6 May 2025 17:56:17 +0200 Subject: [PATCH 028/117] podman_compile: delete the pod before compiling --- podman_compile.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/podman_compile.sh b/podman_compile.sh index 47e4baee07037..4793b4ce20fa2 100755 --- a/podman_compile.sh +++ b/podman_compile.sh @@ -19,9 +19,12 @@ fi cmd="bash ./build.$what.sh" +POD_NAME=mac_ai_compiling +podman machine ssh podman rm $POD_NAME --force + set -x podman run \ ---name mac_ai_compiling \ +--name $POD_NAME \ --user root:root \ --cgroupns host \ --security-opt label=disable \ From 0c264b180241db33a3555babdde02d8cd6f73e3e Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 7 May 2025 10:27:03 +0200 Subject: [PATCH 029/117] virtgpu-utils: add WARNING --- ggml/src/ggml-remotingfrontend/virtgpu-utils.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h index 9d1589c9128ab..7da90be25c380 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h @@ -33,6 +33,17 @@ INFO(const char *format, ...) { va_end(argptr); } +inline void +WARNING(const char *format, ...) { + fprintf(stderr, "WARNING: "); + + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); +} + inline void FATAL(const char *format, ...) { fprintf(stderr, "FATAL: "); @@ -42,7 +53,7 @@ FATAL(const char *format, ...) 
{ vfprintf(stderr, format, argptr); fprintf(stderr, "\n"); va_end(argptr); - exit(1); + assert(false); } static inline bool From 938ba6b7a51ed844b158b48b8a2a13601ae02a8f Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 7 May 2025 10:27:37 +0200 Subject: [PATCH 030/117] virtgpu: split the remote call into prepare/call/finish --- ggml/src/ggml-remotingfrontend/virtgpu.cpp | 149 +++++++++++++-------- ggml/src/ggml-remotingfrontend/virtgpu.h | 8 +- 2 files changed, 102 insertions(+), 55 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp index a88d07c8198fd..db484f04e9d6a 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -67,19 +67,59 @@ create_virtgpu() { if (!gpu->reply_shmem) { FATAL("%s: failed to create the reply shared memory page :/", __func__); - assert(false); } - uint32_t ret = remote_call(gpu, VIRGL_VK_COMMAND_TYPE_LoadLibrary, 0, 0, 0, 0); + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + int32_t ret; + + encoder = remote_call_prepare(gpu, VIRGL_VK_COMMAND_TYPE_LoadLibrary, 0); + if (!encoder) { + FATAL("%s: failed to prepare the remote call encoder :/", __func__); + } + decoder = remote_call(gpu, encoder); + if (!decoder) { + FATAL("%s: failed to kick the remote call :/", __func__); + } + + ret = remote_call_finish(encoder, decoder); if (ret != 0) { FATAL("%s: failed to load the APIR backend libraries (code=%d):/", __func__, ret); - assert(false); } - ret = remote_call(gpu, VIRGL_VK_COMMAND_TYPE_Forward, 0, 111, 555, 999); + + int32_t forward_flag = 0; + encoder = remote_call_prepare(gpu, VIRGL_VK_COMMAND_TYPE_Forward, forward_flag); + if (!encoder) { + FATAL("%s: failed to prepare the remote call encoder :/", __func__); + } + + int32_t arg1 = 11; + int32_t arg2 = 22; + int32_t arg3 = 33; + + vn_encode_int32_t(encoder, &arg1); + vn_encode_int32_t(encoder, &arg2); + vn_encode_int32_t(encoder, &arg3); + decoder = remote_call(gpu, encoder); + if (!decoder) { + FATAL("%s: failed to kick the remote call :/", __func__); + } + + int32_t resp1; + int32_t resp2; + int32_t resp3; + int32_t resp4; + vn_decode_int32_t(decoder, &resp1); + vn_decode_int32_t(decoder, &resp2); + vn_decode_int32_t(decoder, &resp3); + vn_decode_int32_t(decoder, &resp4); + INFO("%s: Forward RESP %d %d %d %d", __func__, resp1, resp2, resp3, resp4); + + ret = remote_call_finish(encoder, decoder); if (ret != 0) { - FATAL("%s: failed to forard the API call (code=%d):/", __func__, ret); - assert(false); + FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); } + thks_bye(); } @@ -355,50 +395,71 @@ virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param) } -static int remote_call( +static struct vn_cs_encoder *remote_call_prepare( struct virtgpu *gpu, int32_t cmd_type, - int32_t cmd_flags, - int32_t arg1, int32_t arg2, int32_t arg3 - ) + int32_t cmd_flags) { + if (!gpu->reply_shmem) { + FATAL("%s: the reply shmem page can't be null", __func__); + } + /* - * Prepare the command encoder buffer + * Prepare the command encoder and its buffer */ - char encoder_buffer[4096]; + static char encoder_buffer[4096]; - struct vn_cs_encoder _encoder = { + static struct vn_cs_encoder enc; + enc = { + encoder_buffer, encoder_buffer, encoder_buffer + sizeof(encoder_buffer), }; - struct vn_cs_encoder *encoder = &_encoder; /* - * Fill the command encoder buffer + * Fill the command encoder with the common args: + * - cmd_type (int32_t) + * - cmd_flags (int32_t) + * - reply res 
id (uint32_t) */ - vn_encode_int32_t(encoder, &cmd_type); - vn_encode_int32_t(encoder, &cmd_flags); - - if (!gpu->reply_shmem) { - FATAL("%s: the reply shmem page can't be null", __func__); - } + vn_encode_int32_t(&enc, &cmd_type); + vn_encode_int32_t(&enc, &cmd_flags); uint32_t reply_res_id = gpu->reply_shmem->res_id; - vn_encode_uint32_t(encoder, &reply_res_id); + vn_encode_uint32_t(&enc, &reply_res_id); - printf("%s: call %s(flags=0x%x, reply_buf=%d)\n", __func__, + printf("%s: prepare %s(flags=0x%x, reply_buf=%d)\n", __func__, api_remoting_command_name(cmd_type), cmd_flags, reply_res_id); - vn_encode_int32_t(encoder, &arg1); - vn_encode_int32_t(encoder, &arg2); - vn_encode_int32_t(encoder, &arg3); + return &enc; +} + +static int32_t remote_call_finish(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + if (!enc) { + WARNING("Invalid (null) encoder :/"); + } + if (!dec) { + FATAL("Invalid (null) decoder :/"); + } + int32_t remote_call_ret; + vn_decode_int32_t(dec, &remote_call_ret); + + // encoder and decoder are statically allocated, nothing to do to release them + + return remote_call_ret; +} +static struct vn_cs_decoder *remote_call( + struct virtgpu *gpu, + struct vn_cs_encoder *encoder + ) +{ /* - * Reply notification pointer + * Prepare the reply notification pointer */ volatile std::atomic_uint *atomic_reply_notif = (volatile std::atomic_uint *) gpu->reply_shmem->mmap_ptr; @@ -410,8 +471,8 @@ static int remote_call( struct drm_virtgpu_execbuffer args = { .flags = VIRTGPU_EXECBUF_RING_IDX, - .size = sizeof(encoder_buffer), - .command = (uintptr_t) encoder_buffer, + .size = (uint32_t) (encoder->cur - encoder->start), + .command = (uintptr_t) encoder->start, .bo_handles = 0, .num_bo_handles = 0, @@ -441,31 +502,11 @@ static int remote_call( } /* - * Read the reply + * Prepare the decoder */ + static struct vn_cs_decoder dec; + dec.cur = (char *) gpu->reply_shmem->mmap_ptr + sizeof(*atomic_reply_notif); + dec.end = (char *) gpu->reply_shmem->mmap_ptr + gpu->reply_shmem->mmap_size; - struct vn_cs_decoder _dec = { - .cur = (char *) gpu->reply_shmem->mmap_ptr + sizeof(*atomic_reply_notif), - .end = (char *) gpu->reply_shmem->mmap_ptr + gpu->reply_shmem->mmap_size, - }; - struct vn_cs_decoder *dec = &_dec; - - int32_t resp1; - int32_t resp2; - int32_t resp3; - int32_t resp4; - vn_decode_int32_t(dec, &resp1); - vn_decode_int32_t(dec, &resp2); - vn_decode_int32_t(dec, &resp3); - vn_decode_int32_t(dec, &resp4); - - int32_t rmt_call_ret; - vn_decode_int32_t(dec, &rmt_call_ret); - - printf("%s: RESP %d %d %d %d\n", __func__, resp1, resp2, resp3, resp4); - - printf("%s: call %s() --> %d\n", __func__, - api_remoting_command_name(cmd_type), rmt_call_ret); - - return rmt_call_ret; + return &dec; } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.h b/ggml/src/ggml-remotingfrontend/virtgpu.h index 379a2174fc3db..faef2a02bc7d8 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu.h @@ -117,4 +117,10 @@ virtgpu_ioctl_get_caps(struct virtgpu *gpu, size_t capset_size); static uint64_t virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param); static void virtgpu_init_renderer_info(struct virtgpu *gpu); -static int remote_call(struct virtgpu *gpu, int32_t cmd_type, int32_t cmd_flags, int32_t arg1, int32_t arg2, int32_t arg3); + +static struct vn_cs_encoder *remote_call_prepare( + struct virtgpu *gpu, + int32_t cmd_type, + int32_t cmd_flags); +static struct vn_cs_decoder *remote_call(struct virtgpu *gpu, struct vn_cs_encoder *enc); +static int32_t 
remote_call_finish(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); From 0582bab359865d99119c3fee0237cac820dc7d60 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 7 May 2025 10:34:24 +0200 Subject: [PATCH 031/117] ggml-backend-reg: reindent --- .../ggml-backend-reg.cpp | 79 ++++++++++--------- .../src/ggml-remotingfrontend/ggml-remoting.h | 35 ++++---- 2 files changed, 58 insertions(+), 56 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp index cb77a31a037c8..93f35f7e2e26e 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp @@ -4,66 +4,67 @@ #include "ggml-remoting.h" static int ggml_backend_remoting_get_device_count() { - return 1; + + return 1; } static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) { - UNUSED(reg); - return ggml_backend_remoting_get_device_count(); + UNUSED(reg); + return ggml_backend_remoting_get_device_count(); } static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) { - static std::vector devices; + static std::vector devices; - static bool initialized = false; + static bool initialized = false; - { - static std::mutex mutex; - std::lock_guard lock(mutex); - if (!initialized) { + { + static std::mutex mutex; + std::lock_guard lock(mutex); + if (!initialized) { - create_virtgpu(); + create_virtgpu(); - for (size_t i = 0; i < ggml_backend_remoting_reg_get_device_count(reg); i++) { - ggml_backend_remoting_device_context * ctx = new ggml_backend_remoting_device_context; - char desc[256] = "API Remoting device"; + for (size_t i = 0; i < ggml_backend_remoting_reg_get_device_count(reg); i++) { + ggml_backend_remoting_device_context * ctx = new ggml_backend_remoting_device_context; + char desc[256] = "API Remoting device"; - ctx->device = i; - ctx->name = GGML_REMOTING_FRONTEND_NAME + std::to_string(i); - ctx->description = desc; - devices.push_back(new ggml_backend_device { - /* .iface = */ ggml_backend_remoting_device_i, - /* .reg = */ reg, - /* .context = */ ctx, - }); - } - initialized = true; - } + ctx->device = i; + ctx->name = GGML_REMOTING_FRONTEND_NAME + std::to_string(i); + ctx->description = desc; + devices.push_back(new ggml_backend_device { + /* .iface = */ ggml_backend_remoting_device_i, + /* .reg = */ reg, + /* .context = */ ctx, + }); + } + initialized = true; } + } - GGML_ASSERT(device < devices.size()); - return devices[device]; + GGML_ASSERT(device < devices.size()); + return devices[device]; } static const char * ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) { - UNUSED(reg); - return GGML_REMOTING_FRONTEND_NAME; + UNUSED(reg); + return GGML_REMOTING_FRONTEND_NAME; } static const struct ggml_backend_reg_i ggml_backend_remoting_reg_i = { - /* .get_name = */ ggml_backend_remoting_reg_get_name, - /* .get_device_count = */ ggml_backend_remoting_reg_get_device_count, - /* .get_device = */ ggml_backend_remoting_reg_get_device, - /* .get_proc_address = */ NULL, + /* .get_name = */ ggml_backend_remoting_reg_get_name, + /* .get_device_count = */ ggml_backend_remoting_reg_get_device_count, + /* .get_device = */ ggml_backend_remoting_reg_get_device, + /* .get_proc_address = */ NULL, }; ggml_backend_reg_t ggml_backend_remoting_frontend_reg() { - static ggml_backend_reg reg = { - /* .api_version = */ GGML_BACKEND_API_VERSION, - /* .iface = */ ggml_backend_remoting_reg_i, - /* .context = */ nullptr, - }; + static 
ggml_backend_reg reg = { + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_remoting_reg_i, + /* .context = */ nullptr, + }; - RMT_LOG_DEBUG("ggml_backend_remoting_frontend_reg() hello :wave:"); - return ® + RMT_LOG_DEBUG("ggml_backend_remoting_frontend_reg() hello :wave:"); + return ® } diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index c6acdf6cfe1c8..7dcc6641f7574 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -15,9 +15,10 @@ #define RMT_LOG_DEBUG(msg) std::cerr << msg << std::endl struct ggml_backend_remoting_device_context { - size_t device; - std::string name; - std::string description; + size_t device; + std::string name; + std::string description; + }; extern const struct ggml_backend_device_i ggml_backend_remoting_device_i; @@ -38,24 +39,24 @@ typedef std::shared_ptr remoting_device; typedef std::weak_ptr remoting_device_ref; struct ggml_backend_remoting_buffer_context { - remoting_device_ref device; - remoting_buffer dev_buffer; - std::string name; - - ggml_backend_remoting_buffer_context(remoting_device_ref device, remoting_buffer&& dev_buffer, std::string& name) : - name(name) { - UNUSED(device); - UNUSED(dev_buffer); - } - - ~ggml_backend_remoting_buffer_context() { - ggml_remoting_destroy_buffer(dev_buffer); - } + remoting_device_ref device; + remoting_buffer dev_buffer; + std::string name; + + ggml_backend_remoting_buffer_context(remoting_device_ref device, remoting_buffer&& dev_buffer, std::string& name) : + name(name) { + UNUSED(device); + UNUSED(dev_buffer); + } + + ~ggml_backend_remoting_buffer_context() { + ggml_remoting_destroy_buffer(dev_buffer); + } }; struct remoting_context_struct { - int i; + int i; }; typedef std::shared_ptr remoting_context; typedef std::weak_ptr remoting_context_ref; From be5f5e0e1042087d0a67a9c998bf9fe250f308d9 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 7 May 2025 10:34:44 +0200 Subject: [PATCH 032/117] move thks_bye() to virtgpu-utils --- ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp | 11 +++++++++++ ggml/src/ggml-remotingfrontend/virtgpu.cpp | 9 --------- ggml/src/ggml-remotingfrontend/virtgpu.h | 2 -- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp index 100f495add1bc..f1af0d3391550 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp @@ -8,6 +8,8 @@ #define NODE_LEVEL_MASK ((uintptr_t)NODE_ALLOC_ALIGN - 1) #define NULL_NODE 0 +void thks_bye(); + #define os_malloc_aligned(_size, _align) _aligned_malloc(_size, _align) #define os_free_aligned(_ptr) free(_ptr) #define p_atomic_cmpxchg(v, old, _new) \ @@ -184,3 +186,12 @@ util_sparse_array_get(struct util_sparse_array *arr, uint64_t idx) uint64_t elem_idx = idx & ((1ull << node_size_log2) - 1); return (void *)((char *)node_data + (elem_idx * arr->elem_size)); } + +void *something = NULL; +void thks_bye () { + // break here + INFO("thks bye, stopping early and happilly :)"); + if (!something) { // avoid the [[noreturn]] detection mechanism + exit(0); + } +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp index db484f04e9d6a..fbffbb361f016 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -33,15 +33,6 @@ virtgpu_init_shmem_blob_mem(struct 
virtgpu *gpu) gpu->shmem_blob_mem = VIRTGPU_BLOB_MEM_HOST3D; } -void *something = NULL; -void thks_bye () { - // break here - INFO("thks bye, stopping early and happilly :)"); - if (!something) { // avoid the [[noreturn]] detection mechanism - exit(0); - } -} - void create_virtgpu() { struct virtgpu *gpu = new struct virtgpu(); diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.h b/ggml/src/ggml-remotingfrontend/virtgpu.h index faef2a02bc7d8..f252e98ffd3af 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu.h @@ -13,8 +13,6 @@ #include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/api_remoting.h" #include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/venus_cs.h" -void thks_bye(); - #include "virtgpu-shm.h" #define VIRGL_RENDERER_UNSTABLE_APIS 1 From 60bac85a3eb36b28421be80fd31581a9456e8341 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 7 May 2025 10:35:06 +0200 Subject: [PATCH 033/117] virtgpu: remove forward call wip code --- ggml/src/ggml-remotingfrontend/virtgpu.cpp | 36 +--------------------- 1 file changed, 1 insertion(+), 35 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp index fbffbb361f016..f7de8a5b66b4e 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -77,43 +77,9 @@ create_virtgpu() { if (ret != 0) { FATAL("%s: failed to load the APIR backend libraries (code=%d):/", __func__, ret); } - - int32_t forward_flag = 0; - encoder = remote_call_prepare(gpu, VIRGL_VK_COMMAND_TYPE_Forward, forward_flag); - if (!encoder) { - FATAL("%s: failed to prepare the remote call encoder :/", __func__); - } - - int32_t arg1 = 11; - int32_t arg2 = 22; - int32_t arg3 = 33; - - vn_encode_int32_t(encoder, &arg1); - vn_encode_int32_t(encoder, &arg2); - vn_encode_int32_t(encoder, &arg3); - decoder = remote_call(gpu, encoder); - if (!decoder) { - FATAL("%s: failed to kick the remote call :/", __func__); - } - - int32_t resp1; - int32_t resp2; - int32_t resp3; - int32_t resp4; - vn_decode_int32_t(decoder, &resp1); - vn_decode_int32_t(decoder, &resp2); - vn_decode_int32_t(decoder, &resp3); - vn_decode_int32_t(decoder, &resp4); - INFO("%s: Forward RESP %d %d %d %d", __func__, resp1, resp2, resp3, resp4); - - ret = remote_call_finish(encoder, decoder); - if (ret != 0) { - FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); - } - - thks_bye(); } + static virt_gpu_result_t virtgpu_open(struct virtgpu *gpu) { From abd176f1acd772d056e9ba9c4640628e772656ea Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 7 May 2025 14:18:42 +0200 Subject: [PATCH 034/117] ggml-remotingfrontend: build the apir framework --- ggml/src/ggml-remotingfrontend/CMakeLists.txt | 1 + .../ggml-backend-device.cpp | 110 +++++++++++------- .../ggml-backend-reg.cpp | 40 ++++++- .../src/ggml-remotingfrontend/ggml-remoting.h | 7 ++ .../ggml-remotingfrontend/virtgpu-forward.cpp | 35 ++++++ .../ggml-remotingfrontend/virtgpu-forward.h | 2 + .../ggml-remotingfrontend/virtgpu-utils.cpp | 2 - .../src/ggml-remotingfrontend/virtgpu-utils.h | 2 + ggml/src/ggml-remotingfrontend/virtgpu.cpp | 34 +++++- ggml/src/ggml-remotingfrontend/virtgpu.h | 27 +---- 10 files changed, 185 insertions(+), 75 deletions(-) create mode 100644 ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp create mode 100644 ggml/src/ggml-remotingfrontend/virtgpu-forward.h diff --git 
a/ggml/src/ggml-remotingfrontend/CMakeLists.txt b/ggml/src/ggml-remotingfrontend/CMakeLists.txt index 678623f972fc1..df45db51f46b3 100644 --- a/ggml/src/ggml-remotingfrontend/CMakeLists.txt +++ b/ggml/src/ggml-remotingfrontend/CMakeLists.txt @@ -13,6 +13,7 @@ ggml_add_backend_library(ggml-remotingfrontend virtgpu.cpp virtgpu-shm.cpp virtgpu-utils.cpp + virtgpu-forward.cpp ../../include/ggml-remoting-frontend.h ) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index b18ce03a37121..70bb6756b315d 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -1,81 +1,105 @@ #include "ggml-remoting.h" static const char * ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) { - UNUSED(dev); - return "API Remoting"; + UNUSED(dev); + + NOT_IMPLEMENTED; + + return "API Remoting"; } static const char * ggml_backend_remoting_device_get_description(ggml_backend_dev_t dev) { - UNUSED(dev); - return "API Remoting device"; + UNUSED(dev); + + NOT_IMPLEMENTED; + + return "API Remoting device"; } static enum ggml_backend_dev_type ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) { - UNUSED(dev); - return GGML_BACKEND_DEVICE_TYPE_GPU; + UNUSED(dev); + + NOT_IMPLEMENTED; + + return GGML_BACKEND_DEVICE_TYPE_GPU; } static void ggml_backend_remoting_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) { - UNUSED(device); - *total = 1024*1024*1024; - *free = *total; + UNUSED(device); + + NOT_IMPLEMENTED; + + *total = 1024*1024*1024; + *free = *total; } static bool ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { UNUSED(dev); UNUSED(op); + //NOT_IMPLEMENTED; // to chatty + return true; } static bool ggml_backend_remoting_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - UNUSED(dev); - UNUSED(buft); - return true; + UNUSED(dev); + UNUSED(buft); + + NOT_IMPLEMENTED; + + return true; } static bool ggml_backend_remoting_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { - const int min_batch_size = 32; + const int min_batch_size = 32; + + NOT_IMPLEMENTED; - return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) || - (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID); + return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) || + (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID); - UNUSED(dev); + UNUSED(dev); } static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_host_buffer_type(ggml_backend_dev_t dev) { - UNUSED(dev); - return ggml_backend_remoting_host_buffer_type(); + UNUSED(dev); + + // NOT_IMPLEMENTED; // too chatty + + return ggml_backend_remoting_host_buffer_type(); } static void ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { - props->name = ggml_backend_remoting_device_get_name(dev); - props->description = ggml_backend_remoting_device_get_description(dev); - props->type = ggml_backend_remoting_device_get_type(dev); - ggml_backend_remoting_device_get_memory(dev, &props->memory_free, &props->memory_total); - props->caps = { - /* .async = */ false, - /* .host_buffer = */ true, - /* .buffer_from_host_ptr = */ false, - /* .events = */ false, - }; + + IMPLEMENTED; + props->name = ggml_backend_remoting_device_get_name(dev); + props->description = ggml_backend_remoting_device_get_description(dev); + props->type = 
ggml_backend_remoting_device_get_type(dev); + ggml_backend_remoting_device_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* .async = */ false, + /* .host_buffer = */ true, + /* .buffer_from_host_ptr = */ false, + /* .events = */ false, + }; } const struct ggml_backend_device_i ggml_backend_remoting_device_i = { - /* .get_name = */ ggml_backend_remoting_device_get_name, - /* .get_description = */ ggml_backend_remoting_device_get_description, - /* .get_memory = */ ggml_backend_remoting_device_get_memory, - /* .get_type = */ ggml_backend_remoting_device_get_type, - /* .get_props = */ ggml_backend_remoting_device_get_props, - /* .init_backend = */ ggml_backend_remoting_device_init, - /* .get_buffer_type = */ ggml_backend_remoting_device_get_buffer_type, - /* .get_host_buffer_type = */ ggml_backend_remoting_device_get_host_buffer_type, - /* .buffer_from_host_ptr = */ NULL, - /* .supports_op = */ ggml_backend_remoting_device_supports_op, - /* .supports_buft = */ ggml_backend_remoting_device_supports_buft, - /* .offload_op = */ ggml_backend_remoting_device_offload_op, - /* .event_new = */ NULL, - /* .event_free = */ NULL, - /* .event_synchronize = */ NULL, + /* .get_name = */ ggml_backend_remoting_device_get_name, + /* .get_description = */ ggml_backend_remoting_device_get_description, + /* .get_memory = */ ggml_backend_remoting_device_get_memory, + /* .get_type = */ ggml_backend_remoting_device_get_type, + /* .get_props = */ ggml_backend_remoting_device_get_props, + /* .init_backend = */ ggml_backend_remoting_device_init, + /* .get_buffer_type = */ ggml_backend_remoting_device_get_buffer_type, + /* .get_host_buffer_type = */ ggml_backend_remoting_device_get_host_buffer_type, + /* .buffer_from_host_ptr = */ NULL, + /* .supports_op = */ ggml_backend_remoting_device_supports_op, + /* .supports_buft = */ ggml_backend_remoting_device_supports_buft, + /* .offload_op = */ ggml_backend_remoting_device_offload_op, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, }; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp index 93f35f7e2e26e..a0d1480508543 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp @@ -3,19 +3,53 @@ #include "ggml-remoting.h" -static int ggml_backend_remoting_get_device_count() { +static struct virtgpu *apir_gpu_instance = NULL; + +static int apir_initialize() { + static bool apir_initialized = false; + + if (apir_initialized) { + if (!apir_gpu_instance) { + return 0; + } + return 1; + } + apir_initialized = true; + + apir_gpu_instance = create_virtgpu(); + if (!apir_gpu_instance) { + FATAL("failed to initialize the virtgpu :/"); + return 0; + } + + apir_initialized = true; return 1; } +static int ggml_backend_remoting_get_device_count() { + if (!apir_initialize()) { + WARNING("apir_initialize failed :/"); + return 0; + } + IMPLEMENTED; + + return apir_get_device_count(apir_gpu_instance); +} + static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) { UNUSED(reg); + + IMPLEMENTED; + return ggml_backend_remoting_get_device_count(); } static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) { static std::vector devices; + IMPLEMENTED; + static bool initialized = false; { @@ -23,8 +57,6 @@ static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_ std::lock_guard lock(mutex); if 
(!initialized) { - create_virtgpu(); - for (size_t i = 0; i < ggml_backend_remoting_reg_get_device_count(reg); i++) { ggml_backend_remoting_device_context * ctx = new ggml_backend_remoting_device_context; char desc[256] = "API Remoting device"; @@ -48,6 +80,8 @@ static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_ static const char * ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) { UNUSED(reg); + printf("reached %s\n", __func__); + //thks_bye(); return GGML_REMOTING_FRONTEND_NAME; } diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index 7dcc6641f7574..5a20e371f6cea 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -12,6 +12,12 @@ #define UNUSED GGML_UNUSED +#define NOT_IMPLEMENTED \ + printf("WARN: ### reached unimplemented function %s\n", __func__) + +#define IMPLEMENTED \ + printf("INFO: ### reached implemented function %s\n", __func__) + #define RMT_LOG_DEBUG(msg) std::cerr << msg << std::endl struct ggml_backend_remoting_device_context { @@ -19,6 +25,7 @@ struct ggml_backend_remoting_device_context { std::string name; std::string description; + struct virtgpu *gpu; }; extern const struct ggml_backend_device_i ggml_backend_remoting_device_i; diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp new file mode 100644 index 0000000000000..a445c64929991 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp @@ -0,0 +1,35 @@ +#include "virtgpu.h" +#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/apir_backend.h" + +#define CACHED \ + printf("INFO: ### found response in the cache %s\n", __func__) + +int +apir_get_device_count(struct virtgpu *gpu) { + static int32_t dev_count = -1; + if (dev_count != -1) { + CACHED; + return dev_count; + } + int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_GET_DEVICE_COUNT; + struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); + if (!encoder) { + FATAL("%s: failed to prepare the remote call encoder :/", __func__); + } + + struct vn_cs_decoder *decoder = remote_call(gpu, encoder); + if (!decoder) { + FATAL("%s: failed to kick the remote call :/", __func__); + } + + vn_decode_int32_t(decoder, &dev_count); + + INFO("%s: Forward DEV COUNT --> %d ", __func__, dev_count); + + int32_t ret = remote_call_finish(encoder, decoder); + if (ret != 0) { + FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); + } + + return dev_count; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h new file mode 100644 index 0000000000000..28d23ededb188 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -0,0 +1,2 @@ +int +apir_get_device_count(struct virtgpu *gpu); diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp index f1af0d3391550..cedd31ddaaf9c 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp @@ -8,8 +8,6 @@ #define NODE_LEVEL_MASK ((uintptr_t)NODE_ALLOC_ALIGN - 1) #define NULL_NODE 0 -void thks_bye(); - #define os_malloc_aligned(_size, _align) _aligned_malloc(_size, _align) #define os_free_aligned(_ptr) free(_ptr) #define p_atomic_cmpxchg(v, old, _new) \ diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h 
b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h index 7da90be25c380..b02c3d106f7fe 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h @@ -24,6 +24,8 @@ #define p_atomic_read(_v) __atomic_load_n((_v), __ATOMIC_ACQUIRE) +void thks_bye(); + inline void INFO(const char *format, ...) { va_list argptr; diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp index f7de8a5b66b4e..679d8fcae6fe6 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -7,6 +7,25 @@ #include "virtgpu.h" +static virt_gpu_result_t virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev); +static virt_gpu_result_t virtgpu_open(struct virtgpu *gpu); + + +static virt_gpu_result_t virtgpu_init_params(struct virtgpu *gpu); +static virt_gpu_result_t virtgpu_init_capset(struct virtgpu *gpu); +static virt_gpu_result_t virtgpu_init_context(struct virtgpu *gpu); + +static int virtgpu_ioctl_context_init(struct virtgpu *gpu, + enum virgl_renderer_capset capset_id); +static int +virtgpu_ioctl_get_caps(struct virtgpu *gpu, + enum virgl_renderer_capset id, + uint32_t version, + void *capset, + size_t capset_size); +static uint64_t virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param); +static void virtgpu_init_renderer_info(struct virtgpu *gpu); + static inline void virtgpu_init_shmem_blob_mem(struct virtgpu *gpu) { @@ -33,7 +52,7 @@ virtgpu_init_shmem_blob_mem(struct virtgpu *gpu) gpu->shmem_blob_mem = VIRTGPU_BLOB_MEM_HOST3D; } -void +struct virtgpu * create_virtgpu() { struct virtgpu *gpu = new struct virtgpu(); @@ -64,7 +83,7 @@ create_virtgpu() { struct vn_cs_decoder *decoder; int32_t ret; - encoder = remote_call_prepare(gpu, VIRGL_VK_COMMAND_TYPE_LoadLibrary, 0); + encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_LoadLibrary, 0); if (!encoder) { FATAL("%s: failed to prepare the remote call encoder :/", __func__); } @@ -77,6 +96,8 @@ create_virtgpu() { if (ret != 0) { FATAL("%s: failed to load the APIR backend libraries (code=%d):/", __func__, ret); } + + return gpu; } @@ -352,7 +373,8 @@ virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param) } -static struct vn_cs_encoder *remote_call_prepare( +struct vn_cs_encoder * +remote_call_prepare( struct virtgpu *gpu, int32_t cmd_type, int32_t cmd_flags) @@ -395,7 +417,8 @@ static struct vn_cs_encoder *remote_call_prepare( return &enc; } -static int32_t remote_call_finish(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { +int32_t +remote_call_finish(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { if (!enc) { WARNING("Invalid (null) encoder :/"); } @@ -410,7 +433,8 @@ static int32_t remote_call_finish(struct vn_cs_encoder *enc, struct vn_cs_decode return remote_call_ret; } -static struct vn_cs_decoder *remote_call( +struct vn_cs_decoder * +remote_call( struct virtgpu *gpu, struct vn_cs_encoder *encoder ) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.h b/ggml/src/ggml-remotingfrontend/virtgpu.h index f252e98ffd3af..5ab934ec7fb78 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu.h @@ -9,6 +9,7 @@ #include #include +#include "virtgpu-forward.h" #include "virtgpu-utils.h" #include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/api_remoting.h" #include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/venus_cs.h" @@ -96,29 +97,11 @@ virtgpu_ioctl(struct virtgpu *gpu, unsigned 
long request, void *args) return drmIoctl(gpu->fd, request, args); } -void create_virtgpu(); -static virt_gpu_result_t virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev); -static virt_gpu_result_t virtgpu_open(struct virtgpu *gpu); +struct virtgpu *create_virtgpu(); - -static virt_gpu_result_t virtgpu_init_params(struct virtgpu *gpu); -static virt_gpu_result_t virtgpu_init_capset(struct virtgpu *gpu); -static virt_gpu_result_t virtgpu_init_context(struct virtgpu *gpu); - -static int virtgpu_ioctl_context_init(struct virtgpu *gpu, - enum virgl_renderer_capset capset_id); -static int -virtgpu_ioctl_get_caps(struct virtgpu *gpu, - enum virgl_renderer_capset id, - uint32_t version, - void *capset, - size_t capset_size); -static uint64_t virtgpu_ioctl_getparam(struct virtgpu *gpu, uint64_t param); -static void virtgpu_init_renderer_info(struct virtgpu *gpu); - -static struct vn_cs_encoder *remote_call_prepare( +struct vn_cs_encoder *remote_call_prepare( struct virtgpu *gpu, int32_t cmd_type, int32_t cmd_flags); -static struct vn_cs_decoder *remote_call(struct virtgpu *gpu, struct vn_cs_encoder *enc); -static int32_t remote_call_finish(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +struct vn_cs_decoder *remote_call(struct virtgpu *gpu, struct vn_cs_encoder *enc); +int32_t remote_call_finish(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); From 747728bea0bbb73ccdebecef74a208bc0980e1f9 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 7 May 2025 14:18:42 +0200 Subject: [PATCH 035/117] ggml-remotingbackend: build the apir framework --- ggml/src/ggml-remotingbackend/CMakeLists.txt | 2 + .../backend-dispatched.cpp | 68 ++++++++++++++ .../ggml-remotingbackend/backend-dispatched.h | 30 +++++++ .../ggml-remotingbackend/backend-internal.h | 36 +------- .../ggml-remotingbackend/backend-utils.cpp | 0 ggml/src/ggml-remotingbackend/backend-utils.h | 53 +++++++++++ ggml/src/ggml-remotingbackend/backend.cpp | 89 +++++-------------- .../shared/api_remoting.h | 8 +- .../shared/apir_backend.h | 8 ++ .../ggml-remotingbackend/shared/venus_cs.h | 5 ++ 10 files changed, 192 insertions(+), 107 deletions(-) create mode 100644 ggml/src/ggml-remotingbackend/backend-dispatched.cpp create mode 100644 ggml/src/ggml-remotingbackend/backend-dispatched.h create mode 100644 ggml/src/ggml-remotingbackend/backend-utils.cpp create mode 100644 ggml/src/ggml-remotingbackend/backend-utils.h diff --git a/ggml/src/ggml-remotingbackend/CMakeLists.txt b/ggml/src/ggml-remotingbackend/CMakeLists.txt index 420e283fc8359..7435c7726beee 100644 --- a/ggml/src/ggml-remotingbackend/CMakeLists.txt +++ b/ggml/src/ggml-remotingbackend/CMakeLists.txt @@ -5,6 +5,8 @@ message(STATUS "Enable API Remoting backend") ggml_add_backend_library(ggml-remotingbackend backend.cpp + backend-dispatched.cpp + backend-utils.cpp shared/api_remoting.h shared/apir_backend.h shared/venus_cs.h diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp new file mode 100644 index 0000000000000..d6ff3421a5f6c --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp @@ -0,0 +1,68 @@ +#include +#include "backend-internal.h" +#include "backend-dispatched.h" + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" +#include "ggml-remoting-backend.h" + +static ggml_backend_reg_t reg = NULL; + +uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p) { + if (reg != NULL) { + FATAL("%s: already initialized :/", __func__); + } + 
ggml_backend_reg_t (* ggml_backend_reg_fct)(void) = (ggml_backend_reg_t (*)()) ggml_backend_reg_fct_p; + + reg = ggml_backend_reg_fct(); + + return APIR_BACKEND_INITIALIZE_SUCCESSS; + +} + +static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) { + UNUSED(reg); + return 0; +} + +static const char *ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) { + UNUSED(reg); + + return GGML_REMOTING_BACKEND_NAME; +} + +static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) { + UNUSED(reg); + UNUSED(device); + + return NULL; +} + +static const struct ggml_backend_reg_i ggml_backend_remoting_reg_i = { + /* .get_name = */ ggml_backend_remoting_reg_get_name, + /* .get_device_count = */ ggml_backend_remoting_reg_get_device_count, + /* .get_device = */ ggml_backend_remoting_reg_get_device, + /* .get_proc_address = */ NULL, +}; + +ggml_backend_reg_t ggml_backend_remoting_backend_reg() { + static ggml_backend_reg reg = { + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_remoting_reg_i, + /* .context = */ nullptr, + }; + + INFO("%s, hello :wave:", __func__); + + return ® +} + +uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + UNUSED(dec); + + int32_t dev_count = reg->iface.get_device_count(reg); + vn_encode_int32_t(enc, &dev_count); + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h new file mode 100644 index 0000000000000..32d9ae2a140c5 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -0,0 +1,30 @@ +#pragma once + +#include +#include + +#include + +#include "backend-utils.h" +#include "shared/venus_cs.h" +#include "shared/apir_backend.h" + +uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p); + +typedef uint32_t (*backend_dispatch_t)(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); + +/* *** */ + +uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); + +static inline const char *backend_dispatch_command_name(ApirBackendCommandType type) +{ + switch (type) { + case APIR_COMMAND_TYPE_GET_DEVICE_COUNT: return "backend_reg__get_device_count"; + default: return "unknown"; + } +} + +static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATCH_TABLE_COUNT] = { + [APIR_COMMAND_TYPE_GET_DEVICE_COUNT] = backend_reg_get_device_count, +}; diff --git a/ggml/src/ggml-remotingbackend/backend-internal.h b/ggml/src/ggml-remotingbackend/backend-internal.h index e6c098ed95175..8828f08aa1052 100644 --- a/ggml/src/ggml-remotingbackend/backend-internal.h +++ b/ggml/src/ggml-remotingbackend/backend-internal.h @@ -1,40 +1,6 @@ #include #include - -static inline void INFO(const char* fmt, ...) { - printf("INFO: "); - va_list args; - va_start(args, fmt); - vprintf(fmt, args); - va_end(args); - - printf("\n"); -} - -static inline void ERROR(const char* fmt, ...) { - printf("ERROR: "); - va_list args; - va_start(args, fmt); - vprintf(fmt, args); - va_end(args); - - printf("\n"); -} - -static inline void FATAL(const char* fmt, ...) 
{ - printf("FATAL: "); - va_list args; - va_start(args, fmt); - vprintf(fmt, args); - va_end(args); - - printf("\n"); - - if (!fmt) - return; // avoid the noreturn attribute - - exit(1); -} +#include extern "C" { uint32_t apir_backend_initialize(); diff --git a/ggml/src/ggml-remotingbackend/backend-utils.cpp b/ggml/src/ggml-remotingbackend/backend-utils.cpp new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/ggml/src/ggml-remotingbackend/backend-utils.h b/ggml/src/ggml-remotingbackend/backend-utils.h new file mode 100644 index 0000000000000..b032061a96947 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-utils.h @@ -0,0 +1,53 @@ +#pragma once + +#include +#include +#include + +#include + +#define UNUSED GGML_UNUSED + +inline void +INFO(const char *format, ...) { + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); +} + +inline void +WARNING(const char *format, ...) { + fprintf(stderr, "WARNING: "); + + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); +} + +inline void +ERROR(const char *format, ...) { + fprintf(stderr, "ERROR: "); + + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); +} + +inline void +FATAL(const char *format, ...) { + fprintf(stderr, "FATAL: "); + + va_list argptr; + va_start(argptr, format); + vfprintf(stderr, format, argptr); + fprintf(stderr, "\n"); + va_end(argptr); + if (format) + assert(false); +} diff --git a/ggml/src/ggml-remotingbackend/backend.cpp b/ggml/src/ggml-remotingbackend/backend.cpp index d858b033e3c9d..7cf24471a752e 100644 --- a/ggml/src/ggml-remotingbackend/backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend.cpp @@ -1,60 +1,21 @@ #include #include -#include "ggml-remoting-backend.h" - -#include "ggml-impl.h" -#include "ggml-backend-impl.h" -#include "ggml-backend.h" +#include +#include "backend-utils.h" #include "backend-internal.h" +#include "backend-dispatched.h" + #include "shared/apir_backend.h" #include "shared/venus_cs.h" -#define UNUSED GGML_UNUSED - -static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) { - UNUSED(reg); - return 0; -} - -static const char * ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) { - UNUSED(reg); - return GGML_REMOTING_BACKEND_NAME; -} - -static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) { - UNUSED(reg); - UNUSED(device); - - return NULL; -} - -static const struct ggml_backend_reg_i ggml_backend_remoting_reg_i = { - /* .get_name = */ ggml_backend_remoting_reg_get_name, - /* .get_device_count = */ ggml_backend_remoting_reg_get_device_count, - /* .get_device = */ ggml_backend_remoting_reg_get_device, - /* .get_proc_address = */ NULL, -}; - -ggml_backend_reg_t ggml_backend_remoting_backend_reg() { - static ggml_backend_reg reg = { - /* .api_version = */ GGML_BACKEND_API_VERSION, - /* .iface = */ ggml_backend_remoting_reg_i, - /* .context = */ nullptr, - }; +#define GGML_BACKEND_LIBRARY_PATH "/Users/kevinpouget/remoting/llama_cpp/build.remoting-backend/bin/libggml-metal.dylib" +#define GGML_BACKEND_REG_FCT_NAME "ggml_backend_metal_reg" - INFO("%s, hello :wave:", __func__); - - return ® -} - -typedef ggml_backend_reg_t (*backend_reg_fct_t)(void); +static void *backend_library_handle = NULL; -#define GGML_BACKEND_METAL_LIBRARY_PATH 
"/Users/kevinpouget/remoting/llama_cpp/build.remoting-backend/bin/libggml-metal.dylib" -#define GGML_BACKEND_METAL_REG_FCT_NAME "ggml_backend_metal_reg" -static void *backend_library_handle = NULL; extern "C" { void apir_backend_deinit(void) { @@ -69,7 +30,7 @@ extern "C" { uint32_t apir_backend_initialize() { INFO("%s: hello :wave: \\o/", __func__); - backend_library_handle = dlopen(GGML_BACKEND_METAL_LIBRARY_PATH, RTLD_LAZY); + backend_library_handle = dlopen(GGML_BACKEND_LIBRARY_PATH, RTLD_LAZY); if (!backend_library_handle) { ERROR("Cannot open library: %s\n", dlerror()); @@ -77,7 +38,7 @@ extern "C" { return APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY; } - backend_reg_fct_t entrypoint_fct = (backend_reg_fct_t) dlsym(backend_library_handle, GGML_BACKEND_METAL_REG_FCT_NAME); + void *ggml_backend_reg_fct = dlsym(backend_library_handle, GGML_BACKEND_REG_FCT_NAME); const char* dlsym_error = dlerror(); if (dlsym_error) { ERROR("Cannot load symbol: %s\n", dlsym_error); @@ -85,10 +46,7 @@ extern "C" { return APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS; } - ggml_backend_reg_t reg = entrypoint_fct(); - INFO("%s: --> %s", __func__, reg->iface.get_name(reg)); - - return APIR_BACKEND_INITIALIZE_SUCCESSS; + return backend_dispatch_initialize(ggml_backend_reg_fct); } uint32_t apir_backend_dispatcher(uint32_t cmd_type, @@ -109,23 +67,18 @@ extern "C" { }; struct vn_cs_decoder *dec = &_dec; - int32_t arg1, arg2, arg3; - vn_decode_int32_t(dec, &arg1); - vn_decode_int32_t(dec, &arg2); - vn_decode_int32_t(dec, &arg3); - - INFO("%s: ARGS %d %d %d\n", __func__, arg1, arg2, arg3); - - int32_t resp1 = 1; - int32_t resp2 = 2; - int32_t resp3 = 3; - int32_t resp4 = 4; - vn_encode_int32_t(enc, &resp1); - vn_encode_int32_t(enc, &resp2); - vn_encode_int32_t(enc, &resp3); - vn_encode_int32_t(enc, &resp4); + + if (cmd_type > APIR_BACKEND_DISPATCH_TABLE_COUNT) { + ERROR("Received an invalid dispatch index (%d > %d)\n", + cmd_type, APIR_BACKEND_DISPATCH_TABLE_COUNT); + return APIR_BACKEND_FORWARD_INDEX_INVALID; + } + + backend_dispatch_t forward_fct = apir_backend_dispatch_table[cmd_type]; + uint32_t ret = forward_fct(enc, dec); + *enc_cur_after = enc->cur; - return 0; + return ret; } } diff --git a/ggml/src/ggml-remotingbackend/shared/api_remoting.h b/ggml/src/ggml-remotingbackend/shared/api_remoting.h index 0cac78cccdfda..1df5498c29c03 100644 --- a/ggml/src/ggml-remotingbackend/shared/api_remoting.h +++ b/ggml/src/ggml-remotingbackend/shared/api_remoting.h @@ -1,13 +1,13 @@ -#define VIRGL_VK_COMMAND_TYPE_LoadLibrary 255 -#define VIRGL_VK_COMMAND_TYPE_Forward 256 +#define VIRGL_APIR_COMMAND_TYPE_LoadLibrary 255 +#define VIRGL_APIR_COMMAND_TYPE_Forward 256 static inline const char *api_remoting_command_name(int32_t type) { switch (type) { - case VIRGL_VK_COMMAND_TYPE_LoadLibrary: return "LoadLibrary"; - case VIRGL_VK_COMMAND_TYPE_Forward: return "Forward"; + case VIRGL_APIR_COMMAND_TYPE_LoadLibrary: return "LoadLibrary"; + case VIRGL_APIR_COMMAND_TYPE_Forward: return "Forward"; default: return "unknown"; } } diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 8506ffa46b759..c5a9dbd05e8dd 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -11,6 +11,8 @@ #define APIR_BACKEND_INITIALIZE_MISSING_BACKEND_SYMBOLS 3 #define APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS 4 +#define APIR_BACKEND_FORWARD_INDEX_INVALID 5 + typedef uint32_t (*apir_backend_initialize_t)(void); 
typedef void (*apir_backend_deinit_t)(void); @@ -19,3 +21,9 @@ typedef uint32_t (*apir_backend_dispatch_t)(uint32_t cmd_type, char *enc_cur, const char *enc_end, char **enc_cur_after ); + +typedef enum ApirBackendCommandType { + APIR_COMMAND_TYPE_GET_DEVICE_COUNT = 0, +} ApirBackendCommandType; + +#define APIR_BACKEND_DISPATCH_TABLE_COUNT 1 // last command_type index + 1 diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h index d9397c6d5d647..5a3ed16ad4100 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -1,5 +1,10 @@ #pragma once +#include +#include + +// needs FATAL to be defined + #define likely(x) __builtin_expect(!!(x), 1) #define unlikely(x) __builtin_expect(!!(x), 0) From 00be43f69276646391100bf66d41fb19a8f8a52d Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 13 May 2025 11:45:47 +0200 Subject: [PATCH 036/117] Add support for device name and description --- .../backend-dispatched.cpp | 30 +++ .../ggml-remotingbackend/backend-dispatched.h | 6 + .../shared/apir_backend.h | 6 +- .../ggml-remotingbackend/shared/venus_cs.h | 241 ++++++++++++------ .../ggml-backend-device.cpp | 16 +- .../ggml-backend-reg.cpp | 52 ++-- .../src/ggml-remotingfrontend/ggml-remoting.h | 6 +- .../ggml-remotingfrontend/virtgpu-forward.cpp | 72 ++++++ .../ggml-remotingfrontend/virtgpu-forward.h | 5 +- .../src/ggml-remotingfrontend/virtgpu-utils.h | 4 + 10 files changed, 319 insertions(+), 119 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp index d6ff3421a5f6c..9cee43e751ca4 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp @@ -8,6 +8,7 @@ #include "ggml-remoting-backend.h" static ggml_backend_reg_t reg = NULL; +static ggml_backend_dev_t dev = NULL; uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p) { if (reg != NULL) { @@ -16,6 +17,9 @@ uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p) { ggml_backend_reg_t (* ggml_backend_reg_fct)(void) = (ggml_backend_reg_t (*)()) ggml_backend_reg_fct_p; reg = ggml_backend_reg_fct(); + if (reg->iface.get_device_count(reg)) { + dev = reg->iface.get_device(reg, 0); + } return APIR_BACKEND_INITIALIZE_SUCCESSS; @@ -66,3 +70,29 @@ uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_de return 0; } + +uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + UNUSED(dec); + + const char *string = dev->iface.get_name(dev); + + const size_t string_size = strlen(string) + 1; + vn_encode_array_size(enc, string_size); + vn_encode_char_array(enc, string, string_size); + + return 0; +} + + +uint32_t +backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + UNUSED(dec); + + const char *string = dev->iface.get_description(dev); + + const size_t string_size = strlen(string) + 1; + vn_encode_array_size(enc, string_size); + vn_encode_char_array(enc, string, string_size); + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h index 32d9ae2a140c5..39a1d2ffa9881 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.h +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -16,15 +16,21 @@ typedef uint32_t (*backend_dispatch_t)(struct vn_cs_encoder *enc, struct vn_cs_d /* *** */ uint32_t 
backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +uint32_t backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); static inline const char *backend_dispatch_command_name(ApirBackendCommandType type) { switch (type) { case APIR_COMMAND_TYPE_GET_DEVICE_COUNT: return "backend_reg__get_device_count"; + case APIR_COMMAND_TYPE_GET_DEVICE_NAME: return "backend_reg__get_device_name"; + case APIR_COMMAND_TYPE_GET_DEVICE_DESCRIPTION: return "backend_reg__get_device_description"; default: return "unknown"; } } static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATCH_TABLE_COUNT] = { [APIR_COMMAND_TYPE_GET_DEVICE_COUNT] = backend_reg_get_device_count, + [APIR_COMMAND_TYPE_GET_DEVICE_NAME] = backend_device_get_name, + [APIR_COMMAND_TYPE_GET_DEVICE_DESCRIPTION] = backend_device_get_description, }; diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index c5a9dbd05e8dd..f8183c8f0f731 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -24,6 +24,8 @@ typedef uint32_t (*apir_backend_dispatch_t)(uint32_t cmd_type, typedef enum ApirBackendCommandType { APIR_COMMAND_TYPE_GET_DEVICE_COUNT = 0, -} ApirBackendCommandType; + APIR_COMMAND_TYPE_GET_DEVICE_NAME = 1, + APIR_COMMAND_TYPE_GET_DEVICE_DESCRIPTION = 2, -#define APIR_BACKEND_DISPATCH_TABLE_COUNT 1 // last command_type index + 1 + APIR_BACKEND_DISPATCH_TABLE_COUNT = 3, // last command_type index + 1 +} ApirBackendCommandType; diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h index 5a3ed16ad4100..ebcab98a449f4 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -3,6 +3,7 @@ #include #include +// needs UNUSED to be defined // needs FATAL to be defined #define likely(x) __builtin_expect(!!(x), 1) @@ -29,18 +30,18 @@ vn_cs_decoder_peek_internal(const struct vn_cs_decoder *dec, void *val, size_t val_size) { - assert(val_size <= size); + assert(val_size <= size); - if (unlikely(size > (size_t) (dec->end - dec->cur))) { - FATAL("DECODER IS FULL :/"); - //vn_cs_decoder_set_fatal(dec); - memset(val, 0, val_size); - return false; - } + if (unlikely(size > (size_t) (dec->end - dec->cur))) { + FATAL("DECODER IS FULL :/"); + //vn_cs_decoder_set_fatal(dec); + memset(val, 0, val_size); + return false; + } - /* we should not rely on the compiler to optimize away memcpy... */ - memcpy(val, dec->cur, val_size); - return true; + /* we should not rely on the compiler to optimize away memcpy... 
*/ + memcpy(val, dec->cur, val_size); + return true; } static inline void @@ -49,7 +50,7 @@ vn_cs_decoder_peek(const struct vn_cs_decoder *dec, void *val, size_t val_size) { - vn_cs_decoder_peek_internal(dec, size, val, val_size); + vn_cs_decoder_peek_internal(dec, size, val, val_size); } /* @@ -62,8 +63,8 @@ vn_cs_decoder_read(struct vn_cs_decoder *dec, void *val, size_t val_size) { - if (vn_cs_decoder_peek_internal(dec, size, val, val_size)) - dec->cur += size; + if (vn_cs_decoder_peek_internal(dec, size, val, val_size)) + dec->cur += size; } static inline void @@ -72,12 +73,12 @@ vn_cs_encoder_write(struct vn_cs_encoder *enc, const void *val, size_t val_size) { - assert(val_size <= size); - assert(size <= ((size_t) (enc->end - enc->cur))); + assert(val_size <= size); + assert(size <= ((size_t) (enc->end - enc->cur))); - /* we should not rely on the compiler to optimize away memcpy... */ - memcpy(enc->cur, val, val_size); - enc->cur += size; + /* we should not rely on the compiler to optimize away memcpy... */ + memcpy(enc->cur, val, val_size); + enc->cur += size; } /* @@ -87,16 +88,16 @@ vn_cs_encoder_write(struct vn_cs_encoder *enc, static inline void vn_decode(struct vn_cs_decoder *dec, size_t size, void *data, size_t data_size) { - assert(size % 4 == 0); - vn_cs_decoder_read(dec, size, data, data_size); + assert(size % 4 == 0); + vn_cs_decoder_read(dec, size, data, data_size); } static inline void vn_encode(struct vn_cs_encoder *enc, size_t size, const void *data, size_t data_size) { - assert(size % 4 == 0); - /* TODO check if the generated code is optimal */ - vn_cs_encoder_write(enc, size, data, data_size); + assert(size % 4 == 0); + /* TODO check if the generated code is optimal */ + vn_cs_encoder_write(enc, size, data, data_size); } /* @@ -108,45 +109,45 @@ vn_encode(struct vn_cs_encoder *enc, size_t size, const void *data, size_t data_ static inline size_t vn_sizeof_uint64_t(const uint64_t *val) { - assert(sizeof(*val) == 8); - return 8; + assert(sizeof(*val) == 8); + return 8; } static inline void vn_encode_uint64_t(struct vn_cs_encoder *enc, const uint64_t *val) { - vn_encode(enc, 8, val, sizeof(*val)); + vn_encode(enc, 8, val, sizeof(*val)); } static inline void vn_decode_uint64_t(struct vn_cs_decoder *dec, uint64_t *val) { - vn_decode(dec, 8, val, sizeof(*val)); + vn_decode(dec, 8, val, sizeof(*val)); } static inline size_t vn_sizeof_uint64_t_array(const uint64_t *val, uint32_t count) { - assert(sizeof(*val) == 8); - const size_t size = sizeof(*val) * count; - assert(size >= count); - return size; + assert(sizeof(*val) == 8); + const size_t size = sizeof(*val) * count; + assert(size >= count); + return size; } static inline void vn_encode_uint64_t_array(struct vn_cs_encoder *enc, const uint64_t *val, uint32_t count) { - const size_t size = sizeof(*val) * count; - assert(size >= count); - vn_encode(enc, size, val, size); + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_encode(enc, size, val, size); } static inline void vn_decode_uint64_t_array(struct vn_cs_decoder *dec, uint64_t *val, uint32_t count) { - const size_t size = sizeof(*val) * count; - assert(size >= count); - vn_decode(dec, size, val, size); + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_decode(dec, size, val, size); } /* int32_t */ @@ -154,45 +155,45 @@ vn_decode_uint64_t_array(struct vn_cs_decoder *dec, uint64_t *val, uint32_t coun static inline size_t vn_sizeof_int32_t(const int32_t *val) { - assert(sizeof(*val) == 4); - return 4; + assert(sizeof(*val) == 4); 
+ return 4; } static inline void vn_encode_int32_t(struct vn_cs_encoder *enc, const int32_t *val) { - vn_encode(enc, 4, val, sizeof(*val)); + vn_encode(enc, 4, val, sizeof(*val)); } static inline void vn_decode_int32_t(struct vn_cs_decoder *dec, int32_t *val) { - vn_decode(dec, 4, val, sizeof(*val)); + vn_decode(dec, 4, val, sizeof(*val)); } static inline size_t vn_sizeof_int32_t_array(const int32_t *val, uint32_t count) { - assert(sizeof(*val) == 4); - const size_t size = sizeof(*val) * count; - assert(size >= count); - return size; + assert(sizeof(*val) == 4); + const size_t size = sizeof(*val) * count; + assert(size >= count); + return size; } static inline void vn_encode_int32_t_array(struct vn_cs_encoder *enc, const int32_t *val, uint32_t count) { - const size_t size = sizeof(*val) * count; - assert(size >= count); - vn_encode(enc, size, val, size); + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_encode(enc, size, val, size); } static inline void vn_decode_int32_t_array(struct vn_cs_decoder *dec, int32_t *val, uint32_t count) { - const size_t size = sizeof(*val) * count; - assert(size >= count); - vn_decode(dec, size, val, size); + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_decode(dec, size, val, size); } /* array size (uint64_t) */ @@ -200,42 +201,42 @@ vn_decode_int32_t_array(struct vn_cs_decoder *dec, int32_t *val, uint32_t count) static inline size_t vn_sizeof_array_size(uint64_t size) { - return vn_sizeof_uint64_t(&size); + return vn_sizeof_uint64_t(&size); } static inline void vn_encode_array_size(struct vn_cs_encoder *enc, uint64_t size) { - vn_encode_uint64_t(enc, &size); + vn_encode_uint64_t(enc, &size); } static inline uint64_t vn_decode_array_size(struct vn_cs_decoder *dec, uint64_t expected_size) { - uint64_t size; - vn_decode_uint64_t(dec, &size); - if (size != expected_size) { - FATAL("ENCODER IS FULL :/"); - //vn_cs_decoder_set_fatal(dec); - size = 0; - } - return size; + uint64_t size; + vn_decode_uint64_t(dec, &size); + if (size != expected_size) { + FATAL("ENCODER IS FULL :/"); + //vn_cs_decoder_set_fatal(dec); + size = 0; + } + return size; } static inline uint64_t vn_decode_array_size_unchecked(struct vn_cs_decoder *dec) { - uint64_t size; - vn_decode_uint64_t(dec, &size); - return size; + uint64_t size; + vn_decode_uint64_t(dec, &size); + return size; } static inline uint64_t vn_peek_array_size(struct vn_cs_decoder *dec) { - uint64_t size; - vn_cs_decoder_peek(dec, sizeof(size), &size, sizeof(size)); - return size; + uint64_t size; + vn_cs_decoder_peek(dec, sizeof(size), &size, sizeof(size)); + return size; } /* non-array pointer */ @@ -243,20 +244,20 @@ vn_peek_array_size(struct vn_cs_decoder *dec) static inline size_t vn_sizeof_simple_pointer(const void *val) { - return vn_sizeof_array_size(val ? 1 : 0); + return vn_sizeof_array_size(val ? 1 : 0); } static inline bool vn_encode_simple_pointer(struct vn_cs_encoder *enc, const void *val) { - vn_encode_array_size(enc, val ? 1 : 0); - return val; + vn_encode_array_size(enc, val ? 
1 : 0); + return val; } static inline bool vn_decode_simple_pointer(struct vn_cs_decoder *dec) { - return vn_decode_array_size_unchecked(dec); + return vn_decode_array_size_unchecked(dec); } /* uint32_t */ @@ -264,43 +265,113 @@ vn_decode_simple_pointer(struct vn_cs_decoder *dec) static inline size_t vn_sizeof_uint32_t(const uint32_t *val) { - assert(sizeof(*val) == 4); - return 4; + assert(sizeof(*val) == 4); + return 4; } static inline void vn_encode_uint32_t(struct vn_cs_encoder *enc, const uint32_t *val) { - vn_encode(enc, 4, val, sizeof(*val)); + vn_encode(enc, 4, val, sizeof(*val)); } static inline void vn_decode_uint32_t(struct vn_cs_decoder *dec, uint32_t *val) { - vn_decode(dec, 4, val, sizeof(*val)); + vn_decode(dec, 4, val, sizeof(*val)); } static inline size_t vn_sizeof_uint32_t_array(const uint32_t *val, uint32_t count) { - assert(sizeof(*val) == 4); - const size_t size = sizeof(*val) * count; - assert(size >= count); - return size; + assert(sizeof(*val) == 4); + const size_t size = sizeof(*val) * count; + assert(size >= count); + return size; } static inline void vn_encode_uint32_t_array(struct vn_cs_encoder *enc, const uint32_t *val, uint32_t count) { - const size_t size = sizeof(*val) * count; - assert(size >= count); - vn_encode(enc, size, val, size); + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_encode(enc, size, val, size); } static inline void vn_decode_uint32_t_array(struct vn_cs_decoder *dec, uint32_t *val, uint32_t count) { - const size_t size = sizeof(*val) * count; - assert(size >= count); - vn_decode(dec, size, val, size); + const size_t size = sizeof(*val) * count; + assert(size >= count); + vn_decode(dec, size, val, size); +} + +/* opaque blob */ + +static inline size_t +vn_sizeof_blob_array(const void *val, size_t size) +{ + UNUSED(val); + return (size + 3) & ~3; +} + +static inline void +vn_encode_blob_array(struct vn_cs_encoder *enc, const void *val, size_t size) +{ + vn_encode(enc, (size + 3) & ~3, val, size); +} + +static inline void +vn_decode_blob_array(struct vn_cs_decoder *dec, void *val, size_t size) +{ + vn_decode(dec, (size + 3) & ~3, val, size); +} + +/* string */ + +static inline size_t +vn_sizeof_char_array(const char *val, size_t size) +{ + return vn_sizeof_blob_array(val, size); +} + +static inline void +vn_encode_char_array(struct vn_cs_encoder *enc, const char *val, size_t size) +{ + assert(size && strlen(val) < size); + vn_encode_blob_array(enc, val, size); +} + +static inline void +vn_decode_char_array(struct vn_cs_decoder *dec, char *val, size_t size) +{ + vn_decode_blob_array(dec, val, size); + if (size) + val[size - 1] = '\0'; + else { + //vn_cs_decoder_set_fatal(dec); + FATAL("Couldn't decode the blog array"); + } +} + +/* (temp) buffer allocation */ + +static inline void * +vkr_cs_decoder_alloc_array(struct vkr_cs_decoder *dec, size_t size, size_t count) +{ + UNUSED(dec); + size_t alloc_size; + if (unlikely(__builtin_mul_overflow(size, count, &alloc_size))) { + FATAL("overflow in array allocation of %zu * %zu bytes", size, count); + return NULL; + } + + return malloc(alloc_size); +} + +static inline void * +vn_cs_decoder_alloc_array(struct vn_cs_decoder *dec, size_t size, size_t count) +{ + struct vkr_cs_decoder *d = (struct vkr_cs_decoder *)dec; + return vkr_cs_decoder_alloc_array(d, size, count); } diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index 70bb6756b315d..f84e6bd1d2f03 100644 --- 
a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -1,19 +1,19 @@ #include "ggml-remoting.h" -static const char * ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) { - UNUSED(dev); +static const char *ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) { + IMPLEMENTED; - NOT_IMPLEMENTED; + struct virtgpu *gpu = ((struct ggml_backend_remoting_device_context *) dev->context)->gpu; - return "API Remoting"; + return apir_get_device_name(gpu); } -static const char * ggml_backend_remoting_device_get_description(ggml_backend_dev_t dev) { - UNUSED(dev); +static const char *ggml_backend_remoting_device_get_description(ggml_backend_dev_t dev) { + IMPLEMENTED; - NOT_IMPLEMENTED; + struct virtgpu *gpu = ((struct ggml_backend_remoting_device_context *) dev->context)->gpu; - return "API Remoting device"; + return apir_get_device_description(gpu); } static enum ggml_backend_dev_type ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) { diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp index a0d1480508543..216c69ced375b 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp @@ -3,38 +3,36 @@ #include "ggml-remoting.h" -static struct virtgpu *apir_gpu_instance = NULL; - -static int apir_initialize() { +static struct virtgpu *apir_initialize() { + static struct virtgpu *apir_gpu_instance = NULL; static bool apir_initialized = false; if (apir_initialized) { - if (!apir_gpu_instance) { - return 0; - } - return 1; + return apir_gpu_instance; } apir_initialized = true; apir_gpu_instance = create_virtgpu(); if (!apir_gpu_instance) { FATAL("failed to initialize the virtgpu :/"); - return 0; + return NULL; } apir_initialized = true; - return 1; + return apir_gpu_instance; } static int ggml_backend_remoting_get_device_count() { - if (!apir_initialize()) { + IMPLEMENTED; + + struct virtgpu *gpu = apir_initialize(); + if (!gpu) { WARNING("apir_initialize failed :/"); return 0; } - IMPLEMENTED; - return apir_get_device_count(apir_gpu_instance); + return apir_get_device_count(gpu); } static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) { @@ -42,7 +40,13 @@ static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) IMPLEMENTED; - return ggml_backend_remoting_get_device_count(); + struct virtgpu *gpu = apir_initialize(); + if (!gpu) { + WARNING("apir_initialize failed :/"); + return 0; + } + + return apir_get_device_count(gpu); } static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) { @@ -50,6 +54,12 @@ static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_ IMPLEMENTED; + struct virtgpu *gpu = apir_initialize(); + if (!gpu) { + WARNING("apir_initialize failed :/"); + return 0; + } + static bool initialized = false; { @@ -58,12 +68,14 @@ static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_ if (!initialized) { for (size_t i = 0; i < ggml_backend_remoting_reg_get_device_count(reg); i++) { - ggml_backend_remoting_device_context * ctx = new ggml_backend_remoting_device_context; + ggml_backend_remoting_device_context *ctx = new ggml_backend_remoting_device_context; char desc[256] = "API Remoting device"; ctx->device = i; ctx->name = GGML_REMOTING_FRONTEND_NAME + std::to_string(i); ctx->description = desc; + ctx->gpu = gpu; + 
devices.push_back(new ggml_backend_device { /* .iface = */ ggml_backend_remoting_device_i, /* .reg = */ reg, @@ -78,10 +90,9 @@ static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_ return devices[device]; } -static const char * ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) { +static const char *ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) { UNUSED(reg); - printf("reached %s\n", __func__); - //thks_bye(); + return GGML_REMOTING_FRONTEND_NAME; } @@ -93,10 +104,15 @@ static const struct ggml_backend_reg_i ggml_backend_remoting_reg_i = { }; ggml_backend_reg_t ggml_backend_remoting_frontend_reg() { + struct virtgpu *gpu = apir_initialize(); + if (!gpu) { + FATAL("apir_initialize failed :/"); + return NULL; + } static ggml_backend_reg reg = { /* .api_version = */ GGML_BACKEND_API_VERSION, /* .iface = */ ggml_backend_remoting_reg_i, - /* .context = */ nullptr, + /* .context = */ gpu, }; RMT_LOG_DEBUG("ggml_backend_remoting_frontend_reg() hello :wave:"); diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index 5a20e371f6cea..c314623d809ab 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -10,13 +10,11 @@ #include "ggml-backend.h" #include "virtgpu.h" -#define UNUSED GGML_UNUSED - #define NOT_IMPLEMENTED \ printf("WARN: ### reached unimplemented function %s\n", __func__) -#define IMPLEMENTED \ - printf("INFO: ### reached implemented function %s\n", __func__) +#define IMPLEMENTED +// printf("INFO: ### reached implemented function %s\n", __func__) #define RMT_LOG_DEBUG(msg) std::cerr << msg << std::endl diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp index a445c64929991..04167a676e9a7 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp @@ -33,3 +33,75 @@ apir_get_device_count(struct virtgpu *gpu) { return dev_count; } + + +const char * +apir_get_device_name(struct virtgpu *gpu) { + static int32_t dev_count = -1; + if (dev_count != -1) { + CACHED; + return "Nothing"; + } + + int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_GET_DEVICE_NAME; + struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); + if (!encoder) { + FATAL("%s: failed to prepare the remote call encoder :/", __func__); + } + + struct vn_cs_decoder *decoder = remote_call(gpu, encoder); + if (!decoder) { + FATAL("%s: failed to kick the remote call :/", __func__); + } + + const size_t string_size = vn_decode_array_size_unchecked(decoder); + char *string = (char *) vn_cs_decoder_alloc_array(decoder, sizeof(char), string_size); + if (!string) { + FATAL("%s: Could not allocate the device name buffer", __func__); + } + vn_decode_char_array(decoder, string, string_size); + + INFO("%s: Forward DEV NAME --> %s", __func__, string); + + int32_t ret = remote_call_finish(encoder, decoder); + if (ret != 0) { + FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); + } + + return string; +} + +const char * +apir_get_device_description(struct virtgpu *gpu) { + static int32_t dev_count = -1; + if (dev_count != -1) { + CACHED; + return "Nothing"; + } + int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_GET_DEVICE_DESCRIPTION; + struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); + if (!encoder) { + FATAL("%s: failed to 
prepare the remote call encoder :/", __func__); + } + + struct vn_cs_decoder *decoder = remote_call(gpu, encoder); + if (!decoder) { + FATAL("%s: failed to kick the remote call :/", __func__); + } + + const size_t string_size = vn_decode_array_size_unchecked(decoder); + char *string = (char *) vn_cs_decoder_alloc_array(decoder, sizeof(char), string_size); + if (!string) { + FATAL("%s: Could not allocate the device description buffer", __func__); + } + vn_decode_char_array(decoder, string, string_size); + + INFO("%s: Forward DEV DESCR --> %s", __func__, string); + + int32_t ret = remote_call_finish(encoder, decoder); + if (ret != 0) { + FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); + } + + return string; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index 28d23ededb188..383fd2ea5a642 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -1,2 +1,3 @@ -int -apir_get_device_count(struct virtgpu *gpu); +int apir_get_device_count(struct virtgpu *gpu); +const char *apir_get_device_name(struct virtgpu *gpu); +const char *apir_get_device_description(struct virtgpu *gpu); diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h index b02c3d106f7fe..a6bd5df92ea6f 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h @@ -13,6 +13,10 @@ #define unlikely(x) __builtin_expect(!!(x), 0) #define likely(x) __builtin_expect(!!(x), 1) +#ifndef UNUSED +#define UNUSED(x) (void)(x) +#endif + /** Checks is a value is a power of two. Does not handle zero. */ #define IS_POT(v) (((v) & ((v) - 1)) == 0) From 3dd26d10c6c29c0df5c52b0226f00a4302443bb1 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 13 May 2025 13:28:35 +0200 Subject: [PATCH 037/117] ggml: src: ggml-metal/ggml-metal: make less verbose --- ggml/src/ggml-metal/ggml-metal.m | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index f226826020a5a..97f426cbd3e13 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -8,6 +8,9 @@ #import +#undef GGML_LOG_DEBUG +#define GGML_LOG_DEBUG(...) + #undef MIN #undef MAX #define MIN(a, b) ((a) < (b) ? 
(a) : (b)) @@ -776,8 +779,6 @@ @implementation GGMLMetalClass GGML_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ return NULL; \ } \ - } else { \ - GGML_LOG_WARN("%s: skipping %-40s (not supported)\n", __func__, "kernel_"#name); \ } const bool has_simdgroup_mm = ctx_dev->has_simdgroup_mm; From 11f65c5f42ae3cf94707b260d18cdd08a6fd8f96 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 13 May 2025 13:30:20 +0200 Subject: [PATCH 038/117] ggml-remotingbackend: include the ggml backend initialization --- .../backend-dispatched.cpp | 18 ++++++++++++++++-- .../ggml-remotingbackend/backend-dispatched.h | 2 +- ggml/src/ggml-remotingbackend/backend.cpp | 15 +++++++++++++-- .../ggml-remotingbackend/shared/apir_backend.h | 3 ++- 4 files changed, 32 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp index 9cee43e751ca4..f6849ccf58c3b 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp @@ -7,22 +7,36 @@ #include "ggml-backend.h" #include "ggml-remoting-backend.h" +#include "ggml-metal.h" + static ggml_backend_reg_t reg = NULL; static ggml_backend_dev_t dev = NULL; +static ggml_backend_t bck = NULL; -uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p) { +uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_backend_init_fct_p) { if (reg != NULL) { FATAL("%s: already initialized :/", __func__); } ggml_backend_reg_t (* ggml_backend_reg_fct)(void) = (ggml_backend_reg_t (*)()) ggml_backend_reg_fct_p; reg = ggml_backend_reg_fct(); + if (reg == NULL) { + FATAL("%s: backend registration failed :/", __func__); + } + if (reg->iface.get_device_count(reg)) { dev = reg->iface.get_device(reg, 0); } - return APIR_BACKEND_INITIALIZE_SUCCESSS; + ggml_backend_t (* ggml_backend_fct)(void) = (ggml_backend_t (*)()) ggml_backend_init_fct_p; + bck = ggml_backend_fct(); + if (!bck) { + ERROR("%s: backend initialization failed :/", __func__); + return APIR_BACKEND_INITIALIZE_BACKEND_FAILED; + } + + return APIR_BACKEND_INITIALIZE_SUCCESSS; } static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) { diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h index 39a1d2ffa9881..86c8c7618861b 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.h +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -9,7 +9,7 @@ #include "shared/venus_cs.h" #include "shared/apir_backend.h" -uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p); +uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_backend_init_fct_p); typedef uint32_t (*backend_dispatch_t)(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); diff --git a/ggml/src/ggml-remotingbackend/backend.cpp b/ggml/src/ggml-remotingbackend/backend.cpp index 7cf24471a752e..4bafac5c28e9a 100644 --- a/ggml/src/ggml-remotingbackend/backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend.cpp @@ -12,6 +12,7 @@ #define GGML_BACKEND_LIBRARY_PATH "/Users/kevinpouget/remoting/llama_cpp/build.remoting-backend/bin/libggml-metal.dylib" #define GGML_BACKEND_REG_FCT_NAME "ggml_backend_metal_reg" +#define GGML_BACKEND_INIT_FCT_NAME "ggml_backend_metal_init" static void *backend_library_handle = NULL; @@ -28,6 +29,8 @@ extern "C" { } uint32_t apir_backend_initialize() { + const char* dlsym_error; + INFO("%s: hello 
:wave: \\o/", __func__); backend_library_handle = dlopen(GGML_BACKEND_LIBRARY_PATH, RTLD_LAZY); @@ -39,14 +42,22 @@ extern "C" { } void *ggml_backend_reg_fct = dlsym(backend_library_handle, GGML_BACKEND_REG_FCT_NAME); - const char* dlsym_error = dlerror(); + dlsym_error = dlerror(); + if (dlsym_error) { + ERROR("Cannot load symbol: %s\n", dlsym_error); + + return APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS; + } + + void *ggml_backend_init_fct = dlsym(backend_library_handle, GGML_BACKEND_INIT_FCT_NAME); + dlsym_error = dlerror(); if (dlsym_error) { ERROR("Cannot load symbol: %s\n", dlsym_error); return APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS; } - return backend_dispatch_initialize(ggml_backend_reg_fct); + return backend_dispatch_initialize(ggml_backend_reg_fct, ggml_backend_init_fct); } uint32_t apir_backend_dispatcher(uint32_t cmd_type, diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index f8183c8f0f731..08050cfc18c92 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -10,8 +10,9 @@ #define APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY 2 #define APIR_BACKEND_INITIALIZE_MISSING_BACKEND_SYMBOLS 3 #define APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS 4 +#define APIR_BACKEND_INITIALIZE_BACKEND_FAILED 5 -#define APIR_BACKEND_FORWARD_INDEX_INVALID 5 +#define APIR_BACKEND_FORWARD_INDEX_INVALID 6 typedef uint32_t (*apir_backend_initialize_t)(void); typedef void (*apir_backend_deinit_t)(void); From f9a01ef01efe2cb423cf5300baf0c647104f83ae Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 13 May 2025 13:31:16 +0200 Subject: [PATCH 039/117] remoting: include device_get_type and device_get_memory --- .../backend-dispatched.cpp | 24 +++++- .../ggml-remotingbackend/backend-dispatched.h | 6 ++ .../shared/apir_backend.h | 4 +- .../ggml-backend-device.cpp | 15 ++-- .../ggml-remotingfrontend/virtgpu-forward.cpp | 74 +++++++++++++++++++ .../ggml-remotingfrontend/virtgpu-forward.h | 2 + 6 files changed, 115 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp index f6849ccf58c3b..d00a015c99d61 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp @@ -97,7 +97,6 @@ uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder return 0; } - uint32_t backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { UNUSED(dec); @@ -110,3 +109,26 @@ backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder * return 0; } + +uint32_t +backend_device_get_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + UNUSED(dec); + + uint32_t type = dev->iface.get_type(dev); + vn_encode_uint32_t(enc, &type); + + return 0; +} + +uint32_t +backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + UNUSED(dec); + + size_t free, total; + dev->iface.get_memory(dev, &free, &total); + + vn_encode_size_t(enc, &free); + vn_encode_size_t(enc, &total); + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h index 86c8c7618861b..beeec4ee566fe 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.h +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -18,6 +18,8 @@ typedef uint32_t (*backend_dispatch_t)(struct vn_cs_encoder *enc, 
struct vn_cs_d uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); uint32_t backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +uint32_t backend_device_get_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +uint32_t backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); static inline const char *backend_dispatch_command_name(ApirBackendCommandType type) { @@ -25,6 +27,8 @@ static inline const char *backend_dispatch_command_name(ApirBackendCommandType t case APIR_COMMAND_TYPE_GET_DEVICE_COUNT: return "backend_reg__get_device_count"; case APIR_COMMAND_TYPE_GET_DEVICE_NAME: return "backend_reg__get_device_name"; case APIR_COMMAND_TYPE_GET_DEVICE_DESCRIPTION: return "backend_reg__get_device_description"; + case APIR_COMMAND_TYPE_GET_DEVICE_TYPE: return "backend_reg__get_device_type"; + case APIR_COMMAND_TYPE_GET_DEVICE_MEMORY: return "backend_reg__get_device_memory"; default: return "unknown"; } } @@ -33,4 +37,6 @@ static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATC [APIR_COMMAND_TYPE_GET_DEVICE_COUNT] = backend_reg_get_device_count, [APIR_COMMAND_TYPE_GET_DEVICE_NAME] = backend_device_get_name, [APIR_COMMAND_TYPE_GET_DEVICE_DESCRIPTION] = backend_device_get_description, + [APIR_COMMAND_TYPE_GET_DEVICE_TYPE] = backend_device_get_type, + [APIR_COMMAND_TYPE_GET_DEVICE_MEMORY] = backend_device_get_memory, }; diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 08050cfc18c92..8733b53611502 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -27,6 +27,8 @@ typedef enum ApirBackendCommandType { APIR_COMMAND_TYPE_GET_DEVICE_COUNT = 0, APIR_COMMAND_TYPE_GET_DEVICE_NAME = 1, APIR_COMMAND_TYPE_GET_DEVICE_DESCRIPTION = 2, + APIR_COMMAND_TYPE_GET_DEVICE_TYPE = 3, + APIR_COMMAND_TYPE_GET_DEVICE_MEMORY = 4, - APIR_BACKEND_DISPATCH_TABLE_COUNT = 3, // last command_type index + 1 + APIR_BACKEND_DISPATCH_TABLE_COUNT = 5, // last command_type index + 1 } ApirBackendCommandType; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index f84e6bd1d2f03..55093ae246506 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -17,20 +17,19 @@ static const char *ggml_backend_remoting_device_get_description(ggml_backend_dev } static enum ggml_backend_dev_type ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) { - UNUSED(dev); + IMPLEMENTED; - NOT_IMPLEMENTED; + struct virtgpu *gpu = ((struct ggml_backend_remoting_device_context *) dev->context)->gpu; - return GGML_BACKEND_DEVICE_TYPE_GPU; + return (enum ggml_backend_dev_type) apir_get_device_type(gpu); } -static void ggml_backend_remoting_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) { - UNUSED(device); +static void ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + IMPLEMENTED; - NOT_IMPLEMENTED; + struct virtgpu *gpu = ((struct ggml_backend_remoting_device_context *) dev->context)->gpu; - *total = 1024*1024*1024; - *free = *total; + return apir_get_device_memory(gpu, free, total); } static bool ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const 
ggml_tensor * op) { diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp index 04167a676e9a7..617299541f148 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp @@ -105,3 +105,77 @@ apir_get_device_description(struct virtgpu *gpu) { return string; } + +uint32_t +apir_get_device_type(struct virtgpu *gpu) { + static uint32_t dev_type = 255; + if (dev_type != 255) { + CACHED; + return dev_type; + } + int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_GET_DEVICE_TYPE; + + struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); + if (!encoder) { + FATAL("%s: failed to prepare the remote call encoder :/", __func__); + } + + struct vn_cs_decoder *decoder = remote_call(gpu, encoder); + if (!decoder) { + FATAL("%s: failed to kick the remote call :/", __func__); + } + + vn_decode_uint32_t(decoder, &dev_type); + + INFO("%s: Forward DEV TYPE --> %d ", __func__, dev_type); + + int32_t ret = remote_call_finish(encoder, decoder); + if (ret != 0) { + FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); + } + + return dev_type; +} + +void +apir_get_device_memory(struct virtgpu *gpu, size_t *free, size_t *total) { + static size_t dev_free = 0; + static size_t dev_total = 0; + /* + if (dev_total != 0) { + WARNING("Not sure if llama.cpp expects fresh information for the free memory ..."); + *free = dev_free; + *total = dev_total; + + CACHED; + return; + } + */ + int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_GET_DEVICE_MEMORY; + + struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); + if (!encoder) { + FATAL("%s: failed to prepare the remote call encoder :/", __func__); + } + + struct vn_cs_decoder *decoder = remote_call(gpu, encoder); + if (!decoder) { + FATAL("%s: failed to kick the remote call :/", __func__); + } + + vn_decode_size_t(decoder, &dev_free); + vn_decode_size_t(decoder, &dev_total); + + *free = dev_free; + *total = dev_total; + + INFO("%s: Forward DEV FREE mem --> %zu MB", __func__, dev_free / 1024 / 1024); + INFO("%s: Forward DEV TOTAL mem --> %zu MB", __func__, dev_total / 1024 / 1024); + + int32_t ret = remote_call_finish(encoder, decoder); + if (ret != 0) { + FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); + } + + return; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index 383fd2ea5a642..13b523b2d3fbf 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -1,3 +1,5 @@ int apir_get_device_count(struct virtgpu *gpu); const char *apir_get_device_name(struct virtgpu *gpu); const char *apir_get_device_description(struct virtgpu *gpu); +uint32_t apir_get_device_type(struct virtgpu *gpu); +void apir_get_device_memory(struct virtgpu *gpu, size_t *free, size_t *total); From 2461dc9194545e8a140a3f0b6397c37518e35ec3 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 13 May 2025 13:31:45 +0200 Subject: [PATCH 040/117] ggml: src: ggml-remotingbackend/backend: make less verbose --- ggml/src/ggml-remotingbackend/backend.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend.cpp b/ggml/src/ggml-remotingbackend/backend.cpp index 4bafac5c28e9a..9a97b97a71f7c 100644 --- a/ggml/src/ggml-remotingbackend/backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend.cpp 
@@ -64,8 +64,6 @@ extern "C" { char *dec_cur, const char *dec_end, char *enc_cur, const char *enc_end, char **enc_cur_after) { - INFO("%s: --> %d | %p | %p ", __func__, cmd_type, dec_cur, enc_cur); - struct vn_cs_encoder _enc = { .cur = enc_cur, .end = enc_end, From 9ba6e061860fc37377bb9decd5553332c61f74a0 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 13 May 2025 13:32:06 +0200 Subject: [PATCH 041/117] shared: venus_cs: add more CS functions --- .../ggml-remotingbackend/shared/venus_cs.h | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h index ebcab98a449f4..bb9cc99b7262c 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -306,6 +306,57 @@ vn_decode_uint32_t_array(struct vn_cs_decoder *dec, uint32_t *val, uint32_t coun vn_decode(dec, size, val, size); } +/* size_t */ + +static inline size_t +vn_sizeof_size_t(const size_t *val) +{ + return sizeof(*val); +} + +static inline void +vn_encode_size_t(struct vn_cs_encoder *enc, const size_t *val) +{ + const uint64_t tmp = *val; + vn_encode_uint64_t(enc, &tmp); +} + +static inline void +vn_decode_size_t(struct vn_cs_decoder *dec, size_t *val) +{ + uint64_t tmp; + vn_decode_uint64_t(dec, &tmp); + *val = tmp; +} + +static inline size_t +vn_sizeof_size_t_array(const size_t *val, uint32_t count) +{ + return vn_sizeof_size_t(val) * count; +} + +static inline void +vn_encode_size_t_array(struct vn_cs_encoder *enc, const size_t *val, uint32_t count) +{ + if (sizeof(size_t) == sizeof(uint64_t)) { + vn_encode_uint64_t_array(enc, (const uint64_t *)val, count); + } else { + for (uint32_t i = 0; i < count; i++) + vn_encode_size_t(enc, &val[i]); + } +} + +static inline void +vn_decode_size_t_array(struct vn_cs_decoder *dec, size_t *val, uint32_t count) +{ + if (sizeof(size_t) == sizeof(uint64_t)) { + vn_decode_uint64_t_array(dec, (uint64_t *)val, count); + } else { + for (uint32_t i = 0; i < count; i++) + vn_decode_size_t(dec, &val[i]); + } +} + /* opaque blob */ static inline size_t From 9d523959b4e7c8598f5dfeca4c13190593bf9c08 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 13 May 2025 13:32:29 +0200 Subject: [PATCH 042/117] ggml: src: ggml-remotingfrontend/ggml-remoting: make the NOT_IMPLEMENTED warning more visible --- ggml/src/ggml-remotingfrontend/ggml-remoting.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index c314623d809ab..986caef3f407a 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -10,8 +10,14 @@ #include "ggml-backend.h" #include "virtgpu.h" -#define NOT_IMPLEMENTED \ - printf("WARN: ### reached unimplemented function %s\n", __func__) +#define NOT_IMPLEMENTED \ + do { \ + static bool first = true; \ + if (first) { \ + printf("\nWARN: ###\nWARN: ### reached unimplemented function %s\nWARN: ###\n\n", __func__); \ + first = false; \ + } \ + } while(0) #define IMPLEMENTED // printf("INFO: ### reached implemented function %s\n", __func__) From 95ccc1a0276b485e3b03a7b7e9bc6110e5e03e77 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 13 May 2025 13:32:44 +0200 Subject: [PATCH 043/117] ggml: src: ggml-remotingfrontend/virtgpu-forward: make less verbose --- ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp index 617299541f148..59739cb0ff30f 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp @@ -1,8 +1,8 @@ #include "virtgpu.h" #include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/apir_backend.h" -#define CACHED \ - printf("INFO: ### found response in the cache %s\n", __func__) +#define CACHED +// printf("INFO: ### found response in the cache %s\n", __func__) int apir_get_device_count(struct virtgpu *gpu) { From ad578113ce1164bc880c2d8c7a47646de97abbf5 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 13 May 2025 14:12:26 +0200 Subject: [PATCH 044/117] remoting: correct the device_get_* name order --- .../ggml-remotingbackend/backend-dispatched.h | 20 +++++++++---------- .../shared/apir_backend.h | 10 +++++----- .../ggml-backend-device.cpp | 10 ++++------ .../ggml-backend-reg.cpp | 4 ++-- .../ggml-remotingfrontend/virtgpu-forward.cpp | 20 +++++++++---------- .../ggml-remotingfrontend/virtgpu-forward.h | 10 +++++----- 6 files changed, 36 insertions(+), 38 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h index beeec4ee566fe..6026b9537a1e6 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.h +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -24,19 +24,19 @@ uint32_t backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decod static inline const char *backend_dispatch_command_name(ApirBackendCommandType type) { switch (type) { - case APIR_COMMAND_TYPE_GET_DEVICE_COUNT: return "backend_reg__get_device_count"; - case APIR_COMMAND_TYPE_GET_DEVICE_NAME: return "backend_reg__get_device_name"; - case APIR_COMMAND_TYPE_GET_DEVICE_DESCRIPTION: return "backend_reg__get_device_description"; - case APIR_COMMAND_TYPE_GET_DEVICE_TYPE: return "backend_reg__get_device_type"; - case APIR_COMMAND_TYPE_GET_DEVICE_MEMORY: return "backend_reg__get_device_memory"; + case APIR_COMMAND_TYPE_DEVICE_GET_COUNT: return "backend_get_device_count"; + case APIR_COMMAND_TYPE_DEVICE_GET_NAME: return "backend_get_device_name"; + case APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION: return "backend_get_device_description"; + case APIR_COMMAND_TYPE_DEVICE_GET_TYPE: return "backend_device_get_type"; + case APIR_COMMAND_TYPE_DEVICE_GET_MEMORY: return "backend_get_device_memory"; default: return "unknown"; } } static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATCH_TABLE_COUNT] = { - [APIR_COMMAND_TYPE_GET_DEVICE_COUNT] = backend_reg_get_device_count, - [APIR_COMMAND_TYPE_GET_DEVICE_NAME] = backend_device_get_name, - [APIR_COMMAND_TYPE_GET_DEVICE_DESCRIPTION] = backend_device_get_description, - [APIR_COMMAND_TYPE_GET_DEVICE_TYPE] = backend_device_get_type, - [APIR_COMMAND_TYPE_GET_DEVICE_MEMORY] = backend_device_get_memory, + [APIR_COMMAND_TYPE_DEVICE_GET_COUNT] = backend_reg_get_device_count, + [APIR_COMMAND_TYPE_DEVICE_GET_NAME] = backend_device_get_name, + [APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION] = backend_device_get_description, + [APIR_COMMAND_TYPE_DEVICE_GET_TYPE] = backend_device_get_type, + [APIR_COMMAND_TYPE_DEVICE_GET_MEMORY] = backend_device_get_memory, }; diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 8733b53611502..4eb7816ce8ed0 100644 --- 
a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -24,11 +24,11 @@ typedef uint32_t (*apir_backend_dispatch_t)(uint32_t cmd_type, ); typedef enum ApirBackendCommandType { - APIR_COMMAND_TYPE_GET_DEVICE_COUNT = 0, - APIR_COMMAND_TYPE_GET_DEVICE_NAME = 1, - APIR_COMMAND_TYPE_GET_DEVICE_DESCRIPTION = 2, - APIR_COMMAND_TYPE_GET_DEVICE_TYPE = 3, - APIR_COMMAND_TYPE_GET_DEVICE_MEMORY = 4, + APIR_COMMAND_TYPE_DEVICE_GET_COUNT = 0, + APIR_COMMAND_TYPE_DEVICE_GET_NAME = 1, + APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION = 2, + APIR_COMMAND_TYPE_DEVICE_GET_TYPE = 3, + APIR_COMMAND_TYPE_DEVICE_GET_MEMORY = 4, APIR_BACKEND_DISPATCH_TABLE_COUNT = 5, // last command_type index + 1 } ApirBackendCommandType; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index 55093ae246506..47227e63d97e4 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -5,7 +5,7 @@ static const char *ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) struct virtgpu *gpu = ((struct ggml_backend_remoting_device_context *) dev->context)->gpu; - return apir_get_device_name(gpu); + return apir_device_get_name(gpu); } static const char *ggml_backend_remoting_device_get_description(ggml_backend_dev_t dev) { @@ -13,7 +13,7 @@ static const char *ggml_backend_remoting_device_get_description(ggml_backend_dev struct virtgpu *gpu = ((struct ggml_backend_remoting_device_context *) dev->context)->gpu; - return apir_get_device_description(gpu); + return apir_device_get_description(gpu); } static enum ggml_backend_dev_type ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) { @@ -21,7 +21,7 @@ static enum ggml_backend_dev_type ggml_backend_remoting_device_get_type(ggml_bac struct virtgpu *gpu = ((struct ggml_backend_remoting_device_context *) dev->context)->gpu; - return (enum ggml_backend_dev_type) apir_get_device_type(gpu); + return (enum ggml_backend_dev_type) apir_device_get_type(gpu); } static void ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { @@ -29,15 +29,13 @@ static void ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size struct virtgpu *gpu = ((struct ggml_backend_remoting_device_context *) dev->context)->gpu; - return apir_get_device_memory(gpu, free, total); + return apir_device_get_memory(gpu, free, total); } static bool ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { UNUSED(dev); UNUSED(op); - //NOT_IMPLEMENTED; // to chatty - return true; } diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp index 216c69ced375b..06bcb0310cbc6 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp @@ -32,7 +32,7 @@ static int ggml_backend_remoting_get_device_count() { return 0; } - return apir_get_device_count(gpu); + return apir_device_get_count(gpu); } static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) { @@ -46,7 +46,7 @@ static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) return 0; } - return apir_get_device_count(gpu); + return apir_device_get_count(gpu); } static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) { diff --git 
a/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp index 59739cb0ff30f..134ca8f58ad1a 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp @@ -5,13 +5,13 @@ // printf("INFO: ### found response in the cache %s\n", __func__) int -apir_get_device_count(struct virtgpu *gpu) { +apir_device_get_count(struct virtgpu *gpu) { static int32_t dev_count = -1; if (dev_count != -1) { CACHED; return dev_count; } - int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_GET_DEVICE_COUNT; + int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_DEVICE_GET_COUNT; struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); if (!encoder) { FATAL("%s: failed to prepare the remote call encoder :/", __func__); @@ -36,14 +36,14 @@ apir_get_device_count(struct virtgpu *gpu) { const char * -apir_get_device_name(struct virtgpu *gpu) { +apir_device_get_name(struct virtgpu *gpu) { static int32_t dev_count = -1; if (dev_count != -1) { CACHED; return "Nothing"; } - int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_GET_DEVICE_NAME; + int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_DEVICE_GET_NAME; struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); if (!encoder) { FATAL("%s: failed to prepare the remote call encoder :/", __func__); @@ -72,13 +72,13 @@ apir_get_device_name(struct virtgpu *gpu) { } const char * -apir_get_device_description(struct virtgpu *gpu) { +apir_device_get_description(struct virtgpu *gpu) { static int32_t dev_count = -1; if (dev_count != -1) { CACHED; return "Nothing"; } - int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_GET_DEVICE_DESCRIPTION; + int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION; struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); if (!encoder) { FATAL("%s: failed to prepare the remote call encoder :/", __func__); @@ -107,13 +107,13 @@ apir_get_device_description(struct virtgpu *gpu) { } uint32_t -apir_get_device_type(struct virtgpu *gpu) { +apir_device_get_type(struct virtgpu *gpu) { static uint32_t dev_type = 255; if (dev_type != 255) { CACHED; return dev_type; } - int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_GET_DEVICE_TYPE; + int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_DEVICE_GET_TYPE; struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); if (!encoder) { @@ -138,7 +138,7 @@ apir_get_device_type(struct virtgpu *gpu) { } void -apir_get_device_memory(struct virtgpu *gpu, size_t *free, size_t *total) { +apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total) { static size_t dev_free = 0; static size_t dev_total = 0; /* @@ -151,7 +151,7 @@ apir_get_device_memory(struct virtgpu *gpu, size_t *free, size_t *total) { return; } */ - int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_GET_DEVICE_MEMORY; + int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_DEVICE_GET_MEMORY; struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); if (!encoder) { diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index 13b523b2d3fbf..2edade8f289f1 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -1,5 +1,5 @@ -int apir_get_device_count(struct virtgpu *gpu); -const 
char *apir_get_device_name(struct virtgpu *gpu);
-const char *apir_get_device_description(struct virtgpu *gpu);
-uint32_t apir_get_device_type(struct virtgpu *gpu);
-void apir_get_device_memory(struct virtgpu *gpu, size_t *free, size_t *total);
+int apir_device_get_count(struct virtgpu *gpu);
+const char *apir_device_get_name(struct virtgpu *gpu);
+const char *apir_device_get_description(struct virtgpu *gpu);
+uint32_t apir_device_get_type(struct virtgpu *gpu);
+void apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total);

From 1d9d44d9534d42d1adae6703469475d0f0aaf58e Mon Sep 17 00:00:00 2001
From: Kevin Pouget
Date: Tue, 13 May 2025 15:58:04 +0200
Subject: [PATCH 045/117] remoting: add support for device_supports_op

---
 .../backend-dispatched.cpp | 11 +++++++
 .../ggml-remotingbackend/backend-dispatched.h | 4 +++
 .../shared/apir_backend.h | 3 +-
 .../ggml-remotingbackend/shared/venus_cs.h | 26 ++++++++++++++++
 .../ggml-backend-device.cpp | 7 +++--
 .../ggml-remotingfrontend/virtgpu-forward.cpp | 31 +++++++++++++++++++
 .../ggml-remotingfrontend/virtgpu-forward.h | 3 ++
 7 files changed, 81 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp
index d00a015c99d61..91d8ac4bd6fc2 100644
--- a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp
+++ b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp
@@ -132,3 +132,14 @@ backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec)
 
   return 0;
 }
+
+uint32_t
+backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) {
+  const ggml_tensor *op = vn_decode_ggml_tensor_inplace(dec);
+
+  bool supports_op = dev->iface.supports_op(dev, op);
+
+  vn_encode_bool_t(enc, &supports_op);
+
+  return 0;
+}
diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h
index 6026b9537a1e6..4974d5222ddb0 100644
--- a/ggml/src/ggml-remotingbackend/backend-dispatched.h
+++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h
@@ -7,6 +7,7 @@
 #include "backend-utils.h"
 
 #include "shared/venus_cs.h"
+#include "shared/venus_cs_ggml.h"
 #include "shared/apir_backend.h"
 
 uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_backend_init_fct_p);
@@ -20,6 +21,7 @@
 uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec);
 uint32_t backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec);
 uint32_t backend_device_get_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec);
 uint32_t backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec);
+uint32_t backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec);
 
 static inline const char *backend_dispatch_command_name(ApirBackendCommandType type)
 {
@@ -29,6 +31,7 @@ static inline const char *backend_dispatch_command_name(ApirBackendCommandType t
   case APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION: return "backend_get_device_description";
   case APIR_COMMAND_TYPE_DEVICE_GET_TYPE: return "backend_device_get_type";
   case APIR_COMMAND_TYPE_DEVICE_GET_MEMORY: return "backend_get_device_memory";
+  case APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP: return "backend_device_supports_op";
   default: return "unknown";
   }
 }
@@ -39,4 +42,5 @@ static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATC
   [APIR_COMMAND_TYPE_DEVICE_GET_COUNT] = backend_reg_get_device_count,
   [APIR_COMMAND_TYPE_DEVICE_GET_NAME] = backend_device_get_name,
   [APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION] = backend_device_get_description,
   [APIR_COMMAND_TYPE_DEVICE_GET_TYPE] = backend_device_get_type,
   [APIR_COMMAND_TYPE_DEVICE_GET_MEMORY] = backend_device_get_memory,
+  [APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP] = backend_device_supports_op,
 };
diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h
index 4eb7816ce8ed0..6949aa5429ca3 100644
--- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h
+++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h
@@ -29,6 +29,7 @@ typedef enum ApirBackendCommandType {
   APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION = 2,
   APIR_COMMAND_TYPE_DEVICE_GET_TYPE = 3,
   APIR_COMMAND_TYPE_DEVICE_GET_MEMORY = 4,
+  APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP = 5,
 
-  APIR_BACKEND_DISPATCH_TABLE_COUNT = 5, // last command_type index + 1
+  APIR_BACKEND_DISPATCH_TABLE_COUNT = 6, // last command_type index + 1
 } ApirBackendCommandType;
diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h
index bb9cc99b7262c..c41326eb93ef7 100644
--- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h
+++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h
@@ -53,6 +53,18 @@ vn_cs_decoder_peek(const struct vn_cs_decoder *dec,
   vn_cs_decoder_peek_internal(dec, size, val, val_size);
 }
 
+static inline const void *
+vn_cs_decoder_use_inplace(struct vn_cs_decoder *dec,
+                          size_t size)
+{
+  if (unlikely(size > (size_t) (dec->end - dec->cur))) {
+    FATAL("READING TOO MUCH FROM THE DECODER :/");
+  }
+  const void *addr = dec->cur;
+  dec->cur += size;
+
+  return addr;
+}
 /*
  * read/write
  */
diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp
index 47227e63d97e4..bd3b5daee46d3 100644
--- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp
+++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp
@@ -33,10 +33,11 @@ static void ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size
 }
 
 static bool ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-  UNUSED(dev);
-  UNUSED(op);
+  IMPLEMENTED;
 
-  return true;
+  struct virtgpu *gpu = ((struct ggml_backend_remoting_device_context *) dev->context)->gpu;
+
+  return apir_device_supports_op(gpu, op);
 }
 
 static bool ggml_backend_remoting_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp
index 134ca8f58ad1a..dbb42ee75a008 100644
--- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp
+++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp
@@ -1,5 +1,7 @@
+#include "ggml.h"
 #include "virtgpu.h"
 #include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/apir_backend.h"
+#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h"
 
 #define CACHED
 // printf("INFO: ### found response in the cache %s\n", __func__)
@@ -179,3 +181,32 @@ apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total) {
 
   return;
 }
+
+bool
+apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op) {
+  int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP;
+
+  struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag);
+  if (!encoder) {
+    FATAL("%s: failed to prepare the remote call encoder :/", __func__);
+  }
+
+  vn_encode_ggml_tensor(encoder, op);
+
+  struct vn_cs_decoder *decoder = remote_call(gpu, encoder);
+  if (!decoder) {
+    FATAL("%s: failed to kick the remote call :/", __func__);
+  }
+
+  bool supports_op;
+  vn_decode_bool_t(decoder, &supports_op);
+
+  /* *** */
+
+  int32_t ret = remote_call_finish(encoder, decoder);
+  if (ret != 0) {
+    FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret);
+  }
+
+  return supports_op;
+}
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h
index 2edade8f289f1..be1f783dd6c94 100644
--- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h
+++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h
@@ -1,5 +1,8 @@
+struct ggml_tensor;
+
 int apir_device_get_count(struct virtgpu *gpu);
 const char *apir_device_get_name(struct virtgpu *gpu);
 const char *apir_device_get_description(struct virtgpu *gpu);
 uint32_t apir_device_get_type(struct virtgpu *gpu);
 void apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total);
+bool apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op);

From 4a687508691815b9c3924d41e0c682487353c220 Mon Sep 17 00:00:00 2001
From: Kevin Pouget
Date: Tue, 13 May 2025 15:58:25 +0200
Subject: [PATCH 046/117] ggml/src/ggml-remotingbackend/shared/venus_cs.h: clearer message when can't read from the decoder

---
 .../ggml-remotingbackend/shared/venus_cs.h | 2 +-
 .../shared/venus_cs_ggml.h | 34 +++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)
 create mode 100644 ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h

diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h
index c41326eb93ef7..bf0439e6eee86 100644
--- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h
+++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h
@@ -33,7 +33,7 @@ vn_cs_decoder_peek_internal(const struct vn_cs_decoder *dec,
   assert(val_size <= size);
 
   if (unlikely(size > (size_t) (dec->end - dec->cur))) {
-    FATAL("DECODER IS FULL :/");
+    FATAL("READING TOO MUCH FROM THE DECODER :/");
     //vn_cs_decoder_set_fatal(dec);
     memset(val, 0, val_size);
     return false;
diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h
new file mode 100644
index 0000000000000..96f3bb2aa3346
--- /dev/null
+++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h
@@ -0,0 +1,34 @@
+// needs the ggml.h definition
+// needs venus_cs.h definition
+
+static inline void
+vn_encode_ggml_tensor(struct vn_cs_encoder *enc, const ggml_tensor *op) {
+  size_t tensor_size = sizeof(*op);
+
+  if (op->buffer || op->data || op->view_src || op->extra) {
+    FATAL("Cannot pass tensors with data");
+  }
+
+  vn_cs_encoder_write(enc, tensor_size, op, tensor_size);
+
+  for (int i = 0; op->src[i]; i++) {
+    const ggml_tensor *src_op = op->src[i];
+    vn_cs_encoder_write(enc, tensor_size, src_op, tensor_size);
+  }
+}
+
+static inline const ggml_tensor *
+vn_decode_ggml_tensor_inplace(struct vn_cs_decoder *dec) {
+
+  // it is safe to remove the `const` qualifier here, we *do* want to
+  // modify the shared memory data to fix the `src` pointers.
+ ggml_tensor *op = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); + + + for (int i = 0; op->src[i]; i++) { + ggml_tensor *src_op = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); + op->src[i] = src_op; // overwrite op->src[i] pointer with the actual location of the src tensor + } + + return op; +} From 0b77fdeaa3afdb1981098ed832cf75b33ee692fd Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 13 May 2025 15:58:57 +0200 Subject: [PATCH 047/117] ggml/src/ggml-remotingfrontend/virtgpu.cpp: make less verbose --- ggml/src/ggml-remotingfrontend/virtgpu.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp index 679d8fcae6fe6..58d70ddda28ff 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -410,10 +410,6 @@ remote_call_prepare( uint32_t reply_res_id = gpu->reply_shmem->res_id; vn_encode_uint32_t(&enc, &reply_res_id); - printf("%s: prepare %s(flags=0x%x, reply_buf=%d)\n", __func__, - api_remoting_command_name(cmd_type), - cmd_flags, reply_res_id); - return &enc; } From 8c81f0f91b1b313c438c91e8f61189bbc2c321df Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 13 May 2025 16:18:07 +0200 Subject: [PATCH 048/117] remoting: reindent and mark functions as NOT_IMPLEMENTED --- .../ggml-backend-buffer.cpp | 36 ++-- .../ggml-backend-device.cpp | 2 +- .../ggml-remotingfrontend/ggml-backend.cpp | 60 +++---- .../ggml-buffer-type.cpp | 158 +++++++++++------- .../ggml-host-buffer-type.cpp | 64 +++---- 5 files changed, 188 insertions(+), 132 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index 638203252a86d..d4cd4e013f66c 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -3,24 +3,32 @@ #include "ggml-remoting.h" void ggml_remoting_destroy_buffer(remoting_buffer& buf) { - UNUSED(buf); + NOT_IMPLEMENTED; + + UNUSED(buf); } static void ggml_remoting_buffer_write(remoting_buffer& dst, size_t offset, const void * src, size_t size) { - UNUSED(dst); - UNUSED(offset); - UNUSED(src); - UNUSED(size); + NOT_IMPLEMENTED; + + UNUSED(dst); + UNUSED(offset); + UNUSED(src); + UNUSED(size); } static void ggml_remoting_buffer_read(remoting_buffer& src, size_t offset, void * dst, size_t size) { - UNUSED(src); - UNUSED(offset); - UNUSED(dst); - UNUSED(size); + NOT_IMPLEMENTED; + + UNUSED(src); + UNUSED(offset); + UNUSED(dst); + UNUSED(size); } static void ggml_remoting_buffer_copy_async(remoting_context& ctx, remoting_buffer& dst, size_t dst_offset, remoting_buffer& src, size_t src_offset, size_t size) { + NOT_IMPLEMENTED; + UNUSED(ctx); UNUSED(dst); UNUSED(dst_offset); @@ -32,8 +40,10 @@ static void ggml_remoting_buffer_copy_async(remoting_context& ctx, remoting_buff static void * const remoting_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT static uint64_t remoting_tensor_offset(const ggml_tensor * tensor) { - if (tensor->view_src) { - return (uint8_t *) tensor->view_src->data - (uint8_t *) remoting_ptr_base; - } - return (uint8_t *) tensor->data - (uint8_t *) remoting_ptr_base; + NOT_IMPLEMENTED; + + if (tensor->view_src) { + return (uint8_t *) tensor->view_src->data - (uint8_t *) remoting_ptr_base; + } + return (uint8_t *) tensor->data - (uint8_t *) remoting_ptr_base; } diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp 
b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index bd3b5daee46d3..283070079a5c9 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -63,7 +63,7 @@ static bool ggml_backend_remoting_device_offload_op(ggml_backend_dev_t dev, cons static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_host_buffer_type(ggml_backend_dev_t dev) { UNUSED(dev); - // NOT_IMPLEMENTED; // too chatty + IMPLEMENTED; return ggml_backend_remoting_host_buffer_type(); } diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp index aac17a762ff9b..61161caa663bd 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp @@ -1,54 +1,54 @@ #include "ggml-remoting.h" static const char * ggml_backend_remoting_get_name(ggml_backend_t backend) { - UNUSED(backend); + UNUSED(backend); - return "API Remoting backend"; + return "API Remoting backend"; } static void ggml_backend_remoting_free(ggml_backend_t backend) { - UNUSED(backend); + UNUSED(backend); } static ggml_status ggml_backend_remoting_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { - UNUSED(backend); - UNUSED(cgraph); + UNUSED(backend); + UNUSED(cgraph); - return GGML_STATUS_SUCCESS; + return GGML_STATUS_SUCCESS; } static ggml_backend_i ggml_backend_remoting_interface = { - /* .get_name = */ ggml_backend_remoting_get_name, - /* .free = */ ggml_backend_remoting_free, - /* .set_tensor_async = */ NULL, // ggml_backend_remoting_set_tensor_async, - /* .get_tensor_async = */ NULL, // ggml_backend_remoting_get_tensor_async, - /* .cpy_tensor_async = */ NULL, // ggml_backend_remoting_cpy_tensor_async, - /* .synchronize = */ NULL, // ggml_backend_remoting_synchronize, - /* .graph_plan_create = */ NULL, - /* .graph_plan_free = */ NULL, - /* .graph_plan_update = */ NULL, - /* .graph_plan_compute = */ NULL, - /* .graph_compute = */ ggml_backend_remoting_graph_compute, - /* .event_record = */ NULL, - /* .event_wait = */ NULL, + /* .get_name = */ ggml_backend_remoting_get_name, + /* .free = */ ggml_backend_remoting_free, + /* .set_tensor_async = */ NULL, // ggml_backend_remoting_set_tensor_async, + /* .get_tensor_async = */ NULL, // ggml_backend_remoting_get_tensor_async, + /* .cpy_tensor_async = */ NULL, // ggml_backend_remoting_cpy_tensor_async, + /* .synchronize = */ NULL, // ggml_backend_remoting_synchronize, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_remoting_graph_compute, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, }; static ggml_guid_t ggml_backend_remoting_guid() { - static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b }; - return &guid; + static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b }; + return &guid; } ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params) { - UNUSED(params); - ggml_backend_remoting_device_context * ctx = (ggml_backend_remoting_device_context *)dev->context; + UNUSED(params); + ggml_backend_remoting_device_context * ctx = (ggml_backend_remoting_device_context *)dev->context; - ggml_backend_t remoting_backend = new ggml_backend { - /* .guid = */ ggml_backend_remoting_guid(), - /* .interface = */ 
ggml_backend_remoting_interface, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_remoting_frontend_reg(), ctx->device), - /* .context = */ ctx, - }; + ggml_backend_t remoting_backend = new ggml_backend { + /* .guid = */ ggml_backend_remoting_guid(), + /* .interface = */ ggml_backend_remoting_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_remoting_frontend_reg(), ctx->device), + /* .context = */ ctx, + }; - return remoting_backend; + return remoting_backend; } diff --git a/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp index 3d882110b9962..ea0f72fd4dba5 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp @@ -3,79 +3,104 @@ extern ggml_backend_buffer_i ggml_backend_remoting_buffer_interface; struct ggml_backend_remoting_buffer_type_context { - std::string name; + std::string name; }; static const char * ggml_backend_remoting_buffer_type_name(ggml_backend_buffer_type_t buft) { - UNUSED(buft); + UNUSED(buft); - return "Remoting buffer"; + NOT_IMPLEMENTED; + + return "Remoting buffer"; } static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - ggml_backend_remoting_buffer_type_context * ctx = (ggml_backend_remoting_buffer_type_context *) buft->context; + ggml_backend_remoting_buffer_type_context * ctx = (ggml_backend_remoting_buffer_type_context *) buft->context; + NEXT; + NOT_IMPLEMENTED; - return ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, ctx, size); + return ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, ctx, size); } static size_t ggml_backend_remoting_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - UNUSED(buft); - return 4096; + UNUSED(buft); + + NEXT; + NOT_IMPLEMENTED; + + return 4096; } static size_t ggml_backend_remoting_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { - UNUSED(buft); - return 40960; + UNUSED(buft); + + NEXT; + NOT_IMPLEMENTED; + + return 40960; } static size_t ggml_backend_remoting_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { - UNUSED(buft); - UNUSED(tensor); - return ggml_nbytes(tensor); + UNUSED(buft); + UNUSED(tensor); + + NEXT; + NOT_IMPLEMENTED; + + return ggml_nbytes(tensor); } static ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = { - /* .get_name = */ ggml_backend_remoting_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_remoting_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_remoting_buffer_type_get_alignment, - /* .get_max_size = */ ggml_backend_remoting_buffer_type_get_max_size, - /* .get_alloc_size = */ ggml_backend_remoting_buffer_type_get_alloc_size, - /* .is_host = */ NULL, + /* .get_name = */ ggml_backend_remoting_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_remoting_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_remoting_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_remoting_buffer_type_get_max_size, + /* .get_alloc_size = */ ggml_backend_remoting_buffer_type_get_alloc_size, + /* .is_host = */ NULL, }; ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { - static struct ggml_backend_buffer_type buft { - /* .iface = */ ggml_backend_remoting_buffer_type_interface, - /* .device = */ dev, - /* .context = */ new ggml_backend_remoting_buffer_type_context{ "device_name"}, - }; + 
static struct ggml_backend_buffer_type buft { + /* .iface = */ ggml_backend_remoting_buffer_type_interface, + /* .device = */ dev, + /* .context = */ new ggml_backend_remoting_buffer_type_context{ "device_name"}, + }; - return & buft; + return & buft; } static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_backend_remoting_buffer_context * ctx = (ggml_backend_remoting_buffer_context *)buffer->context; - ggml_remoting_destroy_buffer(ctx->dev_buffer); - delete ctx; + ggml_backend_remoting_buffer_context * ctx = (ggml_backend_remoting_buffer_context *)buffer->context; + NOT_IMPLEMENTED; + + ggml_remoting_destroy_buffer(ctx->dev_buffer); + delete ctx; } static enum ggml_status ggml_backend_remoting_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - if (tensor->view_src != nullptr) { - GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); - } - return GGML_STATUS_SUCCESS; + NEXT; + NOT_IMPLEMENTED; + if (tensor->view_src != nullptr) { + GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); + } + return GGML_STATUS_SUCCESS; } static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) { - return (void *) 4096; + UNUSED(buffer); + + NEXT; + NOT_IMPLEMENTED; - UNUSED(buffer); + return (void *) 4096; } static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { + NOT_IMPLEMENTED; + UNUSED(buffer); UNUSED(tensor); UNUSED(value); @@ -85,38 +110,45 @@ static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buf static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + + NOT_IMPLEMENTED; + #if 0 - ggml_backend_remoting_buffer_context * buf_ctx = (ggml_backend_remoting_buffer_context *)buffer->context; - remoting_buffer buf = buf_ctx->dev_buffer; + ggml_backend_remoting_buffer_context * buf_ctx = (ggml_backend_remoting_buffer_context *)buffer->context; + remoting_buffer buf = buf_ctx->dev_buffer; - ggml_remoting_buffer_write(buf, remoting_tensor_offset(tensor) + tensor->view_offs + offset, data, size); + ggml_remoting_buffer_write(buf, remoting_tensor_offset(tensor) + tensor->view_offs + offset, data, size); #else - UNUSED(buffer); - UNUSED(tensor); - UNUSED(data); - UNUSED(offset); - UNUSED(size); + UNUSED(buffer); + UNUSED(tensor); + UNUSED(data); + UNUSED(offset); + UNUSED(size); #endif } static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + NOT_IMPLEMENTED; + #if 0 - ggml_backend_remoting_buffer_context * buf_ctx = (ggml_backend_remoting_buffer_context *)buffer->context; + ggml_backend_remoting_buffer_context * buf_ctx = (ggml_backend_remoting_buffer_context *)buffer->context; - remoting_buffer buf = buf_ctx->dev_buffer; + remoting_buffer buf = buf_ctx->dev_buffer; - ggml_remoting_buffer_read(buf, remoting_tensor_offset(tensor) + tensor->view_offs + offset, data, size); + ggml_remoting_buffer_read(buf, remoting_tensor_offset(tensor) + tensor->view_offs + offset, data, size); #else - UNUSED(buffer); - UNUSED(tensor); - UNUSED(data); - UNUSED(offset); - UNUSED(size); + UNUSED(buffer); + UNUSED(tensor); + UNUSED(data); + UNUSED(offset); + UNUSED(size); #endif } static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { + 
NOT_IMPLEMENTED; + return true; UNUSED(buffer); @@ -125,6 +157,8 @@ static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer } static void ggml_remoting_buffer_memset(remoting_buffer& dst, size_t offset, uint32_t c, size_t size) { + NOT_IMPLEMENTED; + UNUSED(dst); UNUSED(c); UNUSED(size); @@ -132,6 +166,8 @@ static void ggml_remoting_buffer_memset(remoting_buffer& dst, size_t offset, uin } static void ggml_remoting_buffer_memset_async(remoting_context& ctx, remoting_buffer& dst, size_t offset, uint32_t c, size_t size) { + NOT_IMPLEMENTED; + UNUSED(ctx); UNUSED(dst); UNUSED(c); @@ -140,19 +176,21 @@ static void ggml_remoting_buffer_memset_async(remoting_context& ctx, remoting_bu } static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - ggml_backend_remoting_buffer_context * ctx = (ggml_backend_remoting_buffer_context *)buffer->context; + NOT_IMPLEMENTED; + + ggml_backend_remoting_buffer_context * ctx = (ggml_backend_remoting_buffer_context *)buffer->context; - ggml_remoting_buffer_memset(ctx->dev_buffer, 0, value, buffer->size); + ggml_remoting_buffer_memset(ctx->dev_buffer, 0, value, buffer->size); } ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { - /* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer, - /* .get_base = */ ggml_backend_remoting_buffer_get_base, - /* .init_tensor = */ ggml_backend_remoting_buffer_init_tensor, - /* .memset_tensor = */ ggml_backend_remoting_buffer_memset_tensor, - /* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor, - /* .cpy_tensor = */ ggml_backend_remoting_buffer_cpy_tensor, - /* .clear = */ ggml_backend_remoting_buffer_clear, - /* .reset = */ NULL, + /* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer, + /* .get_base = */ ggml_backend_remoting_buffer_get_base, + /* .init_tensor = */ ggml_backend_remoting_buffer_init_tensor, + /* .memset_tensor = */ ggml_backend_remoting_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_remoting_buffer_cpy_tensor, + /* .clear = */ ggml_backend_remoting_buffer_clear, + /* .reset = */ NULL, }; diff --git a/ggml/src/ggml-remotingfrontend/ggml-host-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-host-buffer-type.cpp index fbf5569788c40..bcbd3fa57f156 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-host-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-host-buffer-type.cpp @@ -3,53 +3,61 @@ // host buffer type static const char * ggml_backend_remoting_host_buffer_type_name(ggml_backend_buffer_type_t buft) { - return GGML_REMOTING_FRONTEND_NAME "_Host"; + UNUSED(buft); + + NOT_IMPLEMENTED; - UNUSED(buft); + return GGML_REMOTING_FRONTEND_NAME "_Host"; } static void ggml_backend_remoting_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { + UNUSED(buffer); + NOT_IMPLEMENTED; + # if 0 - ggml_remoting_host_free(remoting_instance.devices[0], buffer->context); + ggml_remoting_host_free(remoting_instance.devices[0], buffer->context); #endif - UNUSED(buffer); } static ggml_backend_buffer_t ggml_backend_remoting_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + UNUSED(buft); + + NOT_IMPLEMENTED; - void *ptr = nullptr; - ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); - buffer->buft = buft; - buffer->iface.free_buffer = ggml_backend_remoting_host_buffer_free_buffer; 
+ void *ptr = nullptr; + ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); + buffer->buft = buft; + buffer->iface.free_buffer = ggml_backend_remoting_host_buffer_free_buffer; - return buffer; - UNUSED(buft); + return buffer; } static size_t ggml_backend_remoting_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { UNUSED(buft); + + NOT_IMPLEMENTED; return 4096; } // Should be changed to return device-specific host buffer type // but that probably requires changes in llama.cpp ggml_backend_buffer_type_t ggml_backend_remoting_host_buffer_type() { - static struct ggml_backend_buffer_type ggml_backend_remoting_buffer_type_host = { - /* .iface = */ { - /* .get_name = */ ggml_backend_remoting_host_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_remoting_host_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_remoting_host_buffer_type_get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX - /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, - /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, - }, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_remoting_frontend_reg(), 0), - /* .context = */ nullptr, - }; - - // Make sure device 0 is initialized - //ggml_remoting_instance_init(); - //ggml_remoting_get_device(0); - - return &ggml_backend_remoting_buffer_type_host; + static struct ggml_backend_buffer_type ggml_backend_remoting_buffer_type_host = { + /* .iface = */ { + /* .get_name = */ ggml_backend_remoting_host_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_remoting_host_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_remoting_host_buffer_type_get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, + /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_remoting_frontend_reg(), 0), + /* .context = */ nullptr, + }; + + // Make sure device 0 is initialized + //ggml_remoting_instance_init(); + //ggml_remoting_get_device(0); + + return &ggml_backend_remoting_buffer_type_host; } From 319af57a0cacf9bd581753f1172ad1b6acb6a207 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 14 May 2025 14:53:27 +0200 Subject: [PATCH 049/117] Add buffer-type support --- ggml/src/ggml-remotingbackend/CMakeLists.txt | 2 + .../backend-dispatched-buffer-type.cpp | 57 ++++++++ .../backend-dispatched-device.cpp | 90 ++++++++++++ .../backend-dispatched.cpp | 81 ++--------- .../ggml-remotingbackend/backend-dispatched.h | 59 +++++--- .../ggml-remotingbackend/backend-internal.h | 16 +++ .../shared/apir_backend.h | 33 +++-- .../ggml-remotingbackend/shared/venus_cs.h | 14 ++ .../shared/venus_cs_ggml.h | 19 ++- ggml/src/ggml-remotingfrontend/CMakeLists.txt | 3 +- .../ggml-backend-device.cpp | 57 ++++++-- .../ggml-remotingfrontend/ggml-backend.cpp | 9 ++ .../ggml-buffer-type.cpp | 85 +++++------ .../src/ggml-remotingfrontend/ggml-remoting.h | 8 ++ .../virtgpu-forward-buffer-type.cpp | 135 ++++++++++++++++++ ...forward.cpp => virtgpu-forward-device.cpp} | 30 +++- .../ggml-remotingfrontend/virtgpu-forward.h | 15 +- 17 files changed, 548 insertions(+), 165 deletions(-) create mode 100644 ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp create mode 100644 ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp create mode 100644 ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp rename 
ggml/src/ggml-remotingfrontend/{virtgpu-forward.cpp => virtgpu-forward-device.cpp} (88%) diff --git a/ggml/src/ggml-remotingbackend/CMakeLists.txt b/ggml/src/ggml-remotingbackend/CMakeLists.txt index 7435c7726beee..fb2504870e6d2 100644 --- a/ggml/src/ggml-remotingbackend/CMakeLists.txt +++ b/ggml/src/ggml-remotingbackend/CMakeLists.txt @@ -6,6 +6,8 @@ message(STATUS "Enable API Remoting backend") ggml_add_backend_library(ggml-remotingbackend backend.cpp backend-dispatched.cpp + backend-dispatched-device.cpp + backend-dispatched-buffer-type.cpp backend-utils.cpp shared/api_remoting.h shared/apir_backend.h diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp new file mode 100644 index 0000000000000..979448bd218ab --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp @@ -0,0 +1,57 @@ +#include +#include "backend-internal.h" +#include "backend-dispatched.h" + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" +#include "ggml-remoting-backend.h" + +#include "ggml-metal.h" + +uint32_t +backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + ggml_backend_buffer_type_t buft; + buft = vn_decode_ggml_buft(dec); + + const char *string = buft->iface.get_name(buft); + + const size_t string_size = strlen(string) + 1; + vn_encode_array_size(enc, string_size); + vn_encode_char_array(enc, string, string_size); + + return 0; +} + +uint32_t +backend_buffer_type_get_alignment(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + ggml_backend_buffer_type_t buft; + buft = vn_decode_ggml_buft(dec); + + size_t value = buft->iface.get_alignment(buft); + vn_encode_size_t(enc, &value); + + return 0; +} + +uint32_t +backend_buffer_type_get_max_size(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + ggml_backend_buffer_type_t buft; + buft = vn_decode_ggml_buft(dec); + + size_t value = buft->iface.get_max_size(buft); + vn_encode_size_t(enc, &value); + + return 0; +} + +uint32_t +backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + ggml_backend_buffer_type_t buft; + buft = vn_decode_ggml_buft(dec); + + bool is_host = buft->iface.is_host(buft); + vn_encode_bool_t(enc, &is_host); + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp new file mode 100644 index 0000000000000..627aa4685c773 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp @@ -0,0 +1,90 @@ +#include +#include "backend-internal.h" +#include "backend-dispatched.h" + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" +#include "ggml-remoting-backend.h" + +#include "ggml-metal.h" + +uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + UNUSED(dec); + + int32_t dev_count = reg->iface.get_device_count(reg); + vn_encode_int32_t(enc, &dev_count); + + return 0; +} + +uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + UNUSED(dec); + + const char *string = dev->iface.get_name(dev); + + const size_t string_size = strlen(string) + 1; + vn_encode_array_size(enc, string_size); + vn_encode_char_array(enc, string, string_size); + + return 0; +} + +uint32_t +backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + UNUSED(dec); + + const char *string = 
dev->iface.get_description(dev); + + const size_t string_size = strlen(string) + 1; + vn_encode_array_size(enc, string_size); + vn_encode_char_array(enc, string, string_size); + + return 0; +} + +uint32_t +backend_device_get_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + UNUSED(dec); + + uint32_t type = dev->iface.get_type(dev); + vn_encode_uint32_t(enc, &type); + + return 0; +} + +uint32_t +backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + UNUSED(dec); + + size_t free, total; + dev->iface.get_memory(dev, &free, &total); + + vn_encode_size_t(enc, &free); + vn_encode_size_t(enc, &total); + + return 0; +} + +uint32_t +backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + const ggml_tensor *op = vn_decode_ggml_tensor_inplace(dec); + + bool supports_op = dev->iface.supports_op(dev, op); + + vn_encode_bool_t(enc, &supports_op); + + return 0; +} + +uint32_t +backend_device_get_buffer_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + UNUSED(dec); + + ggml_backend_buffer_type_t bufft = dev->iface.get_buffer_type(dev); + + apir_buffer_type_context_t bufft_ctx = (apir_buffer_type_context_t) bufft; + vn_encode_apir_buffer_type_context_t(enc, &bufft_ctx); + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp index 91d8ac4bd6fc2..bea07682256ac 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp @@ -9,9 +9,9 @@ #include "ggml-metal.h" -static ggml_backend_reg_t reg = NULL; -static ggml_backend_dev_t dev = NULL; -static ggml_backend_t bck = NULL; +ggml_backend_reg_t reg = NULL; +ggml_backend_dev_t dev = NULL; +ggml_backend_t bck = NULL; uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_backend_init_fct_p) { if (reg != NULL) { @@ -41,12 +41,17 @@ uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_ba static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) { UNUSED(reg); + + NOT_IMPLEMENTED; + return 0; } static const char *ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) { UNUSED(reg); + NOT_IMPLEMENTED; + return GGML_REMOTING_BACKEND_NAME; } @@ -54,6 +59,8 @@ static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_ UNUSED(reg); UNUSED(device); + NOT_IMPLEMENTED; + return NULL; } @@ -75,71 +82,3 @@ ggml_backend_reg_t ggml_backend_remoting_backend_reg() { return ® } - -uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { - UNUSED(dec); - - int32_t dev_count = reg->iface.get_device_count(reg); - vn_encode_int32_t(enc, &dev_count); - - return 0; -} - -uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { - UNUSED(dec); - - const char *string = dev->iface.get_name(dev); - - const size_t string_size = strlen(string) + 1; - vn_encode_array_size(enc, string_size); - vn_encode_char_array(enc, string, string_size); - - return 0; -} - -uint32_t -backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { - UNUSED(dec); - - const char *string = dev->iface.get_description(dev); - - const size_t string_size = strlen(string) + 1; - vn_encode_array_size(enc, string_size); - vn_encode_char_array(enc, string, string_size); - - return 0; -} - -uint32_t -backend_device_get_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { - UNUSED(dec); - - uint32_t type 
= dev->iface.get_type(dev); - vn_encode_uint32_t(enc, &type); - - return 0; -} - -uint32_t -backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { - UNUSED(dec); - - size_t free, total; - dev->iface.get_memory(dev, &free, &total); - - vn_encode_size_t(enc, &free); - vn_encode_size_t(enc, &total); - - return 0; -} - -uint32_t -backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { - const ggml_tensor *op = vn_decode_ggml_tensor_inplace(dec); - - bool supports_op = dev->iface.supports_op(dev, op); - - vn_encode_bool_t(enc, &supports_op); - - return 0; -} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h index 4974d5222ddb0..30e3dded013de 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.h +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -6,9 +6,10 @@ #include #include "backend-utils.h" +#include "shared/apir_backend.h" #include "shared/venus_cs.h" #include "shared/venus_cs_ggml.h" -#include "shared/apir_backend.h" + uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_backend_init_fct_p); @@ -17,30 +18,56 @@ typedef uint32_t (*backend_dispatch_t)(struct vn_cs_encoder *enc, struct vn_cs_d /* *** */ uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); + +/* device */ uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); uint32_t backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); uint32_t backend_device_get_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); uint32_t backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); uint32_t backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +uint32_t backend_device_get_buffer_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); + +/* buffer-type */ +uint32_t backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +uint32_t backend_buffer_type_get_alignment(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +uint32_t backend_buffer_type_get_max_size(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +uint32_t backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); static inline const char *backend_dispatch_command_name(ApirBackendCommandType type) { - switch (type) { - case APIR_COMMAND_TYPE_DEVICE_GET_COUNT: return "backend_get_device_count"; - case APIR_COMMAND_TYPE_DEVICE_GET_NAME: return "backend_get_device_name"; - case APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION: return "backend_get_device_description"; - case APIR_COMMAND_TYPE_DEVICE_GET_TYPE: return "backend_device_get_type"; - case APIR_COMMAND_TYPE_DEVICE_GET_MEMORY: return "backend_get_device_memory"; - case APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP: return "backend_device_supports_op"; - default: return "unknown"; - } + switch (type) { + /* device */ + case APIR_COMMAND_TYPE_DEVICE_GET_COUNT: return "backend_get_device_count"; + case APIR_COMMAND_TYPE_DEVICE_GET_NAME: return "backend_get_device_name"; + case APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION: return "backend_get_device_description"; + case APIR_COMMAND_TYPE_DEVICE_GET_TYPE: return "backend_device_get_type"; + case APIR_COMMAND_TYPE_DEVICE_GET_MEMORY: return "backend_get_device_memory"; + case APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP: return "backend_device_supports_op"; + case APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE: return 
"backend_get_buffer_type"; + + /* buffer-type */ + case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME: return "backend_buffer_type_get_name"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT: return "backend_buffer_type_get_alignment"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE: return "backend_buffer_type_get_max_size"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST: return "backend_buffer_type_is_host"; + + default: return "unknown"; + } } static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATCH_TABLE_COUNT] = { - [APIR_COMMAND_TYPE_DEVICE_GET_COUNT] = backend_reg_get_device_count, - [APIR_COMMAND_TYPE_DEVICE_GET_NAME] = backend_device_get_name, - [APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION] = backend_device_get_description, - [APIR_COMMAND_TYPE_DEVICE_GET_TYPE] = backend_device_get_type, - [APIR_COMMAND_TYPE_DEVICE_GET_MEMORY] = backend_device_get_memory, - [APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP] = backend_device_supports_op, + /* device */ + [APIR_COMMAND_TYPE_DEVICE_GET_COUNT] = backend_reg_get_device_count, + [APIR_COMMAND_TYPE_DEVICE_GET_NAME] = backend_device_get_name, + [APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION] = backend_device_get_description, + [APIR_COMMAND_TYPE_DEVICE_GET_TYPE] = backend_device_get_type, + [APIR_COMMAND_TYPE_DEVICE_GET_MEMORY] = backend_device_get_memory, + [APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP] = backend_device_supports_op, + [APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE] = backend_device_get_buffer_type, + + /* buffer-type */ + [APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME] = backend_buffer_type_get_name, + [APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT] = backend_buffer_type_get_alignment, + [APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE] = backend_buffer_type_get_max_size, + [APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST] = backend_buffer_type_is_host, }; diff --git a/ggml/src/ggml-remotingbackend/backend-internal.h b/ggml/src/ggml-remotingbackend/backend-internal.h index 8828f08aa1052..7fd803c2aa5dd 100644 --- a/ggml/src/ggml-remotingbackend/backend-internal.h +++ b/ggml/src/ggml-remotingbackend/backend-internal.h @@ -2,6 +2,22 @@ #include #include +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" + +extern ggml_backend_reg_t reg; +extern ggml_backend_dev_t dev; + +#define NOT_IMPLEMENTED \ + do { \ + static bool first = true; \ + if (first) { \ + printf("\nWARN: ###\nWARN: ### reached unimplemented function %s\nWARN: ###\n\n", __func__); \ + first = false; \ + } \ + } while(0) + extern "C" { uint32_t apir_backend_initialize(); void apir_backend_deinit(void); diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 6949aa5429ca3..0917da7d0e4af 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -14,22 +14,33 @@ #define APIR_BACKEND_FORWARD_INDEX_INVALID 6 +typedef void * apir_buffer_type_context_t; + typedef uint32_t (*apir_backend_initialize_t)(void); typedef void (*apir_backend_deinit_t)(void); typedef uint32_t (*apir_backend_dispatch_t)(uint32_t cmd_type, - char *dec_cur, const char *dec_end, - char *enc_cur, const char *enc_end, - char **enc_cur_after + char *dec_cur, const char *dec_end, + char *enc_cur, const char *enc_end, + char **enc_cur_after ); typedef enum ApirBackendCommandType { - APIR_COMMAND_TYPE_DEVICE_GET_COUNT = 0, - APIR_COMMAND_TYPE_DEVICE_GET_NAME = 1, - APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION = 2, - APIR_COMMAND_TYPE_DEVICE_GET_TYPE = 3, - 
APIR_COMMAND_TYPE_DEVICE_GET_MEMORY = 4, - APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP = 5, - - APIR_BACKEND_DISPATCH_TABLE_COUNT = 6, // last command_type index + 1 + /* device */ + APIR_COMMAND_TYPE_DEVICE_GET_COUNT = 0, + APIR_COMMAND_TYPE_DEVICE_GET_NAME = 1, + APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION = 2, + APIR_COMMAND_TYPE_DEVICE_GET_TYPE = 3, + APIR_COMMAND_TYPE_DEVICE_GET_MEMORY = 4, + APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP = 5, + APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE = 6, + + /* buffer-type */ + APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME = 7, + APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT = 8, + APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE = 9, + APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST = 10, + + // last command_type index + 1 + APIR_BACKEND_DISPATCH_TABLE_COUNT = 11, } ApirBackendCommandType; diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h index bf0439e6eee86..c796cd3f8e893 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -452,3 +452,17 @@ vn_decode_bool_t(struct vn_cs_decoder *dec, bool *val) { vn_decode(dec, sizeof(int), val, sizeof(int)); } + +/* apir_buffer_type_context_t */ + +static inline void +vn_encode_apir_buffer_type_context_t(struct vn_cs_encoder *enc, const apir_buffer_type_context_t *val) +{ + vn_encode(enc, sizeof(apir_buffer_type_context_t), val, sizeof(apir_buffer_type_context_t)); +} + +static inline void +vn_decode_apir_buffer_type_context_t(struct vn_cs_decoder *dec, apir_buffer_type_context_t *val) +{ + vn_decode(dec, sizeof(apir_buffer_type_context_t), val, sizeof(apir_buffer_type_context_t)); +} diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h index 96f3bb2aa3346..4302424aadce0 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h @@ -1,4 +1,4 @@ -// needs the ggml.h definition +// needs the ggml-backend-impl.h definition // needs venus_cs.h definition static inline void @@ -32,3 +32,20 @@ vn_decode_ggml_tensor_inplace(struct vn_cs_decoder *dec) { return op; } + +static inline void +vn_encode_ggml_buft(struct vn_cs_encoder *enc, ggml_backend_buffer_type_t buft) { + size_t buft_ctx_size = sizeof(buft->context); + + vn_cs_encoder_write(enc, buft_ctx_size, &buft->context, buft_ctx_size); +} + +static inline ggml_backend_buffer_type_t +vn_decode_ggml_buft(struct vn_cs_decoder *dec) { + ggml_backend_buffer_type_t buft; + size_t buft_size = sizeof(buft); + + vn_cs_decoder_read(dec, buft_size, &buft, buft_size); + + return buft; +} diff --git a/ggml/src/ggml-remotingfrontend/CMakeLists.txt b/ggml/src/ggml-remotingfrontend/CMakeLists.txt index df45db51f46b3..accdbc473ecc7 100644 --- a/ggml/src/ggml-remotingfrontend/CMakeLists.txt +++ b/ggml/src/ggml-remotingfrontend/CMakeLists.txt @@ -13,7 +13,8 @@ ggml_add_backend_library(ggml-remotingfrontend virtgpu.cpp virtgpu-shm.cpp virtgpu-utils.cpp - virtgpu-forward.cpp + virtgpu-forward-device.cpp + virtgpu-forward-buffer-type.cpp ../../include/ggml-remoting-frontend.h ) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index 283070079a5c9..c0c98c8b8a511 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -1,46 +1,55 @@ #include "ggml-remoting.h" -static const char 
*ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) { +#define DEV_TO_GPU(name) \ + ((struct ggml_backend_remoting_device_context *) (name)->context)->gpu + +static const char * +ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) { IMPLEMENTED; - struct virtgpu *gpu = ((struct ggml_backend_remoting_device_context *) dev->context)->gpu; + struct virtgpu *gpu = DEV_TO_GPU(dev); return apir_device_get_name(gpu); } -static const char *ggml_backend_remoting_device_get_description(ggml_backend_dev_t dev) { +static const char * +ggml_backend_remoting_device_get_description(ggml_backend_dev_t dev) { IMPLEMENTED; - struct virtgpu *gpu = ((struct ggml_backend_remoting_device_context *) dev->context)->gpu; + struct virtgpu *gpu = DEV_TO_GPU(dev); return apir_device_get_description(gpu); } -static enum ggml_backend_dev_type ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) { +static enum ggml_backend_dev_type +ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) { IMPLEMENTED; - struct virtgpu *gpu = ((struct ggml_backend_remoting_device_context *) dev->context)->gpu; + struct virtgpu *gpu = DEV_TO_GPU(dev); return (enum ggml_backend_dev_type) apir_device_get_type(gpu); } -static void ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { +static void +ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { IMPLEMENTED; - struct virtgpu *gpu = ((struct ggml_backend_remoting_device_context *) dev->context)->gpu; + struct virtgpu *gpu = DEV_TO_GPU(dev); return apir_device_get_memory(gpu, free, total); } -static bool ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { +static bool +ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { IMPLEMENTED; - struct virtgpu *gpu = ((struct ggml_backend_remoting_device_context *) dev->context)->gpu; + struct virtgpu *gpu = DEV_TO_GPU(dev); return apir_device_supports_op(gpu, op); } -static bool ggml_backend_remoting_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { +static bool +ggml_backend_remoting_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { UNUSED(dev); UNUSED(buft); @@ -49,7 +58,8 @@ static bool ggml_backend_remoting_device_supports_buft(ggml_backend_dev_t dev, g return true; } -static bool ggml_backend_remoting_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { +static bool +ggml_backend_remoting_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { const int min_batch_size = 32; NOT_IMPLEMENTED; @@ -60,7 +70,8 @@ static bool ggml_backend_remoting_device_offload_op(ggml_backend_dev_t dev, cons UNUSED(dev); } -static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_host_buffer_type(ggml_backend_dev_t dev) { +static ggml_backend_buffer_type_t +ggml_backend_remoting_device_get_host_buffer_type(ggml_backend_dev_t dev) { UNUSED(dev); IMPLEMENTED; @@ -69,9 +80,10 @@ static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_host_buffer_t } -static void ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { - +static void +ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { IMPLEMENTED; + props->name = ggml_backend_remoting_device_get_name(dev); props->description = ggml_backend_remoting_device_get_description(dev); props->type = 
ggml_backend_remoting_device_get_type(dev); @@ -84,6 +96,21 @@ static void ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, struc }; } +ggml_backend_buffer_type_t +ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { + struct virtgpu *gpu = DEV_TO_GPU(dev); + + apir_buffer_type_context_t ctx = apir_device_get_buffer_type(gpu); + + static struct ggml_backend_buffer_type buft { + /* .iface = */ ggml_backend_remoting_buffer_type_interface, + /* .device = */ dev, + /* .context = */ ctx, + }; + + return &buft; +} + const struct ggml_backend_device_i ggml_backend_remoting_device_i = { /* .get_name = */ ggml_backend_remoting_device_get_name, /* .get_description = */ ggml_backend_remoting_device_get_description, diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp index 61161caa663bd..6c2f2b947e10b 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp @@ -3,11 +3,15 @@ static const char * ggml_backend_remoting_get_name(ggml_backend_t backend) { UNUSED(backend); + NOT_IMPLEMENTED; + return "API Remoting backend"; } static void ggml_backend_remoting_free(ggml_backend_t backend) { UNUSED(backend); + + NOT_IMPLEMENTED; } static ggml_status ggml_backend_remoting_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { @@ -35,12 +39,17 @@ static ggml_backend_i ggml_backend_remoting_interface = { static ggml_guid_t ggml_backend_remoting_guid() { static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b }; + + NOT_IMPLEMENTED; + return &guid; } ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params) { UNUSED(params); + IMPLEMENTED; + ggml_backend_remoting_device_context * ctx = (ggml_backend_remoting_device_context *)dev->context; ggml_backend_t remoting_backend = new ggml_backend { diff --git a/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp index ea0f72fd4dba5..d34904abb1ef0 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp @@ -1,79 +1,70 @@ #include "ggml-remoting.h" -extern ggml_backend_buffer_i ggml_backend_remoting_buffer_interface; - -struct ggml_backend_remoting_buffer_type_context { - std::string name; -}; +#define BUFT_TO_GPU(name) \ + ((struct ggml_backend_remoting_device_context *) (name)->device->context)->gpu +extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface; -static const char * ggml_backend_remoting_buffer_type_name(ggml_backend_buffer_type_t buft) { - UNUSED(buft); +static ggml_backend_buffer_t +ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + BEING_IMPLEMENTED; + struct virtgpu *gpu = BUFT_TO_GPU(buft); + UNUSED(gpu); + /* ... 
*/ - NOT_IMPLEMENTED; + void *ctx = NULL; - return "Remoting buffer"; + return ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, ctx, size); } -static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - ggml_backend_remoting_buffer_type_context * ctx = (ggml_backend_remoting_buffer_type_context *) buft->context; +static const char * +ggml_backend_remoting_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + BEING_IMPLEMENTED; - NEXT; - NOT_IMPLEMENTED; + struct virtgpu *gpu = BUFT_TO_GPU(buft); - return ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, ctx, size); + return apir_buffer_type_get_name(gpu, buft); } -static size_t ggml_backend_remoting_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - UNUSED(buft); +static size_t +ggml_backend_remoting_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + IMPLEMENTED; - NEXT; - NOT_IMPLEMENTED; + struct virtgpu *gpu = BUFT_TO_GPU(buft); - return 4096; + return apir_buffer_type_get_alignment(gpu, buft); } -static size_t ggml_backend_remoting_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { - UNUSED(buft); - - NEXT; - NOT_IMPLEMENTED; +static size_t +ggml_backend_remoting_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + IMPLEMENTED; + struct virtgpu *gpu = BUFT_TO_GPU(buft); - return 40960; + return apir_buffer_type_get_max_size(gpu, buft); } -static size_t ggml_backend_remoting_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { - UNUSED(buft); - UNUSED(tensor); - - NEXT; - NOT_IMPLEMENTED; +static bool +ggml_backend_remoting_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + IMPLEMENTED; + struct virtgpu *gpu = BUFT_TO_GPU(buft); - return ggml_nbytes(tensor); + return apir_buffer_type_is_host(gpu, buft); } -static ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = { - /* .get_name = */ ggml_backend_remoting_buffer_type_name, +const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = { + /* .get_name = */ ggml_backend_remoting_buffer_type_get_name, /* .alloc_buffer = */ ggml_backend_remoting_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_remoting_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_remoting_buffer_type_get_max_size, - /* .get_alloc_size = */ ggml_backend_remoting_buffer_type_get_alloc_size, - /* .is_host = */ NULL, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .is_host = */ ggml_backend_remoting_buffer_type_is_host, }; -ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { - - static struct ggml_backend_buffer_type buft { - /* .iface = */ ggml_backend_remoting_buffer_type_interface, - /* .device = */ dev, - /* .context = */ new ggml_backend_remoting_buffer_type_context{ "device_name"}, - }; - - return & buft; -} +/****************************************************************************************/ static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_remoting_buffer_context * ctx = (ggml_backend_remoting_buffer_context *)buffer->context; + NEXT; NOT_IMPLEMENTED; ggml_remoting_destroy_buffer(ctx->dev_buffer); @@ -183,7 +174,7 @@ static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uin ggml_remoting_buffer_memset(ctx->dev_buffer, 0, value, buffer->size); } -ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { +const 
ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { /* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer, /* .get_base = */ ggml_backend_remoting_buffer_get_base, /* .init_tensor = */ ggml_backend_remoting_buffer_init_tensor, diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index 986caef3f407a..8ba40c0b7f7ad 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -19,6 +19,13 @@ } \ } while(0) +#define BEING_IMPLEMENTED \ + do { \ + printf("\nINFO: ###\nINFO: ### function being implemented: %s\nINFO: ###\n\n", __func__); \ + } while(0) + +#define NEXT + #define IMPLEMENTED // printf("INFO: ### reached implemented function %s\n", __func__) @@ -32,6 +39,7 @@ struct ggml_backend_remoting_device_context { struct virtgpu *gpu; }; +extern const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface; extern const struct ggml_backend_device_i ggml_backend_remoting_device_i; ggml_backend_buffer_type_t ggml_backend_remoting_host_buffer_type(); diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp new file mode 100644 index 0000000000000..b8a42f7f621b9 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp @@ -0,0 +1,135 @@ +#include "ggml-backend-impl.h" +#include "virtgpu.h" +#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/apir_backend.h" +#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h" + +#define CACHED +// printf("INFO: ### found response in the cache %s\n", __func__) + + + +// buffer_type_alloc_buffer +const char * +apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { + int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME; + + struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); + if (!encoder) { + FATAL("%s: failed to prepare the remote call encoder :/", __func__); + } + + vn_encode_ggml_buft(encoder, buft); + + struct vn_cs_decoder *decoder = remote_call(gpu, encoder); + if (!decoder) { + FATAL("%s: failed to kick the remote call :/", __func__); + } + + const size_t string_size = vn_decode_array_size_unchecked(decoder); + char *string = (char *) vn_cs_decoder_alloc_array(decoder, sizeof(char), string_size); + if (!string) { + FATAL("%s: Could not allocate the device name buffer", __func__); + } + vn_decode_char_array(decoder, string, string_size); + + INFO("%s: Forward BUFT NAME --> %s", __func__, string); + + /* *** */ + + int32_t ret = remote_call_finish(encoder, decoder); + if (ret != 0) { + FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); + } + + return string; +} + +size_t +apir_buffer_type_get_alignment(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { + static int32_t dev_count = -1; + if (dev_count != -1) { + CACHED; + return dev_count; + } + int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT; + struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); + if (!encoder) { + FATAL("%s: failed to prepare the remote call encoder :/", __func__); + } + + vn_encode_ggml_buft(encoder, buft); + + struct vn_cs_decoder *decoder = remote_call(gpu, encoder); + if (!decoder) { + FATAL("%s: failed to kick the remote call :/", __func__); + 
} + + size_t alignment; + vn_decode_size_t(decoder, &alignment); + + INFO("%s: Forward BUFT ALIGNMENT --> %zu ", __func__, alignment); + + int32_t ret = remote_call_finish(encoder, decoder); + if (ret != 0) { + FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); + } + + return alignment; +} + +size_t +apir_buffer_type_get_max_size(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { + int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE; + struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); + if (!encoder) { + FATAL("%s: failed to prepare the remote call encoder :/", __func__); + } + + vn_encode_ggml_buft(encoder, buft); + + struct vn_cs_decoder *decoder = remote_call(gpu, encoder); + if (!decoder) { + FATAL("%s: failed to kick the remote call :/", __func__); + } + + size_t max_size; + vn_decode_size_t(decoder, &max_size); + + INFO("%s: Forward BUFT MAX SIZE --> %zu ", __func__, max_size); + + int32_t ret = remote_call_finish(encoder, decoder); + if (ret != 0) { + FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); + } + + return max_size; +} + +bool +apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { + int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST; + + struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); + if (!encoder) { + FATAL("%s: failed to prepare the remote call encoder :/", __func__); + } + + vn_encode_ggml_buft(encoder, buft); + + struct vn_cs_decoder *decoder = remote_call(gpu, encoder); + if (!decoder) { + FATAL("%s: failed to kick the remote call :/", __func__); + } + + bool is_host; + vn_decode_bool_t(decoder, &is_host); + + /* *** */ + + int32_t ret = remote_call_finish(encoder, decoder); + if (ret != 0) { + FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); + } + + return is_host; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp similarity index 88% rename from ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp rename to ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp index dbb42ee75a008..1dd303e8c96bf 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp @@ -1,4 +1,4 @@ -#include "ggml.h" +#include "ggml-backend-impl.h" #include "virtgpu.h" #include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/apir_backend.h" #include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h" @@ -36,7 +36,6 @@ apir_device_get_count(struct virtgpu *gpu) { return dev_count; } - const char * apir_device_get_name(struct virtgpu *gpu) { static int32_t dev_count = -1; @@ -210,3 +209,30 @@ apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op) { return supports_op; } + +apir_buffer_type_context_t +apir_device_get_buffer_type(struct virtgpu *gpu) { + int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE; + + struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); + if (!encoder) { + FATAL("%s: failed to prepare the remote call encoder :/", __func__); + } + + struct vn_cs_decoder *decoder = remote_call(gpu, encoder); + if (!decoder) { + FATAL("%s: failed to kick the remote call :/", __func__); + } + + apir_buffer_type_context_t buffer_type_ctx; + 
vn_decode_apir_buffer_type_context_t(decoder, &buffer_type_ctx); + + /* *** */ + + int32_t ret = remote_call_finish(encoder, decoder); + if (ret != 0) { + FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); + } + + return buffer_type_ctx; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index be1f783dd6c94..c484d7eeab8c1 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -1,8 +1,21 @@ -struct ggml_tensor; +#include "ggml.h" +#include "ggml-impl.h" +#include "ggml-alloc.h" +#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/apir_backend.h" + +/* device */ int apir_device_get_count(struct virtgpu *gpu); const char *apir_device_get_name(struct virtgpu *gpu); const char *apir_device_get_description(struct virtgpu *gpu); uint32_t apir_device_get_type(struct virtgpu *gpu); void apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total); bool apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op); +apir_buffer_type_context_t apir_device_get_buffer_type(struct virtgpu *gpu); + +/* buffer-type */ +// buffer_type_alloc_buffer +const char *apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); +size_t apir_buffer_type_get_alignment(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); +size_t apir_buffer_type_get_max_size(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); +bool apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); From 73ed5073b722e2fe49ad10532b9530ee8ba3cd03 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 14 May 2025 16:18:27 +0200 Subject: [PATCH 050/117] Keep working --- ggml/CMakeLists.txt | 3 +- ggml/include/ggml-remoting-backend.h | 16 -- ggml/src/ggml-backend-reg.cpp | 8 +- ggml/src/ggml-remotingbackend/CMakeLists.txt | 1 - .../backend-dispatched-buffer-type.cpp | 1 - .../backend-dispatched-device.cpp | 16 +- .../backend-dispatched.cpp | 45 ----- .../ggml-remotingbackend/backend-dispatched.h | 3 + ggml/src/ggml-remotingbackend/backend.cpp | 2 - .../shared/apir_backend.h | 11 +- ggml/src/ggml-remotingfrontend/CMakeLists.txt | 2 +- .../ggml-backend-device.cpp | 47 +++-- .../ggml-buffer-type.cpp | 10 +- .../ggml-host-buffer-type.cpp | 63 ------- .../src/ggml-remotingfrontend/ggml-remoting.h | 3 + .../virtgpu-forward-buffer-type.cpp | 91 +++------ .../virtgpu-forward-device.cpp | 178 +++++++----------- .../virtgpu-forward-impl.h | 33 ++++ .../ggml-remotingfrontend/virtgpu-forward.h | 5 + 19 files changed, 194 insertions(+), 344 deletions(-) delete mode 100644 ggml/include/ggml-remoting-backend.h delete mode 100644 ggml/src/ggml-remotingfrontend/ggml-host-buffer-type.cpp create mode 100644 ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 9d7576c911635..cfbd1aca0536f 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -271,8 +271,7 @@ set(GGML_PUBLIC_HEADERS include/ggml-rpc.h include/ggml-sycl.h include/ggml-vulkan.h - ggml/include/ggml-remoting-frontend.h - ggml/include/ggml-remoting-backend.h + include/ggml-remoting-frontend.h include/gguf.h) set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") diff --git a/ggml/include/ggml-remoting-backend.h b/ggml/include/ggml-remoting-backend.h deleted file mode 100644 index 25a9dc269c957..0000000000000 --- a/ggml/include/ggml-remoting-backend.h +++ /dev/null @@ 
-1,16 +0,0 @@ -#pragma once - -#include "ggml.h" -#include "ggml-backend.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#define GGML_REMOTING_BACKEND_NAME "RemotingBackend" - -GGML_BACKEND_API ggml_backend_reg_t ggml_backend_remoting_backend_reg(); - -#ifdef __cplusplus -} -#endif diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 7e6d4f8c36f67..4f003f0e743e4 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -49,10 +49,6 @@ #include "ggml-remoting-frontend.h" #endif -#ifdef GGML_USE_REMOTINGBACKEND -#include "ggml-remoting-backend.h" -#endif - #ifdef GGML_USE_OPENCL #include "ggml-opencl.h" #endif @@ -183,9 +179,7 @@ struct ggml_backend_registry { #ifdef GGML_USE_REMOTINGFRONTEND register_backend(ggml_backend_remoting_frontend_reg()); #endif -#ifdef GGML_USE_REMOTINGBACKEND - register_backend(ggml_backend_remoting_backend_reg()); -#endif + #ifdef GGML_USE_OPENCL register_backend(ggml_backend_opencl_reg()); #endif diff --git a/ggml/src/ggml-remotingbackend/CMakeLists.txt b/ggml/src/ggml-remotingbackend/CMakeLists.txt index fb2504870e6d2..17ca5e1f53a54 100644 --- a/ggml/src/ggml-remotingbackend/CMakeLists.txt +++ b/ggml/src/ggml-remotingbackend/CMakeLists.txt @@ -12,7 +12,6 @@ ggml_add_backend_library(ggml-remotingbackend shared/api_remoting.h shared/apir_backend.h shared/venus_cs.h - ../../include/ggml-remoting-backend.h ) target_compile_options(ggml-remotingbackend PRIVATE -std=c++20) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp index 979448bd218ab..1d17a69f27056 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp @@ -5,7 +5,6 @@ #include "ggml-impl.h" #include "ggml-backend-impl.h" #include "ggml-backend.h" -#include "ggml-remoting-backend.h" #include "ggml-metal.h" diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp index 627aa4685c773..7062b061defbb 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp @@ -5,7 +5,6 @@ #include "ggml-impl.h" #include "ggml-backend-impl.h" #include "ggml-backend.h" -#include "ggml-remoting-backend.h" #include "ggml-metal.h" @@ -88,3 +87,18 @@ backend_device_get_buffer_type(struct vn_cs_encoder *enc, struct vn_cs_decoder * return 0; } + +uint32_t +backend_device_get_props(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + UNUSED(dec); + + struct ggml_backend_dev_props props; + dev->iface.get_props(dev, &props); + + vn_encode_bool_t(enc, &props.caps.async); + vn_encode_bool_t(enc, &props.caps.host_buffer); + vn_encode_bool_t(enc, &props.caps.buffer_from_host_ptr); + vn_encode_bool_t(enc, &props.caps.events); + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp index bea07682256ac..73be488e6c0f7 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp @@ -5,7 +5,6 @@ #include "ggml-impl.h" #include "ggml-backend-impl.h" #include "ggml-backend.h" -#include "ggml-remoting-backend.h" #include "ggml-metal.h" @@ -38,47 +37,3 @@ uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_ba return APIR_BACKEND_INITIALIZE_SUCCESSS; } - -static size_t 
ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) { - UNUSED(reg); - - NOT_IMPLEMENTED; - - return 0; -} - -static const char *ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) { - UNUSED(reg); - - NOT_IMPLEMENTED; - - return GGML_REMOTING_BACKEND_NAME; -} - -static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) { - UNUSED(reg); - UNUSED(device); - - NOT_IMPLEMENTED; - - return NULL; -} - -static const struct ggml_backend_reg_i ggml_backend_remoting_reg_i = { - /* .get_name = */ ggml_backend_remoting_reg_get_name, - /* .get_device_count = */ ggml_backend_remoting_reg_get_device_count, - /* .get_device = */ ggml_backend_remoting_reg_get_device, - /* .get_proc_address = */ NULL, -}; - -ggml_backend_reg_t ggml_backend_remoting_backend_reg() { - static ggml_backend_reg reg = { - /* .api_version = */ GGML_BACKEND_API_VERSION, - /* .iface = */ ggml_backend_remoting_reg_i, - /* .context = */ nullptr, - }; - - INFO("%s, hello :wave:", __func__); - - return ® -} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h index 30e3dded013de..356742d3ba174 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.h +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -26,6 +26,7 @@ uint32_t backend_device_get_type(struct vn_cs_encoder *enc, struct vn_cs_decoder uint32_t backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); uint32_t backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); uint32_t backend_device_get_buffer_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +uint32_t backend_device_get_props(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); /* buffer-type */ uint32_t backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); @@ -44,6 +45,7 @@ static inline const char *backend_dispatch_command_name(ApirBackendCommandType t case APIR_COMMAND_TYPE_DEVICE_GET_MEMORY: return "backend_get_device_memory"; case APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP: return "backend_device_supports_op"; case APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE: return "backend_get_buffer_type"; + case APIR_COMMAND_TYPE_DEVICE_GET_PROPS: return "backend_get_props"; /* buffer-type */ case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME: return "backend_buffer_type_get_name"; @@ -64,6 +66,7 @@ static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATC [APIR_COMMAND_TYPE_DEVICE_GET_MEMORY] = backend_device_get_memory, [APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP] = backend_device_supports_op, [APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE] = backend_device_get_buffer_type, + [APIR_COMMAND_TYPE_DEVICE_GET_PROPS] = backend_device_get_props, /* buffer-type */ [APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME] = backend_buffer_type_get_name, diff --git a/ggml/src/ggml-remotingbackend/backend.cpp b/ggml/src/ggml-remotingbackend/backend.cpp index 9a97b97a71f7c..c32353586a10b 100644 --- a/ggml/src/ggml-remotingbackend/backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend.cpp @@ -16,8 +16,6 @@ static void *backend_library_handle = NULL; - - extern "C" { void apir_backend_deinit(void) { if (backend_library_handle) { diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 0917da7d0e4af..abc20a981ca6b 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -34,13 +34,14 @@ 
typedef enum ApirBackendCommandType { APIR_COMMAND_TYPE_DEVICE_GET_MEMORY = 4, APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP = 5, APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE = 6, + APIR_COMMAND_TYPE_DEVICE_GET_PROPS = 7, /* buffer-type */ - APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME = 7, - APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT = 8, - APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE = 9, - APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST = 10, + APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME = 8, + APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT = 9, + APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE = 10, + APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST = 11, // last command_type index + 1 - APIR_BACKEND_DISPATCH_TABLE_COUNT = 11, + APIR_BACKEND_DISPATCH_TABLE_COUNT = 12, } ApirBackendCommandType; diff --git a/ggml/src/ggml-remotingfrontend/CMakeLists.txt b/ggml/src/ggml-remotingfrontend/CMakeLists.txt index accdbc473ecc7..5410b80c86f43 100644 --- a/ggml/src/ggml-remotingfrontend/CMakeLists.txt +++ b/ggml/src/ggml-remotingfrontend/CMakeLists.txt @@ -9,12 +9,12 @@ ggml_add_backend_library(ggml-remotingfrontend ggml-backend-device.cpp ggml-backend-reg.cpp ggml-buffer-type.cpp - ggml-host-buffer-type.cpp virtgpu.cpp virtgpu-shm.cpp virtgpu-utils.cpp virtgpu-forward-device.cpp virtgpu-forward-buffer-type.cpp + virtgpu-forward-impl.h ../../include/ggml-remoting-frontend.h ) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index c0c98c8b8a511..0d955014e0fcf 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -70,34 +70,33 @@ ggml_backend_remoting_device_offload_op(ggml_backend_dev_t dev, const ggml_tenso UNUSED(dev); } -static ggml_backend_buffer_type_t -ggml_backend_remoting_device_get_host_buffer_type(ggml_backend_dev_t dev) { - UNUSED(dev); - - IMPLEMENTED; - - return ggml_backend_remoting_host_buffer_type(); -} - - static void ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { IMPLEMENTED; + struct virtgpu *gpu = DEV_TO_GPU(dev); + props->name = ggml_backend_remoting_device_get_name(dev); props->description = ggml_backend_remoting_device_get_description(dev); props->type = ggml_backend_remoting_device_get_type(dev); ggml_backend_remoting_device_get_memory(dev, &props->memory_free, &props->memory_total); - props->caps = { - /* .async = */ false, - /* .host_buffer = */ true, - /* .buffer_from_host_ptr = */ false, - /* .events = */ false, - }; + + apir_device_get_props(gpu, + &props->caps.async, + &props->caps.host_buffer, + &props->caps.buffer_from_host_ptr, + &props->caps.events + ); + + INFO("%s: async=%d, host_buffer=%d, buffer_from_host_ptr=%d, events=%d", + __func__, props->caps.async, props->caps.host_buffer, + props->caps.buffer_from_host_ptr, props->caps.events); } ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { + IMPLEMENTED; + struct virtgpu *gpu = DEV_TO_GPU(dev); apir_buffer_type_context_t ctx = apir_device_get_buffer_type(gpu); @@ -111,6 +110,18 @@ ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { return &buft; } +static ggml_backend_buffer_t ggml_backend_remoting_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { + UNUSED(dev); + UNUSED(ptr); + UNUSED(size); + UNUSED(max_tensor_size); + + NOT_IMPLEMENTED; + STOP_HERE; + + return nullptr; +} + const struct ggml_backend_device_i ggml_backend_remoting_device_i = { 
/* .get_name = */ ggml_backend_remoting_device_get_name, /* .get_description = */ ggml_backend_remoting_device_get_description, @@ -119,8 +130,8 @@ const struct ggml_backend_device_i ggml_backend_remoting_device_i = { /* .get_props = */ ggml_backend_remoting_device_get_props, /* .init_backend = */ ggml_backend_remoting_device_init, /* .get_buffer_type = */ ggml_backend_remoting_device_get_buffer_type, - /* .get_host_buffer_type = */ ggml_backend_remoting_device_get_host_buffer_type, - /* .buffer_from_host_ptr = */ NULL, + /* .get_host_buffer_type = */ NULL, + /* .buffer_from_host_ptr = */ ggml_backend_remoting_device_buffer_from_ptr, /* .supports_op = */ ggml_backend_remoting_device_supports_op, /* .supports_buft = */ ggml_backend_remoting_device_supports_buft, /* .offload_op = */ ggml_backend_remoting_device_offload_op, diff --git a/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp index d34904abb1ef0..4882904759566 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp @@ -10,7 +10,6 @@ ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, BEING_IMPLEMENTED; struct virtgpu *gpu = BUFT_TO_GPU(buft); UNUSED(gpu); - /* ... */ void *ctx = NULL; @@ -19,7 +18,7 @@ ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, static const char * ggml_backend_remoting_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - BEING_IMPLEMENTED; + IMPLEMENTED; struct virtgpu *gpu = BUFT_TO_GPU(buft); @@ -72,11 +71,12 @@ static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffe } static enum ggml_status ggml_backend_remoting_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + UNUSED(buffer); + UNUSED(tensor); + NEXT; NOT_IMPLEMENTED; - if (tensor->view_src != nullptr) { - GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); - } + return GGML_STATUS_SUCCESS; } diff --git a/ggml/src/ggml-remotingfrontend/ggml-host-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-host-buffer-type.cpp deleted file mode 100644 index bcbd3fa57f156..0000000000000 --- a/ggml/src/ggml-remotingfrontend/ggml-host-buffer-type.cpp +++ /dev/null @@ -1,63 +0,0 @@ -#include "ggml-remoting.h" - -// host buffer type - -static const char * ggml_backend_remoting_host_buffer_type_name(ggml_backend_buffer_type_t buft) { - UNUSED(buft); - - NOT_IMPLEMENTED; - - return GGML_REMOTING_FRONTEND_NAME "_Host"; -} - -static void ggml_backend_remoting_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { - UNUSED(buffer); - NOT_IMPLEMENTED; - -# if 0 - ggml_remoting_host_free(remoting_instance.devices[0], buffer->context); -#endif -} - -static ggml_backend_buffer_t ggml_backend_remoting_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - UNUSED(buft); - - NOT_IMPLEMENTED; - - void *ptr = nullptr; - ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); - buffer->buft = buft; - buffer->iface.free_buffer = ggml_backend_remoting_host_buffer_free_buffer; - - return buffer; -} - -static size_t ggml_backend_remoting_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - UNUSED(buft); - - NOT_IMPLEMENTED; - return 4096; -} - -// Should be changed to return device-specific host buffer type -// but that probably requires changes in llama.cpp -ggml_backend_buffer_type_t ggml_backend_remoting_host_buffer_type() { - static struct ggml_backend_buffer_type 
ggml_backend_remoting_buffer_type_host = { - /* .iface = */ { - /* .get_name = */ ggml_backend_remoting_host_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_remoting_host_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_remoting_host_buffer_type_get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX - /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, - /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, - }, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_remoting_frontend_reg(), 0), - /* .context = */ nullptr, - }; - - // Make sure device 0 is initialized - //ggml_remoting_instance_init(); - //ggml_remoting_get_device(0); - - return &ggml_backend_remoting_buffer_type_host; -} diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index 8ba40c0b7f7ad..2230622abf35b 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -26,6 +26,9 @@ #define NEXT +#define STOP_HERE \ + thks_bye() + #define IMPLEMENTED // printf("INFO: ### reached implemented function %s\n", __func__) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp index b8a42f7f621b9..4c2a7b6c4de75 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp @@ -1,29 +1,16 @@ -#include "ggml-backend-impl.h" -#include "virtgpu.h" -#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/apir_backend.h" -#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h" - -#define CACHED -// printf("INFO: ### found response in the cache %s\n", __func__) - - +#include "virtgpu-forward-impl.h" // buffer_type_alloc_buffer const char * apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { - int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME; + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; - struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); - if (!encoder) { - FATAL("%s: failed to prepare the remote call encoder :/", __func__); - } + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME); vn_encode_ggml_buft(encoder, buft); - struct vn_cs_decoder *decoder = remote_call(gpu, encoder); - if (!decoder) { - FATAL("%s: failed to kick the remote call :/", __func__); - } + REMOTE_CALL(gpu, encoder, decoder); const size_t string_size = vn_decode_array_size_unchecked(decoder); char *string = (char *) vn_cs_decoder_alloc_array(decoder, sizeof(char), string_size); @@ -36,100 +23,68 @@ apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) /* *** */ - int32_t ret = remote_call_finish(encoder, decoder); - if (ret != 0) { - FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); - } + REMOTE_CALL_FINISH(gpu, encoder, decoder); return string; } size_t apir_buffer_type_get_alignment(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { - static int32_t dev_count = -1; - if (dev_count != -1) { - CACHED; - return dev_count; - } - int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT; - struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); - if (!encoder) { - FATAL("%s: 
failed to prepare the remote call encoder :/", __func__); - } + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT); vn_encode_ggml_buft(encoder, buft); - struct vn_cs_decoder *decoder = remote_call(gpu, encoder); - if (!decoder) { - FATAL("%s: failed to kick the remote call :/", __func__); - } + REMOTE_CALL(gpu, encoder, decoder); size_t alignment; vn_decode_size_t(decoder, &alignment); INFO("%s: Forward BUFT ALIGNMENT --> %zu ", __func__, alignment); - int32_t ret = remote_call_finish(encoder, decoder); - if (ret != 0) { - FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); - } + REMOTE_CALL_FINISH(gpu, encoder, decoder); return alignment; } size_t apir_buffer_type_get_max_size(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { - int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE; - struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); - if (!encoder) { - FATAL("%s: failed to prepare the remote call encoder :/", __func__); - } + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE); vn_encode_ggml_buft(encoder, buft); - struct vn_cs_decoder *decoder = remote_call(gpu, encoder); - if (!decoder) { - FATAL("%s: failed to kick the remote call :/", __func__); - } + REMOTE_CALL(gpu, encoder, decoder); size_t max_size; vn_decode_size_t(decoder, &max_size); INFO("%s: Forward BUFT MAX SIZE --> %zu ", __func__, max_size); - int32_t ret = remote_call_finish(encoder, decoder); - if (ret != 0) { - FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); - } + REMOTE_CALL_FINISH(gpu, encoder, decoder); return max_size; } bool apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { - int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST; + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; - struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); - if (!encoder) { - FATAL("%s: failed to prepare the remote call encoder :/", __func__); - } + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST); vn_encode_ggml_buft(encoder, buft); - struct vn_cs_decoder *decoder = remote_call(gpu, encoder); - if (!decoder) { - FATAL("%s: failed to kick the remote call :/", __func__); - } + REMOTE_CALL(gpu, encoder, decoder); bool is_host; vn_decode_bool_t(decoder, &is_host); - /* *** */ - - int32_t ret = remote_call_finish(encoder, decoder); - if (ret != 0) { - FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); - } + REMOTE_CALL_FINISH(gpu, encoder, decoder); return is_host; } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp index 1dd303e8c96bf..d25081f0d1634 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp @@ -1,10 +1,4 @@ -#include "ggml-backend-impl.h" -#include "virtgpu.h" -#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/apir_backend.h" -#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h" - -#define CACHED -// printf("INFO: ### found response in the cache %s\n", __func__) +#include "virtgpu-forward-impl.h" int 
apir_device_get_count(struct virtgpu *gpu) { @@ -13,50 +7,37 @@ apir_device_get_count(struct virtgpu *gpu) { CACHED; return dev_count; } - int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_DEVICE_GET_COUNT; - struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); - if (!encoder) { - FATAL("%s: failed to prepare the remote call encoder :/", __func__); - } - struct vn_cs_decoder *decoder = remote_call(gpu, encoder); - if (!decoder) { - FATAL("%s: failed to kick the remote call :/", __func__); - } + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_COUNT); + REMOTE_CALL(gpu, encoder, decoder); vn_decode_int32_t(decoder, &dev_count); INFO("%s: Forward DEV COUNT --> %d ", __func__, dev_count); - int32_t ret = remote_call_finish(encoder, decoder); - if (ret != 0) { - FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); - } + REMOTE_CALL_FINISH(gpu, encoder, decoder); return dev_count; } const char * apir_device_get_name(struct virtgpu *gpu) { - static int32_t dev_count = -1; - if (dev_count != -1) { + static char *string = nullptr; + if (string) { CACHED; - return "Nothing"; - } - - int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_DEVICE_GET_NAME; - struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); - if (!encoder) { - FATAL("%s: failed to prepare the remote call encoder :/", __func__); + return string; } + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; - struct vn_cs_decoder *decoder = remote_call(gpu, encoder); - if (!decoder) { - FATAL("%s: failed to kick the remote call :/", __func__); - } + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_NAME); + REMOTE_CALL(gpu, encoder, decoder); const size_t string_size = vn_decode_array_size_unchecked(decoder); - char *string = (char *) vn_cs_decoder_alloc_array(decoder, sizeof(char), string_size); + string = (char *) vn_cs_decoder_alloc_array(decoder, sizeof(char), string_size); if (!string) { FATAL("%s: Could not allocate the device name buffer", __func__); } @@ -64,31 +45,19 @@ apir_device_get_name(struct virtgpu *gpu) { INFO("%s: Forward DEV NAME --> %s", __func__, string); - int32_t ret = remote_call_finish(encoder, decoder); - if (ret != 0) { - FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); - } + REMOTE_CALL_FINISH(gpu, encoder, decoder); return string; } const char * apir_device_get_description(struct virtgpu *gpu) { - static int32_t dev_count = -1; - if (dev_count != -1) { - CACHED; - return "Nothing"; - } - int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION; - struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); - if (!encoder) { - FATAL("%s: failed to prepare the remote call encoder :/", __func__); - } + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; - struct vn_cs_decoder *decoder = remote_call(gpu, encoder); - if (!decoder) { - FATAL("%s: failed to kick the remote call :/", __func__); - } + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION); + + REMOTE_CALL(gpu, encoder, decoder); const size_t string_size = vn_decode_array_size_unchecked(decoder); char *string = (char *) vn_cs_decoder_alloc_array(decoder, sizeof(char), string_size); @@ -99,10 +68,7 @@ apir_device_get_description(struct virtgpu *gpu) { INFO("%s: Forward DEV DESCR --> %s", __func__, string); - int32_t 
ret = remote_call_finish(encoder, decoder); - if (ret != 0) { - FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); - } + REMOTE_CALL_FINISH(gpu, encoder, decoder); return string; } @@ -114,26 +80,19 @@ apir_device_get_type(struct virtgpu *gpu) { CACHED; return dev_type; } - int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_DEVICE_GET_TYPE; - struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); - if (!encoder) { - FATAL("%s: failed to prepare the remote call encoder :/", __func__); - } + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; - struct vn_cs_decoder *decoder = remote_call(gpu, encoder); - if (!decoder) { - FATAL("%s: failed to kick the remote call :/", __func__); - } + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_TYPE); + + REMOTE_CALL(gpu, encoder, decoder); vn_decode_uint32_t(decoder, &dev_type); INFO("%s: Forward DEV TYPE --> %d ", __func__, dev_type); - int32_t ret = remote_call_finish(encoder, decoder); - if (ret != 0) { - FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); - } + REMOTE_CALL_FINISH(gpu, encoder, decoder); return dev_type; } @@ -152,17 +111,12 @@ apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total) { return; } */ - int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_DEVICE_GET_MEMORY; + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; - struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); - if (!encoder) { - FATAL("%s: failed to prepare the remote call encoder :/", __func__); - } + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_MEMORY); - struct vn_cs_decoder *decoder = remote_call(gpu, encoder); - if (!decoder) { - FATAL("%s: failed to kick the remote call :/", __func__); - } + REMOTE_CALL(gpu, encoder, decoder); vn_decode_size_t(decoder, &dev_free); vn_decode_size_t(decoder, &dev_total); @@ -173,66 +127,72 @@ apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total) { INFO("%s: Forward DEV FREE mem --> %zu MB", __func__, dev_free / 1024 / 1024); INFO("%s: Forward DEV TOTAL mem --> %zu MB", __func__, dev_total / 1024 / 1024); - int32_t ret = remote_call_finish(encoder, decoder); - if (ret != 0) { - FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); - } + + REMOTE_CALL_FINISH(gpu, encoder, decoder); return; } bool apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op) { - int32_t forward_flag = (int32_t) APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP; + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; - struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); - if (!encoder) { - FATAL("%s: failed to prepare the remote call encoder :/", __func__); - } + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP); vn_encode_ggml_tensor(encoder, op); - struct vn_cs_decoder *decoder = remote_call(gpu, encoder); - if (!decoder) { - FATAL("%s: failed to kick the remote call :/", __func__); - } + + REMOTE_CALL(gpu, encoder, decoder); bool supports_op; vn_decode_bool_t(decoder, &supports_op); /* *** */ - int32_t ret = remote_call_finish(encoder, decoder); - if (ret != 0) { - FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); - } + REMOTE_CALL_FINISH(gpu, encoder, decoder); return supports_op; } apir_buffer_type_context_t apir_device_get_buffer_type(struct virtgpu *gpu) { - int32_t forward_flag = (int32_t) 
APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE; + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; - struct vn_cs_encoder *encoder = remote_call_prepare(gpu, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); - if (!encoder) { - FATAL("%s: failed to prepare the remote call encoder :/", __func__); - } + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE); - struct vn_cs_decoder *decoder = remote_call(gpu, encoder); - if (!decoder) { - FATAL("%s: failed to kick the remote call :/", __func__); - } + REMOTE_CALL(gpu, encoder, decoder); apir_buffer_type_context_t buffer_type_ctx; vn_decode_apir_buffer_type_context_t(decoder, &buffer_type_ctx); /* *** */ - - int32_t ret = remote_call_finish(encoder, decoder); - if (ret != 0) { - FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); - } + REMOTE_CALL_FINISH(gpu, encoder, decoder); return buffer_type_ctx; } + +void +apir_device_get_props(struct virtgpu *gpu, + bool *async, + bool *host_buffer, + bool *buffer_from_host_ptr, + bool *events) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_PROPS); + + REMOTE_CALL(gpu, encoder, decoder); + + vn_decode_bool_t(decoder, async); + vn_decode_bool_t(decoder, host_buffer); + vn_decode_bool_t(decoder, buffer_from_host_ptr); + vn_decode_bool_t(decoder, events); + + /* *** */ + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h new file mode 100644 index 0000000000000..4f9af992d70c9 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h @@ -0,0 +1,33 @@ +#include "ggml-backend-impl.h" +#include "virtgpu.h" +#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/apir_backend.h" +#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h" + +#define CACHED +// printf("INFO: ### found response in the cache %s\n", __func__)o + + +#define REMOTE_CALL_PREPARE(gpu_dev_name, encoder_name, apir_command_type__) \ + do { \ + int32_t forward_flag = (int32_t) apir_command_type__; \ + encoder_name = remote_call_prepare(gpu_dev_name, VIRGL_APIR_COMMAND_TYPE_Forward, forward_flag); \ + if (!encoder) { \ + FATAL("%s: failed to prepare the remote call encoder :/", __func__); \ + } \ + } while(0) + +#define REMOTE_CALL(gpu_dev_name, encoder_name, decoder_name) \ + do { \ + decoder_name = remote_call(gpu_dev_name, encoder_name); \ + if (!decoder) { \ + FATAL("%s: failed to kick the remote call :/", __func__); \ + } \ + } while(0) + +#define REMOTE_CALL_FINISH(gpu_dev_name, encoder_name, decoder_name) \ + do { \ + int32_t ret = remote_call_finish(encoder_name, decoder_name); \ + if (ret != 0) { \ + FATAL("%s: failed to forward the API call (code=%d):/", __func__, ret); \ + } \ + } while(0) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index c484d7eeab8c1..5a9b3c15c82ba 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -12,6 +12,11 @@ uint32_t apir_device_get_type(struct virtgpu *gpu); void apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total); bool apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op); apir_buffer_type_context_t apir_device_get_buffer_type(struct virtgpu *gpu); +void 
apir_device_get_props(struct virtgpu *gpu, + bool *async, + bool *host_buffer, + bool *buffer_from_host_ptr, + bool *events); /* buffer-type */ // buffer_type_alloc_buffer From 88e8ec3d9562c717ae2d4546f227ca0b6a73d88f Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 15 May 2025 10:10:11 +0200 Subject: [PATCH 051/117] Keep working on buffer types and buffers --- .../shared/apir_backend.h | 7 ++- .../ggml-remotingbackend/shared/venus_cs.h | 10 ++-- ggml/src/ggml-remotingfrontend/CMakeLists.txt | 3 +- ...-type.cpp => ggml-backend-buffer-type.cpp} | 18 ++++-- .../ggml-backend-device.cpp | 32 +++++++++-- .../ggml-backend-host-buffer-type.cpp | 56 +++++++++++++++++++ .../ggml-backend-reg.cpp | 2 +- .../src/ggml-remotingfrontend/ggml-remoting.h | 3 +- .../virtgpu-forward-buffer-type.cpp | 15 +++++ .../virtgpu-forward-device.cpp | 9 ++- .../ggml-remotingfrontend/virtgpu-forward.h | 5 +- 11 files changed, 131 insertions(+), 29 deletions(-) rename ggml/src/ggml-remotingfrontend/{ggml-buffer-type.cpp => ggml-backend-buffer-type.cpp} (92%) create mode 100644 ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index abc20a981ca6b..644fae7938379 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -14,7 +14,8 @@ #define APIR_BACKEND_FORWARD_INDEX_INVALID 6 -typedef void * apir_buffer_type_context_t; +typedef uintptr_t apir_buffer_type_handle_t; +typedef uintptr_t apir_buffer_handle_t; typedef uint32_t (*apir_backend_initialize_t)(void); typedef void (*apir_backend_deinit_t)(void); @@ -41,7 +42,9 @@ typedef enum ApirBackendCommandType { APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT = 9, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE = 10, APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST = 11, + APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER = 12, + APIR_COMMAND_TYPE_BUFFER_GET_BASE = 13, // last command_type index + 1 - APIR_BACKEND_DISPATCH_TABLE_COUNT = 12, + APIR_BACKEND_DISPATCH_TABLE_COUNT = 14, } ApirBackendCommandType; diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h index c796cd3f8e893..bc9048f44e315 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -453,16 +453,16 @@ vn_decode_bool_t(struct vn_cs_decoder *dec, bool *val) vn_decode(dec, sizeof(int), val, sizeof(int)); } -/* apir_buffer_type_context_t */ +/* apir_buffer_type_handle_t */ static inline void -vn_encode_apir_buffer_type_context_t(struct vn_cs_encoder *enc, const apir_buffer_type_context_t *val) +vn_encode_apir_buffer_type_handle_t(struct vn_cs_encoder *enc, const apir_buffer_type_handle_t *val) { - vn_encode(enc, sizeof(apir_buffer_type_context_t), val, sizeof(apir_buffer_type_context_t)); + vn_encode(enc, sizeof(apir_buffer_type_handle_t), val, sizeof(apir_buffer_type_handle_t)); } static inline void -vn_decode_apir_buffer_type_context_t(struct vn_cs_decoder *dec, apir_buffer_type_context_t *val) +vn_decode_apir_buffer_type_handle_t(struct vn_cs_decoder *dec, apir_buffer_type_handle_t *val) { - vn_decode(dec, sizeof(apir_buffer_type_context_t), val, sizeof(apir_buffer_type_context_t)); + vn_decode(dec, sizeof(apir_buffer_type_handle_t), val, sizeof(apir_buffer_type_handle_t)); } diff --git a/ggml/src/ggml-remotingfrontend/CMakeLists.txt b/ggml/src/ggml-remotingfrontend/CMakeLists.txt index 5410b80c86f43..a2b3277584b38 
100644 --- a/ggml/src/ggml-remotingfrontend/CMakeLists.txt +++ b/ggml/src/ggml-remotingfrontend/CMakeLists.txt @@ -8,7 +8,8 @@ ggml_add_backend_library(ggml-remotingfrontend ggml-backend.cpp ggml-backend-device.cpp ggml-backend-reg.cpp - ggml-buffer-type.cpp + ggml-backend-buffer-type.cpp + ggml-backend-host-buffer-type.cpp virtgpu.cpp virtgpu-shm.cpp virtgpu-utils.cpp diff --git a/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp similarity index 92% rename from ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp rename to ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp index 4882904759566..22f962ec27579 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp @@ -3,6 +3,9 @@ #define BUFT_TO_GPU(name) \ ((struct ggml_backend_remoting_device_context *) (name)->device->context)->gpu +#define BUFFER_TO_GPU(name) \ + ((struct ggml_backend_remoting_device_context *) (name)->dev->context)->gpu + extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface; static ggml_backend_buffer_t @@ -11,9 +14,9 @@ ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, struct virtgpu *gpu = BUFT_TO_GPU(buft); UNUSED(gpu); - void *ctx = NULL; + apir_buffer_handle_t handle = apir_buffer_type_alloc_buffer(gpu, size); - return ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, ctx, size); + return ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) handle, size); } static const char * @@ -76,17 +79,20 @@ static enum ggml_status ggml_backend_remoting_buffer_init_tensor(ggml_backend_bu NEXT; NOT_IMPLEMENTED; - + STOP_HERE; return GGML_STATUS_SUCCESS; } static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) { UNUSED(buffer); + BEING_IMPLEMENTED; + + STOP_HERE; + return NULL; + //struct virtgpu *gpu = BUFFER_TO_GPU(buffer); - NEXT; - NOT_IMPLEMENTED; - return (void *) 4096; + //return apir_buffer_get_base(gpu, (ggml_backend_buffer_t)buffer->context); } static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index 0d955014e0fcf..9a72139b4d2ed 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -88,7 +88,12 @@ ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, struct ggml_backe &props->caps.events ); - INFO("%s: async=%d, host_buffer=%d, buffer_from_host_ptr=%d, events=%d", + // ignore the actual backend answers and set it as we provide it in + // the API Remoting frontend + props->caps.host_buffer = true; + props->caps.buffer_from_host_ptr = false; + + INFO("%s: async=%d, host_buffer=%d!, buffer_from_host_ptr=%d!, events=%d", __func__, props->caps.async, props->caps.host_buffer, props->caps.buffer_from_host_ptr, props->caps.events); } @@ -99,12 +104,12 @@ ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { struct virtgpu *gpu = DEV_TO_GPU(dev); - apir_buffer_type_context_t ctx = apir_device_get_buffer_type(gpu); + apir_buffer_type_handle_t ctx = apir_device_get_buffer_type(gpu); static struct ggml_backend_buffer_type buft { /* .iface = */ ggml_backend_remoting_buffer_type_interface, /* .device = */ dev, - /* .context = */ ctx, + /* 
.context = */ (void *) ctx, }; return &buft; @@ -122,7 +127,22 @@ static ggml_backend_buffer_t ggml_backend_remoting_device_buffer_from_ptr(ggml_b return nullptr; } -const struct ggml_backend_device_i ggml_backend_remoting_device_i = { +static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_host_buffer_type(ggml_backend_dev_t dev) { + + static struct ggml_backend_buffer_type host_bufft = { + /* .iface = */ ggml_backend_remoting_host_buffer_type_interface, + /* .device = */ dev, + /* .context = */ nullptr, + }; + + // Make sure device 0 is initialized + //ggml_remoting_instance_init(); + //ggml_remoting_get_device(0); + + return &host_bufft; +} + +const struct ggml_backend_device_i ggml_backend_remoting_device_interface = { /* .get_name = */ ggml_backend_remoting_device_get_name, /* .get_description = */ ggml_backend_remoting_device_get_description, /* .get_memory = */ ggml_backend_remoting_device_get_memory, @@ -130,8 +150,8 @@ const struct ggml_backend_device_i ggml_backend_remoting_device_i = { /* .get_props = */ ggml_backend_remoting_device_get_props, /* .init_backend = */ ggml_backend_remoting_device_init, /* .get_buffer_type = */ ggml_backend_remoting_device_get_buffer_type, - /* .get_host_buffer_type = */ NULL, - /* .buffer_from_host_ptr = */ ggml_backend_remoting_device_buffer_from_ptr, + /* .get_host_buffer_type = */ ggml_backend_remoting_device_get_host_buffer_type, + /* .buffer_from_host_ptr = */ NULL, /* .supports_op = */ ggml_backend_remoting_device_supports_op, /* .supports_buft = */ ggml_backend_remoting_device_supports_buft, /* .offload_op = */ ggml_backend_remoting_device_offload_op, diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp new file mode 100644 index 0000000000000..3aef4b86e2b6a --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp @@ -0,0 +1,56 @@ +#include "ggml-remoting.h" + +#define BUFT_TO_GPU(name) \ + ((struct ggml_backend_remoting_device_context *) (name)->device->context)->gpu + +extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface; + +static ggml_backend_buffer_t +ggml_backend_remoting_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + BEING_IMPLEMENTED; + struct virtgpu *gpu = BUFT_TO_GPU(buft); + UNUSED(gpu); + + void *ctx = NULL; + + NOT_IMPLEMENTED; + + STOP_HERE; + return ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, ctx, size); +} + +static const char * +ggml_backend_remoting_host_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + UNUSED(buft); + + IMPLEMENTED; + + return "GUEST host buffer"; +} + +static size_t +ggml_backend_remoting_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + UNUSED(buft); + + NOT_IMPLEMENTED; + + return 4096; +} + +static bool +ggml_backend_remoting_host_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + UNUSED(buft); + + NOT_IMPLEMENTED; + + return true; +} + +const ggml_backend_buffer_type_i ggml_backend_remoting_host_buffer_type_interface = { + /* .get_name = */ ggml_backend_remoting_host_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_remoting_host_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_remoting_host_buffer_type_get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, + /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + }; diff --git 
a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp index 06bcb0310cbc6..eeac6c59db670 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp @@ -77,7 +77,7 @@ static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_ ctx->gpu = gpu; devices.push_back(new ggml_backend_device { - /* .iface = */ ggml_backend_remoting_device_i, + /* .iface = */ ggml_backend_remoting_device_interface, /* .reg = */ reg, /* .context = */ ctx, }); diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index 2230622abf35b..ecdfcc1f31384 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -43,7 +43,8 @@ struct ggml_backend_remoting_device_context { }; extern const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface; -extern const struct ggml_backend_device_i ggml_backend_remoting_device_i; +extern const struct ggml_backend_device_i ggml_backend_remoting_device_interface; +extern const ggml_backend_buffer_type_i ggml_backend_remoting_host_buffer_type_interface; ggml_backend_buffer_type_t ggml_backend_remoting_host_buffer_type(); ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params); diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp index 4c2a7b6c4de75..39c205edacef0 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp @@ -88,3 +88,18 @@ apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { return is_host; } + +apir_buffer_handle_t +apir_buffer_type_alloc_buffer(struct virtgpu *gpu, size_t size) { + UNUSED(gpu); + UNUSED(size); + + return 0; +} + +void * +apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle) { + UNUSED(gpu); + UNUSED(buffer_handle); + return NULL; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp index d25081f0d1634..7c241d71a1679 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp @@ -155,7 +155,7 @@ apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op) { return supports_op; } -apir_buffer_type_context_t +apir_buffer_type_handle_t apir_device_get_buffer_type(struct virtgpu *gpu) { struct vn_cs_encoder *encoder; struct vn_cs_decoder *decoder; @@ -164,13 +164,12 @@ apir_device_get_buffer_type(struct virtgpu *gpu) { REMOTE_CALL(gpu, encoder, decoder); - apir_buffer_type_context_t buffer_type_ctx; - vn_decode_apir_buffer_type_context_t(decoder, &buffer_type_ctx); + apir_buffer_type_handle_t buft_handle; + vn_decode_apir_buffer_type_handle_t(decoder, &buft_handle); - /* *** */ REMOTE_CALL_FINISH(gpu, encoder, decoder); - return buffer_type_ctx; + return buft_handle; } void diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index 5a9b3c15c82ba..521029c3bee9e 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -11,7 +11,7 @@ const char *apir_device_get_description(struct virtgpu *gpu); uint32_t apir_device_get_type(struct virtgpu *gpu); void 
apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total); bool apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op); -apir_buffer_type_context_t apir_device_get_buffer_type(struct virtgpu *gpu); +apir_buffer_type_handle_t apir_device_get_buffer_type(struct virtgpu *gpu); void apir_device_get_props(struct virtgpu *gpu, bool *async, bool *host_buffer, @@ -19,8 +19,9 @@ void apir_device_get_props(struct virtgpu *gpu, bool *events); /* buffer-type */ -// buffer_type_alloc_buffer const char *apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); size_t apir_buffer_type_get_alignment(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); size_t apir_buffer_type_get_max_size(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); bool apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); +apir_buffer_handle_t apir_buffer_type_alloc_buffer(struct virtgpu *gpu, size_t size); +void *apir_buffer_get_base(struct virtgpu *gpu, ggml_backend_buffer_t buffer); From 43af3a093de35e33daa2ea51fdfdd80e64f2b604 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 15 May 2025 14:11:59 +0200 Subject: [PATCH 052/117] implemnt alloc_buffer and get_base --- .../backend-dispatched-buffer-type.cpp | 28 +++++++++ .../backend-dispatched-device.cpp | 4 +- .../ggml-remotingbackend/backend-dispatched.h | 11 ++++ .../ggml-remotingbackend/shared/venus_cs.h | 28 +++++++++ .../shared/venus_cs_ggml.h | 41 ++++++++++--- .../ggml-backend-buffer-type.cpp | 32 +++++----- .../src/ggml-remotingfrontend/ggml-remoting.h | 23 ++----- .../virtgpu-forward-buffer-type.cpp | 60 +++++++++++++++---- .../ggml-remotingfrontend/virtgpu-forward.h | 4 +- 9 files changed, 175 insertions(+), 56 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp index 1d17a69f27056..cceec68064742 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp @@ -54,3 +54,31 @@ backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec return 0; } + +uint32_t +backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + ggml_backend_buffer_type_t buft; + buft = vn_decode_ggml_buft(dec); + + size_t size; + vn_decode_size_t(dec, &size); + + ggml_backend_buffer_t buffer = buft->iface.alloc_buffer(buft, size); + apir_buffer_handle_t *buffer_handle = (apir_buffer_handle_t *) buffer; + vn_encode_ggml_buffer_handle(enc, buffer_handle); + + return 0; +} + +uint32_t +backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + ggml_backend_buffer_t buffer; + buffer = vn_decode_ggml_buffer(dec); + + uintptr_t base = (uintptr_t) buffer->iface.get_base(buffer); + vn_encode_uintptr_t(enc, &base); + + INFO("%s: send base %p\n", __func__, (void *) base); + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp index 7062b061defbb..2db2e75816258 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp @@ -82,8 +82,8 @@ backend_device_get_buffer_type(struct vn_cs_encoder *enc, struct vn_cs_decoder * ggml_backend_buffer_type_t bufft = dev->iface.get_buffer_type(dev); - apir_buffer_type_context_t bufft_ctx = (apir_buffer_type_context_t) bufft; - 
vn_encode_apir_buffer_type_context_t(enc, &bufft_ctx); + apir_buffer_type_handle_t buft_handle = (apir_buffer_type_handle_t) bufft; + vn_encode_apir_buffer_type_handle_t(enc, &buft_handle); return 0; } diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h index 356742d3ba174..26e2762bf72b5 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.h +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -33,6 +33,10 @@ uint32_t backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_de uint32_t backend_buffer_type_get_alignment(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); uint32_t backend_buffer_type_get_max_size(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); uint32_t backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +uint32_t backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); + +/* buffer */ +uint32_t backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); static inline const char *backend_dispatch_command_name(ApirBackendCommandType type) { @@ -52,7 +56,10 @@ static inline const char *backend_dispatch_command_name(ApirBackendCommandType t case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT: return "backend_buffer_type_get_alignment"; case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE: return "backend_buffer_type_get_max_size"; case APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST: return "backend_buffer_type_is_host"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER: return "backend_buffer_type_alloc_buffer"; + /* buffer */ + case APIR_COMMAND_TYPE_BUFFER_GET_BASE: return "backend_buffer_get_base"; default: return "unknown"; } } @@ -73,4 +80,8 @@ static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATC [APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT] = backend_buffer_type_get_alignment, [APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE] = backend_buffer_type_get_max_size, [APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST] = backend_buffer_type_is_host, + [APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER] = backend_buffer_type_alloc_buffer, + + /* buffer */ + [APIR_COMMAND_TYPE_BUFFER_GET_BASE] = backend_buffer_get_base, }; diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h index bc9048f44e315..d2b85c8f82196 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -466,3 +466,31 @@ vn_decode_apir_buffer_type_handle_t(struct vn_cs_decoder *dec, apir_buffer_type_ { vn_decode(dec, sizeof(apir_buffer_type_handle_t), val, sizeof(apir_buffer_type_handle_t)); } + +/* apir_buffer_handle_t */ + +static inline void +vn_encode_apir_buffer_handle_t(struct vn_cs_encoder *enc, const apir_buffer_handle_t *val) +{ + vn_encode(enc, sizeof(apir_buffer_handle_t), val, sizeof(apir_buffer_handle_t)); +} + +static inline void +vn_decode_apir_buffer_handle_t(struct vn_cs_decoder *dec, apir_buffer_handle_t *val) +{ + vn_decode(dec, sizeof(apir_buffer_handle_t), val, sizeof(apir_buffer_handle_t)); +} + +/* uintptr_t */ + +static inline void +vn_encode_uintptr_t(struct vn_cs_encoder *enc, const uintptr_t *val) +{ + vn_encode(enc, sizeof(*val), val, sizeof(*val)); +} + +static inline void +vn_decode_uintptr_t(struct vn_cs_decoder *dec, uintptr_t *val) +{ + vn_decode(dec, sizeof(*val), val, sizeof(*val)); +} diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h 
b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h index 4302424aadce0..a587cad3b23bf 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h @@ -33,19 +33,44 @@ vn_decode_ggml_tensor_inplace(struct vn_cs_decoder *dec) { return op; } -static inline void -vn_encode_ggml_buft(struct vn_cs_encoder *enc, ggml_backend_buffer_type_t buft) { - size_t buft_ctx_size = sizeof(buft->context); +/* *** ggml_backend_buffer_type_t *** */ + +// ggml_backend_buffer_type_t is a POINTER (to a struct). +// Only the host pointer is shared between the host and guest. +// The guest stores it in `buft->context`. +// The host simply writes the pointer address in the buffer variable. - vn_cs_encoder_write(enc, buft_ctx_size, &buft->context, buft_ctx_size); + +static inline void +vn_encode_apir_buffer_type_handle_t(struct vn_cs_encoder *enc, apir_buffer_type_handle_t *handle) { + vn_cs_encoder_write(enc, sizeof(*handle), handle, sizeof(*handle)); } static inline ggml_backend_buffer_type_t vn_decode_ggml_buft(struct vn_cs_decoder *dec) { - ggml_backend_buffer_type_t buft; - size_t buft_size = sizeof(buft); + apir_buffer_type_handle_t handle; + + vn_cs_decoder_read(dec, sizeof(handle), &handle, sizeof(handle)); + + return (ggml_backend_buffer_type_t) handle; +} + +/* *** ggml_backend_type_t *** */ + +// ggml_backend_buffer_t is a POINTER. +// same logic as for ggml_backend_buffer_type_t + +static inline void +vn_encode_ggml_buffer_handle(struct vn_cs_encoder *enc, const apir_buffer_handle_t *handle) { + vn_cs_encoder_write(enc, sizeof(*handle), &handle, sizeof(*handle)); +} + +static inline ggml_backend_buffer_t +vn_decode_ggml_buffer(struct vn_cs_decoder *dec) { + ggml_backend_buffer_t buffer; + size_t buffer_ptr_size = sizeof(buffer); - vn_cs_decoder_read(dec, buft_size, &buft, buft_size); + vn_cs_decoder_read(dec, buffer_ptr_size, &buffer, buffer_ptr_size); - return buft; + return buffer; } diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp index 22f962ec27579..bc22310d277bf 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp @@ -4,19 +4,24 @@ ((struct ggml_backend_remoting_device_context *) (name)->device->context)->gpu #define BUFFER_TO_GPU(name) \ - ((struct ggml_backend_remoting_device_context *) (name)->dev->context)->gpu + ((struct ggml_backend_remoting_buffer_context *) (name)->context)->gpu extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface; static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - BEING_IMPLEMENTED; + IMPLEMENTED; struct virtgpu *gpu = BUFT_TO_GPU(buft); - UNUSED(gpu); - apir_buffer_handle_t handle = apir_buffer_type_alloc_buffer(gpu, size); + struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) malloc(sizeof(*context)); + if (!context) { + FATAL("Couldn't allocate the buffer context ..."); + } + + context->gpu = gpu; + context->handle = apir_buffer_type_alloc_buffer(gpu, buft, size); - return ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) handle, size); + return ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) context, size); } static const char * @@ -69,7 +74,7 @@ static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffe 
NEXT; NOT_IMPLEMENTED; - ggml_remoting_destroy_buffer(ctx->dev_buffer); + //ggml_remoting_destroy_buffer(ctx->dev_buffer); delete ctx; } @@ -85,14 +90,11 @@ static enum ggml_status ggml_backend_remoting_buffer_init_tensor(ggml_backend_bu static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) { UNUSED(buffer); - BEING_IMPLEMENTED; - - STOP_HERE; - return NULL; - //struct virtgpu *gpu = BUFFER_TO_GPU(buffer); + IMPLEMENTED; + struct virtgpu *gpu = BUFFER_TO_GPU(buffer); - //return apir_buffer_get_base(gpu, (ggml_backend_buffer_t)buffer->context); + return apir_buffer_get_base(gpu, ((struct ggml_backend_remoting_buffer_context *) buffer->context)->handle); } static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { @@ -175,9 +177,11 @@ static void ggml_remoting_buffer_memset_async(remoting_context& ctx, remoting_bu static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { NOT_IMPLEMENTED; - ggml_backend_remoting_buffer_context * ctx = (ggml_backend_remoting_buffer_context *)buffer->context; + UNUSED(buffer); + UNUSED(value); + //ggml_backend_remoting_buffer_context * ctx = (ggml_backend_remoting_buffer_context *)buffer->context; - ggml_remoting_buffer_memset(ctx->dev_buffer, 0, value, buffer->size); + //ggml_remoting_buffer_memset(ctx->dev_buffer, 0, value, buffer->size); } const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index ecdfcc1f31384..49ab2f34e0530 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -42,6 +42,12 @@ struct ggml_backend_remoting_device_context { struct virtgpu *gpu; }; +struct ggml_backend_remoting_buffer_context { + apir_buffer_handle_t handle; + + struct virtgpu *gpu; +}; + extern const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface; extern const struct ggml_backend_device_i ggml_backend_remoting_device_interface; extern const ggml_backend_buffer_type_i ggml_backend_remoting_host_buffer_type_interface; @@ -61,23 +67,6 @@ struct remoting_device_struct; typedef std::shared_ptr remoting_device; typedef std::weak_ptr remoting_device_ref; -struct ggml_backend_remoting_buffer_context { - remoting_device_ref device; - remoting_buffer dev_buffer; - std::string name; - - ggml_backend_remoting_buffer_context(remoting_device_ref device, remoting_buffer&& dev_buffer, std::string& name) : - name(name) { - UNUSED(device); - UNUSED(dev_buffer); - } - - ~ggml_backend_remoting_buffer_context() { - ggml_remoting_destroy_buffer(dev_buffer); - } -}; - - struct remoting_context_struct { int i; }; diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp index 39c205edacef0..f072f0cac81a7 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp @@ -1,6 +1,5 @@ #include "virtgpu-forward-impl.h" -// buffer_type_alloc_buffer const char * apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { struct vn_cs_encoder *encoder; @@ -8,7 +7,8 @@ apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME); - vn_encode_ggml_buft(encoder, buft); + 
apir_buffer_type_handle_t handle = (apir_buffer_type_handle_t) buft->context; + vn_encode_apir_buffer_handle_t(encoder, &handle); REMOTE_CALL(gpu, encoder, decoder); @@ -35,7 +35,8 @@ apir_buffer_type_get_alignment(struct virtgpu *gpu, ggml_backend_buffer_type_t b REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT); - vn_encode_ggml_buft(encoder, buft); + apir_buffer_type_handle_t handle = (apir_buffer_type_handle_t) buft->context; + vn_encode_apir_buffer_handle_t(encoder, &handle); REMOTE_CALL(gpu, encoder, decoder); @@ -56,7 +57,8 @@ apir_buffer_type_get_max_size(struct virtgpu *gpu, ggml_backend_buffer_type_t bu REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE); - vn_encode_ggml_buft(encoder, buft); + apir_buffer_type_handle_t handle = (apir_buffer_type_handle_t) buft->context; + vn_encode_apir_buffer_handle_t(encoder, &handle); REMOTE_CALL(gpu, encoder, decoder); @@ -77,7 +79,8 @@ apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST); - vn_encode_ggml_buft(encoder, buft); + apir_buffer_type_handle_t handle = (apir_buffer_type_handle_t) buft->context; + vn_encode_apir_buffer_handle_t(encoder, &handle); REMOTE_CALL(gpu, encoder, decoder); @@ -90,16 +93,47 @@ apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { } apir_buffer_handle_t -apir_buffer_type_alloc_buffer(struct virtgpu *gpu, size_t size) { - UNUSED(gpu); - UNUSED(size); +apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_backend_buffer_type_t buft, size_t size) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + INFO("%s: allocate device memory (%lu)\n", __func__, size); + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER); + + apir_buffer_type_handle_t buft_handle = (apir_buffer_type_handle_t) buft->context; + vn_encode_apir_buffer_handle_t(encoder, &buft_handle); + + vn_encode_size_t(encoder, &size); + + REMOTE_CALL(gpu, encoder, decoder); + + apir_buffer_handle_t buffer_handle; + vn_decode_apir_buffer_handle_t(decoder, &buffer_handle); + INFO("%s: received buffer handle %p\n", __func__, (void *) buffer_handle); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); - return 0; + return buffer_handle; } void * -apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle) { - UNUSED(gpu); - UNUSED(buffer_handle); - return NULL; +apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_handle_t handle) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_GET_BASE); + + vn_encode_apir_buffer_handle_t(encoder, &handle); + + REMOTE_CALL(gpu, encoder, decoder); + + uintptr_t base; + vn_decode_uintptr_t(decoder, &base); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + INFO("%s: received base %p\n", __func__, (void *) base); + + return (void *) base; } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index 521029c3bee9e..dda345d27c574 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -23,5 +23,5 @@ const char *apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_t size_t apir_buffer_type_get_alignment(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); size_t apir_buffer_type_get_max_size(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); bool 
apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); -apir_buffer_handle_t apir_buffer_type_alloc_buffer(struct virtgpu *gpu, size_t size); -void *apir_buffer_get_base(struct virtgpu *gpu, ggml_backend_buffer_t buffer); +apir_buffer_handle_t apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_backend_buffer_type_t buft, size_t size); +void *apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_handle_t handle); From 25f8d24d7c6e138382193994dd65a7d170c3851c Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 15 May 2025 14:29:16 +0200 Subject: [PATCH 053/117] buffer: clean ups --- .../ggml-backend-buffer-type.cpp | 126 ------------------ .../ggml-backend-buffer.cpp | 99 +++++++++++--- .../src/ggml-remotingfrontend/ggml-remoting.h | 1 + 3 files changed, 81 insertions(+), 145 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp index bc22310d277bf..3a3d445958504 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp @@ -3,11 +3,6 @@ #define BUFT_TO_GPU(name) \ ((struct ggml_backend_remoting_device_context *) (name)->device->context)->gpu -#define BUFFER_TO_GPU(name) \ - ((struct ggml_backend_remoting_buffer_context *) (name)->context)->gpu - -extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface; - static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { IMPLEMENTED; @@ -73,125 +68,4 @@ static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffe ggml_backend_remoting_buffer_context * ctx = (ggml_backend_remoting_buffer_context *)buffer->context; NEXT; NOT_IMPLEMENTED; - - //ggml_remoting_destroy_buffer(ctx->dev_buffer); - delete ctx; -} - -static enum ggml_status ggml_backend_remoting_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - UNUSED(buffer); - UNUSED(tensor); - - NEXT; - NOT_IMPLEMENTED; - STOP_HERE; - return GGML_STATUS_SUCCESS; -} - -static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) { - UNUSED(buffer); - IMPLEMENTED; - - struct virtgpu *gpu = BUFFER_TO_GPU(buffer); - - return apir_buffer_get_base(gpu, ((struct ggml_backend_remoting_buffer_context *) buffer->context)->handle); -} - -static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { - NOT_IMPLEMENTED; - - UNUSED(buffer); - UNUSED(tensor); - UNUSED(value); - UNUSED(offset); - UNUSED(size); -} - - -static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - - NOT_IMPLEMENTED; - -#if 0 - ggml_backend_remoting_buffer_context * buf_ctx = (ggml_backend_remoting_buffer_context *)buffer->context; - remoting_buffer buf = buf_ctx->dev_buffer; - - ggml_remoting_buffer_write(buf, remoting_tensor_offset(tensor) + tensor->view_offs + offset, data, size); -#else - UNUSED(buffer); - UNUSED(tensor); - UNUSED(data); - UNUSED(offset); - UNUSED(size); -#endif -} - -static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { - NOT_IMPLEMENTED; - -#if 0 - ggml_backend_remoting_buffer_context * buf_ctx = (ggml_backend_remoting_buffer_context *)buffer->context; - - remoting_buffer buf = 
buf_ctx->dev_buffer; - - ggml_remoting_buffer_read(buf, remoting_tensor_offset(tensor) + tensor->view_offs + offset, data, size); -#else - UNUSED(buffer); - UNUSED(tensor); - UNUSED(data); - UNUSED(offset); - UNUSED(size); -#endif } - - -static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { - NOT_IMPLEMENTED; - - return true; - - UNUSED(buffer); - UNUSED(src); - UNUSED(dst); -} - -static void ggml_remoting_buffer_memset(remoting_buffer& dst, size_t offset, uint32_t c, size_t size) { - NOT_IMPLEMENTED; - - UNUSED(dst); - UNUSED(c); - UNUSED(size); - UNUSED(offset); -} - -static void ggml_remoting_buffer_memset_async(remoting_context& ctx, remoting_buffer& dst, size_t offset, uint32_t c, size_t size) { - NOT_IMPLEMENTED; - - UNUSED(ctx); - UNUSED(dst); - UNUSED(c); - UNUSED(size); - UNUSED(offset); -} - -static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - NOT_IMPLEMENTED; - - UNUSED(buffer); - UNUSED(value); - //ggml_backend_remoting_buffer_context * ctx = (ggml_backend_remoting_buffer_context *)buffer->context; - - //ggml_remoting_buffer_memset(ctx->dev_buffer, 0, value, buffer->size); -} - -const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { - /* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer, - /* .get_base = */ ggml_backend_remoting_buffer_get_base, - /* .init_tensor = */ ggml_backend_remoting_buffer_init_tensor, - /* .memset_tensor = */ ggml_backend_remoting_buffer_memset_tensor, - /* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor, - /* .cpy_tensor = */ ggml_backend_remoting_buffer_cpy_tensor, - /* .clear = */ ggml_backend_remoting_buffer_clear, - /* .reset = */ NULL, -}; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index d4cd4e013f66c..25e4ed47c29a0 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -1,49 +1,110 @@ -#include - #include "ggml-remoting.h" -void ggml_remoting_destroy_buffer(remoting_buffer& buf) { +#define BUFFER_TO_GPU(name) \ + ((struct ggml_backend_remoting_buffer_context *) (name)->context)->gpu + +static enum ggml_status ggml_backend_remoting_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + UNUSED(buffer); + UNUSED(tensor); + + NEXT; NOT_IMPLEMENTED; + STOP_HERE; + return GGML_STATUS_SUCCESS; +} + +static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) { + UNUSED(buffer); + IMPLEMENTED; - UNUSED(buf); + struct virtgpu *gpu = BUFFER_TO_GPU(buffer); + + return apir_buffer_get_base(gpu, ((struct ggml_backend_remoting_buffer_context *) buffer->context)->handle); } -static void ggml_remoting_buffer_write(remoting_buffer& dst, size_t offset, const void * src, size_t size) { +static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { NOT_IMPLEMENTED; - UNUSED(dst); + UNUSED(buffer); + UNUSED(tensor); + UNUSED(value); UNUSED(offset); - UNUSED(src); UNUSED(size); } -static void ggml_remoting_buffer_read(remoting_buffer& src, size_t offset, void * dst, size_t size) { + +static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + NOT_IMPLEMENTED; - UNUSED(src); + 
UNUSED(buffer); + UNUSED(tensor); + UNUSED(data); UNUSED(offset); + UNUSED(size); +} + +static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + NOT_IMPLEMENTED; + + UNUSED(buffer); + UNUSED(tensor); + UNUSED(data); + UNUSED(offset); + UNUSED(size); +} + + +static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { + NOT_IMPLEMENTED; + + return true; + + UNUSED(buffer); + UNUSED(src); + UNUSED(dst); +} + +static void ggml_remoting_buffer_memset(remoting_buffer& dst, size_t offset, uint32_t c, size_t size) { + NOT_IMPLEMENTED; + UNUSED(dst); + UNUSED(c); UNUSED(size); + UNUSED(offset); } -static void ggml_remoting_buffer_copy_async(remoting_context& ctx, remoting_buffer& dst, size_t dst_offset, remoting_buffer& src, size_t src_offset, size_t size) { +static void ggml_remoting_buffer_memset_async(remoting_context& ctx, remoting_buffer& dst, size_t offset, uint32_t c, size_t size) { NOT_IMPLEMENTED; UNUSED(ctx); UNUSED(dst); - UNUSED(dst_offset); - UNUSED(src); - UNUSED(src_offset); + UNUSED(c); UNUSED(size); + UNUSED(offset); } -static void * const remoting_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT +static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + UNUSED(buffer); + UNUSED(value); -static uint64_t remoting_tensor_offset(const ggml_tensor * tensor) { NOT_IMPLEMENTED; +} + +static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffer) { + UNUSED(buffer); - if (tensor->view_src) { - return (uint8_t *) tensor->view_src->data - (uint8_t *) remoting_ptr_base; - } - return (uint8_t *) tensor->data - (uint8_t *) remoting_ptr_base; + NOT_IMPLEMENTED; } + +const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { + /* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer, + /* .get_base = */ ggml_backend_remoting_buffer_get_base, + /* .init_tensor = */ ggml_backend_remoting_buffer_init_tensor, + /* .memset_tensor = */ ggml_backend_remoting_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_remoting_buffer_cpy_tensor, + /* .clear = */ ggml_backend_remoting_buffer_clear, + /* .reset = */ NULL, +}; diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index 49ab2f34e0530..8072c0e356d48 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -51,6 +51,7 @@ struct ggml_backend_remoting_buffer_context { extern const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface; extern const struct ggml_backend_device_i ggml_backend_remoting_device_interface; extern const ggml_backend_buffer_type_i ggml_backend_remoting_host_buffer_type_interface; +extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface; ggml_backend_buffer_type_t ggml_backend_remoting_host_buffer_type(); ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params); From db107bb35fff93df18814a87a6f7b718873303e2 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 15 May 2025 16:58:25 +0200 Subject: [PATCH 054/117] Keep working on buffers --- ggml/src/ggml-remotingbackend/CMakeLists.txt | 1 + .../ggml-remotingbackend/backend-convert.h | 7 ++ .../backend-dispatched-buffer-type.cpp | 15 
---- .../backend-dispatched-buffer.cpp | 20 ++++++ .../backend-dispatched-device.cpp | 2 - .../ggml-remotingbackend/backend-dispatched.h | 3 +- .../shared/apir_backend.h | 2 + .../shared/venus_cs_ggml.h | 69 +++++++++++++++---- ggml/src/ggml-remotingfrontend/CMakeLists.txt | 1 + .../ggml-backend-buffer-type.cpp | 6 -- .../ggml-backend-buffer.cpp | 20 ++---- .../ggml-backend-device.cpp | 6 +- .../ggml-backend-host-buffer-type.cpp | 18 ++++- .../src/ggml-remotingfrontend/ggml-remoting.h | 15 +++- .../virtgpu-forward-buffer-type.cpp | 23 +------ .../virtgpu-forward-buffer.cpp | 22 ++++++ .../virtgpu-forward-impl.h | 1 + .../ggml-remotingfrontend/virtgpu-forward.h | 8 ++- 18 files changed, 155 insertions(+), 84 deletions(-) create mode 100644 ggml/src/ggml-remotingbackend/backend-convert.h create mode 100644 ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp create mode 100644 ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp diff --git a/ggml/src/ggml-remotingbackend/CMakeLists.txt b/ggml/src/ggml-remotingbackend/CMakeLists.txt index 17ca5e1f53a54..feca344c90a64 100644 --- a/ggml/src/ggml-remotingbackend/CMakeLists.txt +++ b/ggml/src/ggml-remotingbackend/CMakeLists.txt @@ -7,6 +7,7 @@ ggml_add_backend_library(ggml-remotingbackend backend.cpp backend-dispatched.cpp backend-dispatched-device.cpp + backend-dispatched-buffer.cpp backend-dispatched-buffer-type.cpp backend-utils.cpp shared/api_remoting.h diff --git a/ggml/src/ggml-remotingbackend/backend-convert.h b/ggml/src/ggml-remotingbackend/backend-convert.h new file mode 100644 index 0000000000000..e7d875cde7ee8 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-convert.h @@ -0,0 +1,7 @@ +#include "shared/apir_backend.h" + +static inline apir_buffer_handle_t +ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) { + // in the backend, the buffer handle is the buffer pointer + return (apir_buffer_handle_t) buffer; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp index cceec68064742..da8a50d67ccb7 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp @@ -6,8 +6,6 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" -#include "ggml-metal.h" - uint32_t backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { ggml_backend_buffer_type_t buft; @@ -69,16 +67,3 @@ backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder return 0; } - -uint32_t -backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { - ggml_backend_buffer_t buffer; - buffer = vn_decode_ggml_buffer(dec); - - uintptr_t base = (uintptr_t) buffer->iface.get_base(buffer); - vn_encode_uintptr_t(enc, &base); - - INFO("%s: send base %p\n", __func__, (void *) base); - - return 0; -} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp new file mode 100644 index 0000000000000..095a95f1a6fae --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp @@ -0,0 +1,20 @@ +#include +#include "backend-internal.h" +#include "backend-dispatched.h" + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" + +uint32_t +backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + ggml_backend_buffer_t buffer; + buffer = vn_decode_ggml_buffer(dec); + + uintptr_t 
base = (uintptr_t) buffer->iface.get_base(buffer); + vn_encode_uintptr_t(enc, &base); + + //INFO("%s: send base %p\n", __func__, (void *) base); + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp index 2db2e75816258..21b603a3160b6 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp @@ -6,8 +6,6 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" -#include "ggml-metal.h" - uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { UNUSED(dec); diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h index 26e2762bf72b5..460d3a1af4e05 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.h +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -6,11 +6,11 @@ #include #include "backend-utils.h" +#include "backend-convert.h" #include "shared/apir_backend.h" #include "shared/venus_cs.h" #include "shared/venus_cs_ggml.h" - uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_backend_init_fct_p); typedef uint32_t (*backend_dispatch_t)(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); @@ -60,6 +60,7 @@ static inline const char *backend_dispatch_command_name(ApirBackendCommandType t /* buffer */ case APIR_COMMAND_TYPE_BUFFER_GET_BASE: return "backend_buffer_get_base"; + default: return "unknown"; } } diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 644fae7938379..08433c014f2a3 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -43,6 +43,8 @@ typedef enum ApirBackendCommandType { APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE = 10, APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST = 11, APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER = 12, + + /* buffer */ APIR_COMMAND_TYPE_BUFFER_GET_BASE = 13, // last command_type index + 1 diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h index a587cad3b23bf..637a0a8368aad 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h @@ -1,19 +1,45 @@ // needs the ggml-backend-impl.h definition // needs venus_cs.h definition +// needs +// ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer); + +static inline void +vn_encode_ggml_buffer_handle(struct vn_cs_encoder *enc, const apir_buffer_handle_t *handle); + +static inline ggml_backend_buffer_t +vn_decode_ggml_buffer(struct vn_cs_decoder *dec); + static inline void -vn_encode_ggml_tensor(struct vn_cs_encoder *enc, const ggml_tensor *op) { - size_t tensor_size = sizeof(*op); +vn_encode_ggml_tensor(struct vn_cs_encoder *enc, const ggml_tensor *tensor) { + size_t tensor_size = sizeof(*tensor); - if (op->buffer || op->data || op->view_src || op->extra) { - FATAL("Cannot pass tensors with data"); + if (tensor->view_src) { + FATAL("Cannot pass tensors with view_src"); + } + if (tensor->extra) { + FATAL("Cannot pass tensors with extra"); + } + + if (tensor->src[0] && tensor->buffer) { + // not sure if the buffer needs to be updated inside the src tensors or not + FATAL("Cannot pass tensors with src and buffer"); } - vn_cs_encoder_write(enc, tensor_size, op, tensor_size); + vn_cs_encoder_write(enc, tensor_size, tensor, 
tensor_size); - for (int i = 0; op->src[i]; i++) { - const ggml_tensor *src_op = op->src[i]; - vn_cs_encoder_write(enc, tensor_size, src_op, tensor_size); + // tensor->data is a pointer inside the device buffer. No need to touch it + // tensor->buffer is a pointer to a buffer. Encoding the buffer handle in sequence. + // (could also make a copy of the tensor, and update locally.) + + if (tensor->buffer) { + apir_buffer_handle_t buffer_handle = ggml_buffer_to_apir_handle(tensor->buffer); + vn_encode_ggml_buffer_handle(enc, &buffer_handle); + } + + for (int i = 0; tensor->src[i]; i++) { + const ggml_tensor *src_tensor = tensor->src[i]; + vn_cs_encoder_write(enc, tensor_size, src_tensor, tensor_size); } } @@ -22,15 +48,20 @@ vn_decode_ggml_tensor_inplace(struct vn_cs_decoder *dec) { // it safe to remove the `const` qualifier here, we *do* want to // modify the shared memory data to fix the `src` pointers. - ggml_tensor *op = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); + ggml_tensor *tensor = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); + // tensor->data is a pointer inside the device buffer. No need to touch it + // tensor->buffer is a pointer to a buffer. Decode the buffer handle encoded in sequence. + if (tensor->buffer) { + tensor->buffer = vn_decode_ggml_buffer(dec); + } - for (int i = 0; op->src[i]; i++) { - ggml_tensor *src_op = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); - op->src[i] = src_op; // overwrite op->src[i] pointer with the actual location of the src tensor + for (int i = 0; tensor->src[i]; i++) { + ggml_tensor *src_tensor = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); + tensor->src[i] = src_tensor; // overwrite op->src[i] pointer with the actual location of the src tensor } - return op; + return tensor; } /* *** ggml_backend_buffer_type_t *** */ @@ -74,3 +105,15 @@ vn_decode_ggml_buffer(struct vn_cs_decoder *dec) { return buffer; } + +/* enum ggml_status */ + +static inline void +vn_encode_ggml_status(struct vn_cs_encoder *enc, const enum ggml_status *status) { + vn_cs_encoder_write(enc, sizeof(*status), &status, sizeof(*status)); +} + +static inline void +vn_decode_ggml_status(struct vn_cs_decoder *dec, enum ggml_status *status) { + vn_cs_decoder_read(dec, sizeof(*status), status, sizeof(*status)); +} diff --git a/ggml/src/ggml-remotingfrontend/CMakeLists.txt b/ggml/src/ggml-remotingfrontend/CMakeLists.txt index a2b3277584b38..b77a0254a7a6c 100644 --- a/ggml/src/ggml-remotingfrontend/CMakeLists.txt +++ b/ggml/src/ggml-remotingfrontend/CMakeLists.txt @@ -15,6 +15,7 @@ ggml_add_backend_library(ggml-remotingfrontend virtgpu-utils.cpp virtgpu-forward-device.cpp virtgpu-forward-buffer-type.cpp + virtgpu-forward-buffer.cpp virtgpu-forward-impl.h ../../include/ggml-remoting-frontend.h ) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp index 3a3d445958504..bb326570d975c 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp @@ -63,9 +63,3 @@ const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = { }; /****************************************************************************************/ - -static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_backend_remoting_buffer_context * ctx = 
(ggml_backend_remoting_buffer_context *)buffer->context; - NEXT; - NOT_IMPLEMENTED; -} diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index 25e4ed47c29a0..27a8efdea0d7e 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -3,28 +3,19 @@ #define BUFFER_TO_GPU(name) \ ((struct ggml_backend_remoting_buffer_context *) (name)->context)->gpu -static enum ggml_status ggml_backend_remoting_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - UNUSED(buffer); - UNUSED(tensor); - - NEXT; - NOT_IMPLEMENTED; - STOP_HERE; - return GGML_STATUS_SUCCESS; -} - static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) { - UNUSED(buffer); - IMPLEMENTED; + //IMPLEMENTED; struct virtgpu *gpu = BUFFER_TO_GPU(buffer); - return apir_buffer_get_base(gpu, ((struct ggml_backend_remoting_buffer_context *) buffer->context)->handle); + return apir_buffer_get_base(gpu, BUFFER_TO_HANDLE(buffer)); } static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { NOT_IMPLEMENTED; + STOP_HERE; + UNUSED(buffer); UNUSED(tensor); UNUSED(value); @@ -34,7 +25,6 @@ static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buf static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - NOT_IMPLEMENTED; UNUSED(buffer); @@ -100,7 +90,7 @@ static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffe const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { /* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer, /* .get_base = */ ggml_backend_remoting_buffer_get_base, - /* .init_tensor = */ ggml_backend_remoting_buffer_init_tensor, + /* .init_tensor = */ NULL, /* .memset_tensor = */ ggml_backend_remoting_buffer_memset_tensor, /* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor, /* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor, diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index 9a72139b4d2ed..a7d0d9eb69c5d 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -41,7 +41,7 @@ ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, s static bool ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { - IMPLEMENTED; + //IMPLEMENTED; struct virtgpu *gpu = DEV_TO_GPU(dev); @@ -135,9 +135,7 @@ static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_host_buffer_t /* .context = */ nullptr, }; - // Make sure device 0 is initialized - //ggml_remoting_instance_init(); - //ggml_remoting_get_device(0); + //IMPLEMENTED; return &host_bufft; } diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp index 3aef4b86e2b6a..847a1ec0500fc 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp @@ -33,6 +33,7 @@ ggml_backend_remoting_host_buffer_type_get_alignment(ggml_backend_buffer_type_t UNUSED(buft); NOT_IMPLEMENTED; + STOP_HERE; return 4096; } @@ -41,16 +42,27 @@ static bool 
ggml_backend_remoting_host_buffer_type_is_host(ggml_backend_buffer_type_t buft) { UNUSED(buft); - NOT_IMPLEMENTED; + IMPLEMENTED; + STOP_HERE; return true; } +static size_t +ggml_backend_remoting_host_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + UNUSED(buft); + + IMPLEMENTED; + STOP_HERE; + + return SIZE_MAX; +} + const ggml_backend_buffer_type_i ggml_backend_remoting_host_buffer_type_interface = { /* .get_name = */ ggml_backend_remoting_host_buffer_type_get_name, /* .alloc_buffer = */ ggml_backend_remoting_host_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_remoting_host_buffer_type_get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_max_size = */ ggml_backend_remoting_host_buffer_type_get_max_size, /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, - /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + /* .is_host = */ ggml_backend_remoting_host_buffer_type_is_host, }; diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index 8072c0e356d48..dc184c300f24d 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -10,6 +10,9 @@ #include "ggml-backend.h" #include "virtgpu.h" +#define BUFFER_TO_HANDLE(name) \ + ((struct ggml_backend_remoting_buffer_context *) (name)->context)->handle + #define NOT_IMPLEMENTED \ do { \ static bool first = true; \ @@ -29,8 +32,8 @@ #define STOP_HERE \ thks_bye() -#define IMPLEMENTED -// printf("INFO: ### reached implemented function %s\n", __func__) +#define IMPLEMENTED \ + printf("INFO: ### reached implemented function %s\n", __func__) #define RMT_LOG_DEBUG(msg) std::cerr << msg << std::endl @@ -48,6 +51,14 @@ struct ggml_backend_remoting_buffer_context { struct virtgpu *gpu; }; +static inline apir_buffer_handle_t ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) { + +// return buffer?0:1; + struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) buffer->context; + + return context->handle; +} + extern const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface; extern const struct ggml_backend_device_i ggml_backend_remoting_device_interface; extern const ggml_backend_buffer_type_i ggml_backend_remoting_host_buffer_type_interface; diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp index f072f0cac81a7..a8d5b351688ff 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp @@ -87,6 +87,8 @@ apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { bool is_host; vn_decode_bool_t(decoder, &is_host); + INFO("%s: buffer is host? 
%d", __func__, is_host); + REMOTE_CALL_FINISH(gpu, encoder, decoder); return is_host; @@ -116,24 +118,3 @@ apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_backend_buffer_type_t bu return buffer_handle; } - -void * -apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_handle_t handle) { - struct vn_cs_encoder *encoder; - struct vn_cs_decoder *decoder; - - REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_GET_BASE); - - vn_encode_apir_buffer_handle_t(encoder, &handle); - - REMOTE_CALL(gpu, encoder, decoder); - - uintptr_t base; - vn_decode_uintptr_t(decoder, &base); - - REMOTE_CALL_FINISH(gpu, encoder, decoder); - - INFO("%s: received base %p\n", __func__, (void *) base); - - return (void *) base; -} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp new file mode 100644 index 0000000000000..4ccadf98f1d7f --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp @@ -0,0 +1,22 @@ +#include "virtgpu-forward-impl.h" + +void * +apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_GET_BASE); + + vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); + + REMOTE_CALL(gpu, encoder, decoder); + + uintptr_t base; + vn_decode_uintptr_t(decoder, &base); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + //INFO("%s: received base %p\n", __func__, (void *) base); + + return (void *) base; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h index 4f9af992d70c9..a7ed708851d8f 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h @@ -1,4 +1,5 @@ #include "ggml-backend-impl.h" +#include "ggml-remoting.h" #include "virtgpu.h" #include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/apir_backend.h" #include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h" diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index dda345d27c574..074af30275621 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -23,5 +23,9 @@ const char *apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_t size_t apir_buffer_type_get_alignment(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); size_t apir_buffer_type_get_max_size(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); bool apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); -apir_buffer_handle_t apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_backend_buffer_type_t buft, size_t size); -void *apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_handle_t handle); +apir_buffer_handle_t apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_backend_buffer_type_t buffer_buft, size_t size); + +/* buffer */ + +void *apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle); +enum ggml_status apir_buffer_init_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, ggml_tensor *tensor); From 248f69590d475e04d2f2d3eb1ef7bc8e5d1036b1 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Fri, 16 May 2025 10:59:21 +0200 Subject: [PATCH 055/117] build.backend: build llama-run --- build.backend.sh 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.backend.sh b/build.backend.sh index b32c24b9ba035..086f7a4577ddd 100755 --- a/build.backend.sh +++ b/build.backend.sh @@ -4,7 +4,7 @@ rm -f READY_backend FAILED_backend echo "int isatty(int fd) { return 1; }" | gcc -O2 -fpic -shared -ldl -o /tmp/isatty.so -xc - export LD_PRELOAD=/tmp/isatty.so -cmake --build ../build.remoting-backend --parallel 8 --target llama-cli "$@" +cmake --build ../build.remoting-backend --parallel 8 --target llama-run "$@" if [[ $? == 0 ]]; then touch READY_backend From 2e70ad0965387831b5b35304a9cc335c08400128 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Fri, 16 May 2025 10:59:42 +0200 Subject: [PATCH 056/117] ggml: src: ggml-remotingbackend/shared/venus_cs: fix memory corruption caused by vn_decode ... --- ggml/src/ggml-remotingbackend/shared/venus_cs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h index d2b85c8f82196..c8149a5b58a29 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -444,13 +444,13 @@ vn_cs_decoder_alloc_array(struct vn_cs_decoder *dec, size_t size, size_t count) static inline void vn_encode_bool_t(struct vn_cs_encoder *enc, const bool *val) { - vn_encode(enc, sizeof(int), val, sizeof(int)); + vn_encode(enc, sizeof(int), val, sizeof(bool)); } static inline void vn_decode_bool_t(struct vn_cs_decoder *dec, bool *val) { - vn_decode(dec, sizeof(int), val, sizeof(int)); + vn_decode(dec, sizeof(int), val, sizeof(bool)); } /* apir_buffer_type_handle_t */ From 4d7d6dc0e4ce44fc811b8afeb26fe091207d3bbf Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Fri, 16 May 2025 11:00:30 +0200 Subject: [PATCH 057/117] ggml: src: ggml-remotingfrontend/ggml-backend-device: handcode the caps --- .../src/ggml-remotingfrontend/ggml-backend-device.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index a7d0d9eb69c5d..ef48bd6fae96e 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -74,24 +74,27 @@ static void ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { IMPLEMENTED; - struct virtgpu *gpu = DEV_TO_GPU(dev); - props->name = ggml_backend_remoting_device_get_name(dev); props->description = ggml_backend_remoting_device_get_description(dev); props->type = ggml_backend_remoting_device_get_type(dev); ggml_backend_remoting_device_get_memory(dev, &props->memory_free, &props->memory_total); +#if 0 + struct virtgpu *gpu = DEV_TO_GPU(dev); apir_device_get_props(gpu, &props->caps.async, &props->caps.host_buffer, &props->caps.buffer_from_host_ptr, &props->caps.events ); - +#else // ignore the actual backend answers and set it as we provide it in // the API Remoting frontend - props->caps.host_buffer = true; + props->caps.async = false; + props->caps.host_buffer = false; props->caps.buffer_from_host_ptr = false; + props->caps.events = false; +#endif INFO("%s: async=%d, host_buffer=%d!, buffer_from_host_ptr=%d!, events=%d", __func__, props->caps.async, props->caps.host_buffer, From 6f0578fd2a0002a3c62527a4bdb5444d6fb1c40e Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Fri, 16 May 2025 14:14:53 +0200 Subject: [PATCH 058/117] remoting: implement 
buffer_set_tensor --- .../backend-dispatched-buffer.cpp | 28 +++++++++++++++++++ .../ggml-remotingbackend/backend-dispatched.h | 3 ++ .../shared/apir_backend.h | 3 +- .../ggml-backend-buffer.cpp | 14 ++++++---- .../virtgpu-forward-buffer.cpp | 23 +++++++++++++++ .../ggml-remotingfrontend/virtgpu-forward.h | 2 ++ 6 files changed, 66 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp index 095a95f1a6fae..85b313f52b58e 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp @@ -18,3 +18,31 @@ backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { return 0; } + +uint32_t +backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { + UNUSED(enc); + + ggml_backend_buffer_t buffer; + buffer = vn_decode_ggml_buffer(dec); + + ggml_tensor *tensor; + // safe to remove the const qualifier here + tensor = (ggml_tensor *) (uintptr_t) vn_decode_ggml_tensor_inplace(dec); + + void *data; + vn_decode_uintptr_t(dec, (uintptr_t *) &data); + + size_t offset; + vn_decode_size_t(dec, &offset); + + size_t size; + vn_decode_size_t(dec, &size); + + INFO("Calling (%p)->set_tensor(tensor=%p, data=%p, offset=%lu, size=%lu", + buffer, tensor, data, offset, size); + + //buffer->iface.set_tensor(buffer, tensor, data, offset, size); + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h index 460d3a1af4e05..ce8cbc98eea24 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.h +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -37,6 +37,7 @@ uint32_t backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_c /* buffer */ uint32_t backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +uint32_t backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); static inline const char *backend_dispatch_command_name(ApirBackendCommandType type) { @@ -60,6 +61,7 @@ static inline const char *backend_dispatch_command_name(ApirBackendCommandType t /* buffer */ case APIR_COMMAND_TYPE_BUFFER_GET_BASE: return "backend_buffer_get_base"; + case APIR_COMMAND_TYPE_BUFFER_SET_TENSOR: return "backend_buffer_set_tensor"; default: return "unknown"; } @@ -85,4 +87,5 @@ static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATC /* buffer */ [APIR_COMMAND_TYPE_BUFFER_GET_BASE] = backend_buffer_get_base, + [APIR_COMMAND_TYPE_BUFFER_SET_TENSOR] = backend_buffer_set_tensor, }; diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 08433c014f2a3..cbc181c0089d4 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -46,7 +46,8 @@ typedef enum ApirBackendCommandType { /* buffer */ APIR_COMMAND_TYPE_BUFFER_GET_BASE = 13, + APIR_COMMAND_TYPE_BUFFER_SET_TENSOR = 14, // last command_type index + 1 - APIR_BACKEND_DISPATCH_TABLE_COUNT = 14, + APIR_BACKEND_DISPATCH_TABLE_COUNT = 15, } ApirBackendCommandType; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index 27a8efdea0d7e..aa0730efa55b9 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -25,13 +25,15 @@ 
static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buf static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - NOT_IMPLEMENTED; + BEING_IMPLEMENTED; - UNUSED(buffer); - UNUSED(tensor); - UNUSED(data); - UNUSED(offset); - UNUSED(size); + struct virtgpu *gpu = BUFFER_TO_GPU(buffer); + + INFO("%s: data=%p, offset=%lu, size=%lu\n", __func__, data, offset, size); + + apir_buffer_set_tensor(gpu, BUFFER_TO_HANDLE(buffer), tensor, data, offset, size); + + return; } static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp index 4ccadf98f1d7f..550a849dcd2f0 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp @@ -20,3 +20,26 @@ apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle) { return (void *) base; } + +void +apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, + ggml_tensor *tensor, const void *data, size_t offset, size_t size) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + INFO("Calling (%p)->set_tensor(tensor=%p, data=%p, offset=%lu, size=%lu"); + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_SET_TENSOR); + + vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); + vn_encode_ggml_tensor(encoder, tensor); + vn_encode_uintptr_t(encoder, (uintptr_t *) &data); + vn_encode_size_t(encoder, &offset); + vn_encode_size_t(encoder, &size); + + REMOTE_CALL(gpu, encoder, decoder); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index 074af30275621..2790adbb62454 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -29,3 +29,5 @@ apir_buffer_handle_t apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_bac void *apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle); enum ggml_status apir_buffer_init_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, ggml_tensor *tensor); +void apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, + ggml_tensor *tensor, const void *data, size_t offset, size_t size); From 6f396ccc658c3ecc9cfa039114f209b5372a1b19 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Fri, 16 May 2025 14:15:06 +0200 Subject: [PATCH 059/117] remoting: improve --- .../ggml-remotingfrontend/ggml-backend-buffer-type.cpp | 2 +- ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp | 2 +- ggml/src/ggml-remotingfrontend/ggml-backend.cpp | 10 +++++----- .../virtgpu-forward-buffer-type.cpp | 5 ++--- .../ggml-remotingfrontend/virtgpu-forward-buffer.cpp | 3 ++- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp index bb326570d975c..631db50b309cc 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp @@ -21,7 +21,7 @@ ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, static const char * 
ggml_backend_remoting_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - IMPLEMENTED; + //IMPLEMENTED; struct virtgpu *gpu = BUFT_TO_GPU(buft); diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index aa0730efa55b9..069886358f6f0 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -58,7 +58,7 @@ static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer } static void ggml_remoting_buffer_memset(remoting_buffer& dst, size_t offset, uint32_t c, size_t size) { - NOT_IMPLEMENTED; + BEING_IMPLEMENTED; UNUSED(dst); UNUSED(c); diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp index 6c2f2b947e10b..4bd321b5fc5c9 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp @@ -9,15 +9,17 @@ static const char * ggml_backend_remoting_get_name(ggml_backend_t backend) { } static void ggml_backend_remoting_free(ggml_backend_t backend) { - UNUSED(backend); + IMPLEMENTED; - NOT_IMPLEMENTED; + delete backend; } static ggml_status ggml_backend_remoting_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { UNUSED(backend); UNUSED(cgraph); + NOT_IMPLEMENTED; + return GGML_STATUS_SUCCESS; } @@ -38,9 +40,7 @@ static ggml_backend_i ggml_backend_remoting_interface = { }; static ggml_guid_t ggml_backend_remoting_guid() { - static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b }; - - NOT_IMPLEMENTED; + static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x14, 0x03, 0x86, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b }; return &guid; } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp index a8d5b351688ff..645780715a133 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp @@ -19,7 +19,7 @@ apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) } vn_decode_char_array(decoder, string, string_size); - INFO("%s: Forward BUFT NAME --> %s", __func__, string); + //INFO("%s: Forward BUFT NAME --> %s", __func__, string); /* *** */ @@ -99,7 +99,7 @@ apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_backend_buffer_type_t bu struct vn_cs_encoder *encoder; struct vn_cs_decoder *decoder; - INFO("%s: allocate device memory (%lu)\n", __func__, size); + INFO("%s: allocate device memory (%lu)", __func__, size); REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER); @@ -112,7 +112,6 @@ apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_backend_buffer_type_t bu apir_buffer_handle_t buffer_handle; vn_decode_apir_buffer_handle_t(decoder, &buffer_handle); - INFO("%s: received buffer handle %p\n", __func__, (void *) buffer_handle); REMOTE_CALL_FINISH(gpu, encoder, decoder); diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp index 550a849dcd2f0..ad65804ab27b0 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp @@ -27,7 +27,8 @@ apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, struct vn_cs_encoder *encoder; struct vn_cs_decoder *decoder; - 
INFO("Calling (%p)->set_tensor(tensor=%p, data=%p, offset=%lu, size=%lu"); + INFO("Calling (%p)->set_tensor(tensor=%p, data=%p, offset=%lu, size=%lu", + buffer_handle, tensor, data, offset, size); REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_SET_TENSOR); From d40a3da85fc6c3eab5bfe386fa5a366c5ab6c2ff Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 19 May 2025 11:21:46 +0200 Subject: [PATCH 060/117] remotingbackend: accept the virgl context argument --- .../backend-dispatched-buffer-type.cpp | 15 ++++++--- .../backend-dispatched-buffer.cpp | 6 ++-- .../backend-dispatched-device.cpp | 25 ++++++++++----- .../ggml-remotingbackend/backend-dispatched.h | 32 +++++++++---------- .../ggml-remotingbackend/backend-internal.h | 2 +- ggml/src/ggml-remotingbackend/backend.cpp | 4 +-- .../shared/apir_backend.h | 16 +++++++++- .../shared/venus_cs_ggml.h | 12 +++++++ 8 files changed, 77 insertions(+), 35 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp index da8a50d67ccb7..f09592ea5df43 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp @@ -7,7 +7,8 @@ #include "ggml-backend.h" uint32_t -backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { +backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); ggml_backend_buffer_type_t buft; buft = vn_decode_ggml_buft(dec); @@ -21,7 +22,8 @@ backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *de } uint32_t -backend_buffer_type_get_alignment(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { +backend_buffer_type_get_alignment(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); ggml_backend_buffer_type_t buft; buft = vn_decode_ggml_buft(dec); @@ -32,7 +34,8 @@ backend_buffer_type_get_alignment(struct vn_cs_encoder *enc, struct vn_cs_decode } uint32_t -backend_buffer_type_get_max_size(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { +backend_buffer_type_get_max_size(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); ggml_backend_buffer_type_t buft; buft = vn_decode_ggml_buft(dec); @@ -43,7 +46,8 @@ backend_buffer_type_get_max_size(struct vn_cs_encoder *enc, struct vn_cs_decoder } uint32_t -backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { +backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); ggml_backend_buffer_type_t buft; buft = vn_decode_ggml_buft(dec); @@ -54,7 +58,8 @@ backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec } uint32_t -backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { +backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); ggml_backend_buffer_type_t buft; buft = vn_decode_ggml_buft(dec); diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp index 85b313f52b58e..ff35a492cf100 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp @@ -7,7 +7,8 @@ #include "ggml-backend.h" 
uint32_t -backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { +backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); ggml_backend_buffer_t buffer; buffer = vn_decode_ggml_buffer(dec); @@ -20,7 +21,8 @@ backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { } uint32_t -backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { +backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); UNUSED(enc); ggml_backend_buffer_t buffer; diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp index 21b603a3160b6..ba2ec479a95c0 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp @@ -6,7 +6,9 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" -uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { +uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(ctx); UNUSED(dec); int32_t dev_count = reg->iface.get_device_count(reg); @@ -15,7 +17,8 @@ uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_de return 0; } -uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { +uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); UNUSED(dec); const char *string = dev->iface.get_name(dev); @@ -28,7 +31,8 @@ uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder } uint32_t -backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { +backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); UNUSED(dec); const char *string = dev->iface.get_description(dev); @@ -41,7 +45,8 @@ backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder * } uint32_t -backend_device_get_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { +backend_device_get_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); UNUSED(dec); uint32_t type = dev->iface.get_type(dev); @@ -51,7 +56,8 @@ backend_device_get_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { } uint32_t -backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { +backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); UNUSED(dec); size_t free, total; @@ -64,7 +70,8 @@ backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) } uint32_t -backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { +backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); const ggml_tensor *op = vn_decode_ggml_tensor_inplace(dec); bool supports_op = dev->iface.supports_op(dev, op); @@ -75,7 +82,8 @@ backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) } uint32_t -backend_device_get_buffer_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { +backend_device_get_buffer_type(struct 
vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); UNUSED(dec); ggml_backend_buffer_type_t bufft = dev->iface.get_buffer_type(dev); @@ -87,7 +95,8 @@ backend_device_get_buffer_type(struct vn_cs_encoder *enc, struct vn_cs_decoder * } uint32_t -backend_device_get_props(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec) { +backend_device_get_props(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); UNUSED(dec); struct ggml_backend_dev_props props; diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h index ce8cbc98eea24..faa3dacfc2297 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.h +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -13,31 +13,31 @@ uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_backend_init_fct_p); -typedef uint32_t (*backend_dispatch_t)(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +typedef uint32_t (*backend_dispatch_t)(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); /* *** */ -uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +uint32_t backend_reg_get_device_count(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); /* device */ -uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); -uint32_t backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); -uint32_t backend_device_get_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); -uint32_t backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); -uint32_t backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); -uint32_t backend_device_get_buffer_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); -uint32_t backend_device_get_props(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +uint32_t backend_device_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_get_description(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_get_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_get_buffer_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_get_props(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); /* buffer-type */ -uint32_t backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); -uint32_t backend_buffer_type_get_alignment(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); -uint32_t backend_buffer_type_get_max_size(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); -uint32_t backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); -uint32_t backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +uint32_t backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t 
backend_buffer_type_get_alignment(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_type_get_max_size(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); /* buffer */ -uint32_t backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); -uint32_t backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec); +uint32_t backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); static inline const char *backend_dispatch_command_name(ApirBackendCommandType type) { diff --git a/ggml/src/ggml-remotingbackend/backend-internal.h b/ggml/src/ggml-remotingbackend/backend-internal.h index 7fd803c2aa5dd..5c29e18d4596a 100644 --- a/ggml/src/ggml-remotingbackend/backend-internal.h +++ b/ggml/src/ggml-remotingbackend/backend-internal.h @@ -21,7 +21,7 @@ extern ggml_backend_dev_t dev; extern "C" { uint32_t apir_backend_initialize(); void apir_backend_deinit(void); - uint32_t apir_backend_dispatcher(uint32_t cmd_type, + uint32_t apir_backend_dispatcher(uint32_t cmd_type, struct virgl_apir_context *ctx, char *dec_cur, const char *dec_end, char *enc_cur, const char *enc_end, char **enc_cur_after); diff --git a/ggml/src/ggml-remotingbackend/backend.cpp b/ggml/src/ggml-remotingbackend/backend.cpp index c32353586a10b..c9d784941d514 100644 --- a/ggml/src/ggml-remotingbackend/backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend.cpp @@ -58,7 +58,7 @@ extern "C" { return backend_dispatch_initialize(ggml_backend_reg_fct, ggml_backend_init_fct); } - uint32_t apir_backend_dispatcher(uint32_t cmd_type, + uint32_t apir_backend_dispatcher(uint32_t cmd_type, struct virgl_apir_context *ctx, char *dec_cur, const char *dec_end, char *enc_cur, const char *enc_end, char **enc_cur_after) { @@ -82,7 +82,7 @@ extern "C" { } backend_dispatch_t forward_fct = apir_backend_dispatch_table[cmd_type]; - uint32_t ret = forward_fct(enc, dec); + uint32_t ret = forward_fct(enc, dec, ctx); *enc_cur_after = enc->cur; diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index cbc181c0089d4..96bbb59fda14c 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -20,7 +20,10 @@ typedef uintptr_t apir_buffer_handle_t; typedef uint32_t (*apir_backend_initialize_t)(void); typedef void (*apir_backend_deinit_t)(void); -typedef uint32_t (*apir_backend_dispatch_t)(uint32_t cmd_type, +struct vn_dispatch_context; +struct virgl_apir_context; + +typedef uint32_t (*apir_backend_dispatch_t)(uint32_t cmd_type, struct virgl_apir_context *ctx, char *dec_cur, const char *dec_end, char *enc_cur, const char *enc_end, char **enc_cur_after @@ -51,3 +54,14 @@ typedef enum ApirBackendCommandType { // last command_type index + 1 APIR_BACKEND_DISPATCH_TABLE_COUNT = 15, } ApirBackendCommandType; + + +struct virgl_apir_callbacks { + void *(*get_shmem_ptr)(struct vn_dispatch_context *ctx, uint32_t res_id); +} ; + +struct virgl_apir_context { + struct vn_dispatch_context *virgl_ctx; + 
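  /* The callbacks let a dispatch handler resolve a guest shmem res_id into a
   * host-mapped pointer without linking against virglrenderer directly.
   * Typical usage inside a handler, as backend_buffer_set_tensor does later in
   * this series (sketch of the call sequence, not a new API):
   *
   *   uint32_t shmem_res_id;
   *   vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id);
   *   void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id);
   *   if (!shmem_data)
   *     FATAL("Couldn't get the shmem addr from virgl :/");
   */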
+ struct virgl_apir_callbacks iface; +}; diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h index 637a0a8368aad..8a73537a45204 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h @@ -117,3 +117,15 @@ static inline void vn_decode_ggml_status(struct vn_cs_decoder *dec, enum ggml_status *status) { vn_cs_decoder_read(dec, sizeof(*status), status, sizeof(*status)); } + +/* vn_renderer_shmem */ + +static inline void +vn_encode_virtgpu_shmem_res_id(struct vn_cs_encoder *enc, uint32_t shmem_res_id) { + vn_encode_uint32_t(enc, &shmem_res_id); +} + +static inline void +vn_decode_virtgpu_shmem_res_id(struct vn_cs_decoder *dec, uint32_t *shmem_res_id) { + vn_decode_uint32_t(dec, shmem_res_id); +} From b815c1a3c5dcb1c41f991857699e7e59b4707389 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 19 May 2025 11:23:31 +0200 Subject: [PATCH 061/117] remotingfrontend: implement buffer_set_tensor with a guest shared page --- .../ggml-backend-buffer.cpp | 15 +++++++++++---- .../virtgpu-forward-buffer.cpp | 17 ++++++++++++++--- ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp | 1 + ggml/src/ggml-remotingfrontend/virtgpu-shm.h | 2 ++ 4 files changed, 28 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index 069886358f6f0..25f6e78436d8c 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -23,14 +23,21 @@ static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buf UNUSED(size); } - static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - BEING_IMPLEMENTED; + IMPLEMENTED_ONCE; struct virtgpu *gpu = BUFFER_TO_GPU(buffer); - +#if 0 INFO("%s: data=%p, offset=%lu, size=%lu\n", __func__, data, offset, size); - +#endif +#if 0 + void **addr = (void **)(uintptr_t)data; + for (int i = 0; i <= 10; i++) { + INFO("%s: %p | %llx", __func__, addr, *addr); + addr++; + } + INFO("\n"); +#endif apir_buffer_set_tensor(gpu, BUFFER_TO_HANDLE(buffer), tensor, data, offset, size); return; diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp index ad65804ab27b0..dc991f84c07cc 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp @@ -21,20 +21,29 @@ apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle) { return (void *) base; } + void apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, ggml_tensor *tensor, const void *data, size_t offset, size_t size) { struct vn_cs_encoder *encoder; struct vn_cs_decoder *decoder; - +#if 0 INFO("Calling (%p)->set_tensor(tensor=%p, data=%p, offset=%lu, size=%lu", buffer_handle, tensor, data, offset, size); - +#endif REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_SET_TENSOR); vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); vn_encode_ggml_tensor(encoder, tensor); - vn_encode_uintptr_t(encoder, (uintptr_t *) &data); + + struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, size); + if (!shmem) { + FATAL("Couldn't allocate the guest-host shared buffer :/"); + } + + memcpy(shmem->mmap_ptr, data, size); + 
vn_encode_virtgpu_shmem_res_id(encoder, shmem->res_id); + vn_encode_size_t(encoder, &offset); vn_encode_size_t(encoder, &size); @@ -42,5 +51,7 @@ apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, REMOTE_CALL_FINISH(gpu, encoder, decoder); + virtgpu_shmem_destroy(gpu, shmem->shmem); + return; } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp index bd1568add1752..d5e602c97be66 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp @@ -102,6 +102,7 @@ virtgpu_shmem_create(struct virtgpu *gpu, size_t size) shmem->base.mmap_ptr = ptr; shmem->base.refcount.count = 1; shmem->base.gem_handle = gem_handle; + shmem->base.shmem = shmem; return &shmem->base; } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-shm.h b/ggml/src/ggml-remotingfrontend/virtgpu-shm.h index 3bdc5ca700f1b..e5770b1916886 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-shm.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-shm.h @@ -25,6 +25,8 @@ struct vn_renderer_shmem { int64_t cache_timestamp; uint32_t gem_handle; + + struct virtgpu_shmem *shmem; }; struct vn_renderer_shmem *virtgpu_shmem_create(struct virtgpu *gpu, size_t size); From b24fbe79623cca592f26293f999b7e1a527afed3 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 19 May 2025 11:23:54 +0200 Subject: [PATCH 062/117] ggml: src: ggml-remotingbackend/backend-dispatched-buffer: implement buffer_set_tensor with the guest shared page --- .../backend-dispatched-buffer.cpp | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp index ff35a492cf100..c217cecbd2aa0 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp @@ -32,8 +32,8 @@ backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, // safe to remove the const qualifier here tensor = (ggml_tensor *) (uintptr_t) vn_decode_ggml_tensor_inplace(dec); - void *data; - vn_decode_uintptr_t(dec, (uintptr_t *) &data); + uint32_t shmem_res_id; + vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); size_t offset; vn_decode_size_t(dec, &offset); @@ -41,10 +41,26 @@ backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, size_t size; vn_decode_size_t(dec, &size); + void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id); + + if (!shmem_data) { + FATAL("Couldn't get the shmem addr from virgl :/"); + } + +#if 0 INFO("Calling (%p)->set_tensor(tensor=%p, data=%p, offset=%lu, size=%lu", - buffer, tensor, data, offset, size); + buffer, tensor, shmem_data, offset, size); +#endif +#if 0 + void **addr = (void **)(uintptr_t)shmem_data; + for (int i = 0; i <= 10; i++) { + INFO("%s: %p | %llx", __func__, addr, *addr); + addr++; + } + INFO("\n"); +#endif - //buffer->iface.set_tensor(buffer, tensor, data, offset, size); + buffer->iface.set_tensor(buffer, tensor, shmem_data, offset, size); return 0; } From 3a20164a1f163a58454fda5df2c8dee324613d42 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 19 May 2025 11:24:16 +0200 Subject: [PATCH 063/117] remotingfrontend: add more STOP_HERE calls --- ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp 
b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index 25f6e78436d8c..847a61297be8b 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -46,6 +46,8 @@ static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { NOT_IMPLEMENTED; + STOP_HERE; + UNUSED(buffer); UNUSED(tensor); UNUSED(data); @@ -57,6 +59,8 @@ static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { NOT_IMPLEMENTED; + STOP_HERE; + return true; UNUSED(buffer); @@ -76,6 +80,8 @@ static void ggml_remoting_buffer_memset(remoting_buffer& dst, size_t offset, uin static void ggml_remoting_buffer_memset_async(remoting_context& ctx, remoting_buffer& dst, size_t offset, uint32_t c, size_t size) { NOT_IMPLEMENTED; + STOP_HERE; + UNUSED(ctx); UNUSED(dst); UNUSED(c); @@ -88,12 +94,16 @@ static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uin UNUSED(value); NOT_IMPLEMENTED; + + STOP_HERE; } static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffer) { UNUSED(buffer); NOT_IMPLEMENTED; + + STOP_HERE; } const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { From c5608716989673f9d8cce82f2734917801cf2a78 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 19 May 2025 11:24:30 +0200 Subject: [PATCH 064/117] remotingfrontend: add IMPLEMENTED_ONCE --- ggml/src/ggml-remotingfrontend/ggml-remoting.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index dc184c300f24d..ecc1e98217378 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -32,9 +32,18 @@ #define STOP_HERE \ thks_bye() -#define IMPLEMENTED \ +#define IMPLEMENTED \ printf("INFO: ### reached implemented function %s\n", __func__) +#define IMPLEMENTED_ONCE \ + do { \ + static bool first = true; \ + if (first) { \ + printf("INFO: ### reached implemented function %s\n", __func__); \ + first = false; \ + } \ + } while(0) + #define RMT_LOG_DEBUG(msg) std::cerr << msg << std::endl struct ggml_backend_remoting_device_context { From 142924b3d173df4c473816610d40c2e44425d02a Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 19 May 2025 11:24:43 +0200 Subject: [PATCH 065/117] ggml: src: ggml-remotingfrontend/virtgpu-shm: reduce the verbosity --- ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp index d5e602c97be66..935b1028d2ab0 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp @@ -53,7 +53,7 @@ virtgpu_ioctl_map(struct virtgpu *gpu, uint32_t gem_handle, size_t size) .handle = gem_handle, .pad = 0, }; - printf("virtgpu_ioctl_map(%ld)\n", size); + if (virtgpu_ioctl(gpu, DRM_IOCTL_VIRTGPU_MAP, &args)) return NULL; @@ -61,7 +61,7 @@ virtgpu_ioctl_map(struct virtgpu *gpu, uint32_t gem_handle, size_t size) args.offset); if (ptr == MAP_FAILED) return NULL; - printf("virtgpu_ioctl_map(%ld) --> %p | %p\n", size, ptr, *(void 
**)ptr); + return ptr; } From 9913b7f34e075348e43ccf887d696fdca9da3046 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 19 May 2025 14:54:30 +0200 Subject: [PATCH 066/117] ggml: src: ggml-remotingfrontend/ggml-backend-reg: refactor to untight the reg and the device --- .../ggml-backend-reg.cpp | 40 ++++++++++++------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp index eeac6c59db670..8b5eb5bbb189b 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp @@ -40,24 +40,27 @@ static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) IMPLEMENTED; - struct virtgpu *gpu = apir_initialize(); - if (!gpu) { - WARNING("apir_initialize failed :/"); - return 0; - } - - return apir_device_get_count(gpu); + return ggml_backend_remoting_get_device_count(); } -static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) { - static std::vector devices; +static std::vector devices; +ggml_backend_dev_t ggml_backend_remoting_get_device(size_t device) { + GGML_ASSERT(device < devices.size()); + return devices[device]; +} + +static void ggml_backend_remoting_reg_init_devices(ggml_backend_reg_t reg) { IMPLEMENTED; + if (devices.size() > 0) { + INFO("%s: already initialized\n", __func__); + } + struct virtgpu *gpu = apir_initialize(); if (!gpu) { - WARNING("apir_initialize failed :/"); - return 0; + FATAL("apir_initialize failed :/"); + return; } static bool initialized = false; @@ -67,7 +70,7 @@ static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_ std::lock_guard lock(mutex); if (!initialized) { - for (size_t i = 0; i < ggml_backend_remoting_reg_get_device_count(reg); i++) { + for (int i = 0; i < ggml_backend_remoting_get_device_count(); i++) { ggml_backend_remoting_device_context *ctx = new ggml_backend_remoting_device_context; char desc[256] = "API Remoting device"; @@ -85,9 +88,14 @@ static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_ initialized = true; } } +} - GGML_ASSERT(device < devices.size()); - return devices[device]; +static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_t reg, size_t device) { + UNUSED(reg); + + IMPLEMENTED; + + return ggml_backend_remoting_get_device(device); } static const char *ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) { @@ -109,6 +117,7 @@ ggml_backend_reg_t ggml_backend_remoting_frontend_reg() { FATAL("apir_initialize failed :/"); return NULL; } + static ggml_backend_reg reg = { /* .api_version = */ GGML_BACKEND_API_VERSION, /* .iface = */ ggml_backend_remoting_reg_i, @@ -116,5 +125,8 @@ ggml_backend_reg_t ggml_backend_remoting_frontend_reg() { }; RMT_LOG_DEBUG("ggml_backend_remoting_frontend_reg() hello :wave:"); + + ggml_backend_remoting_reg_init_devices(®); + return ® } From ede86288480281adef9ea8fff8685d88e7398e93 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 19 May 2025 14:57:13 +0200 Subject: [PATCH 067/117] ggml: src: ggml-remotingfrontend/ggml-remoting: remove draft code --- ggml/src/ggml-remotingfrontend/ggml-remoting.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index ecc1e98217378..0d8912741ba0b 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ 
b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -61,8 +61,6 @@ struct ggml_backend_remoting_buffer_context { }; static inline apir_buffer_handle_t ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) { - -// return buffer?0:1; struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) buffer->context; return context->handle; From 1927cf0a4a4cd71cee7858a1c0d77b729d1a292c Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 19 May 2025 15:13:45 +0200 Subject: [PATCH 068/117] remotingfrontend: add host buffer memory allocation --- .../ggml-backend-host-buffer-type.cpp | 56 +++++++++++++++++-- .../src/ggml-remotingfrontend/ggml-remoting.h | 6 ++ 2 files changed, 57 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp index 847a1ec0500fc..faf051fcc8e3a 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp @@ -5,18 +5,64 @@ extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface; +static void +ggml_backend_remoting_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { + BEING_IMPLEMENTED; + + void *ptr = buffer->context; + + if (ptr == nullptr) { + return; + } + struct ggml_backend_remoting_device_context *device_ctx = GET_DEVICE_CONTEXT(); + + struct vn_renderer_shmem *shmem; + size_t index; + + for (size_t i = 0; i < device_ctx->shared_memory.size(); i++) { + const uint8_t* addr = (const uint8_t*) std::get<0>(device_ctx->shared_memory[i]) /* ptr */; + const uint8_t* endr = addr + std::get<1>(device_ctx->shared_memory[i]) /* size */; + if (ptr >= addr && ptr < endr) { + shmem = std::get<2>(device_ctx->shared_memory[i]) /* shmem */; + index = i; + break; + } + } + + if (shmem == nullptr) { + WARNING("failed to free host shared memory: memory not in map\n"); + return; + } + + virtgpu_shmem_destroy(device_ctx->gpu, shmem->shmem); + + device_ctx->shared_memory.erase(device_ctx->shared_memory.begin() + index); +} + static ggml_backend_buffer_t ggml_backend_remoting_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { BEING_IMPLEMENTED; struct virtgpu *gpu = BUFT_TO_GPU(buft); - UNUSED(gpu); - void *ctx = NULL; + struct ggml_backend_remoting_device_context *device_ctx = GET_DEVICE_CONTEXT(); - NOT_IMPLEMENTED; + size += 32; // Behave like the CPU buffer type (dixit ggml-vulkan) - STOP_HERE; - return ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, ctx, size); + struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, size); + + if (!shmem) { + FATAL("Couldn't allocate the guest-host shared host buffer :/"); + } + + void *ptr = shmem->mmap_ptr; + + device_ctx->shared_memory.push_back(std::make_tuple(ptr, size, shmem)); + + ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); + buffer->buft = buft; + buffer->iface.free_buffer = ggml_backend_remoting_host_buffer_free_buffer; + + return buffer; } static const char * diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index 0d8912741ba0b..8715e60209e8c 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -13,6 +13,9 @@ #define BUFFER_TO_HANDLE(name) \ ((struct ggml_backend_remoting_buffer_context *) (name)->context)->handle +#define GET_DEVICE_CONTEXT() \ + (struct 
ggml_backend_remoting_device_context *) ggml_backend_remoting_get_device(0)->context \ + #define NOT_IMPLEMENTED \ do { \ static bool first = true; \ @@ -51,6 +54,8 @@ struct ggml_backend_remoting_device_context { std::string name; std::string description; + std::vector> shared_memory; + struct virtgpu *gpu; }; @@ -71,6 +76,7 @@ extern const struct ggml_backend_device_i ggml_backend_remoting_device_interface extern const ggml_backend_buffer_type_i ggml_backend_remoting_host_buffer_type_interface; extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface; +ggml_backend_dev_t ggml_backend_remoting_get_device(size_t device); ggml_backend_buffer_type_t ggml_backend_remoting_host_buffer_type(); ggml_backend_t ggml_backend_remoting_device_init(ggml_backend_dev_t dev, const char * params); ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev); From d3541665a8739870f70f4781240ed03ad134d9ad Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 20 May 2025 09:38:31 +0200 Subject: [PATCH 069/117] remoting: add clear buffer and get_tensor --- .../backend-dispatched-buffer.cpp | 53 ++++++++++++++++++- .../ggml-remotingbackend/backend-dispatched.h | 6 +++ .../shared/apir_backend.h | 4 +- .../ggml-remotingbackend/shared/venus_cs.h | 14 +++++ .../shared/venus_cs_ggml.h | 40 ++++++++++---- .../ggml-backend-buffer.cpp | 43 ++++----------- .../ggml-backend-device.cpp | 29 ++++++---- .../ggml-backend-host-buffer-type.cpp | 12 ++--- .../ggml-backend-reg.cpp | 2 +- .../ggml-remotingfrontend/ggml-backend.cpp | 3 +- .../virtgpu-forward-buffer.cpp | 44 ++++++++++++++- .../ggml-remotingfrontend/virtgpu-forward.h | 4 ++ 12 files changed, 189 insertions(+), 65 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp index c217cecbd2aa0..8dfce029af40e 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp @@ -52,7 +52,7 @@ backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, buffer, tensor, shmem_data, offset, size); #endif #if 0 - void **addr = (void **)(uintptr_t)shmem_data; + void **addr = (void **)(uintptr_t) shmem_data; for (int i = 0; i <= 10; i++) { INFO("%s: %p | %llx", __func__, addr, *addr); addr++; @@ -64,3 +64,54 @@ backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, return 0; } + +uint32_t +backend_buffer_get_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(enc); + + ggml_backend_buffer_t buffer; + buffer = vn_decode_ggml_buffer(dec); + + ggml_tensor *tensor; + // safe to remove the const qualifier here + tensor = (ggml_tensor *) (uintptr_t) vn_decode_ggml_tensor_inplace(dec); + + uint32_t shmem_res_id; + vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); + + size_t offset; + vn_decode_size_t(dec, &offset); + + size_t size; + vn_decode_size_t(dec, &size); + + void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id); + if (!shmem_data) { + FATAL("Couldn't get the shmem addr from virgl :/"); + } + + INFO("GET_TENSOR"); + + UNUSED(buffer); + UNUSED(tensor); + buffer->iface.get_tensor(buffer, tensor, shmem_data, offset, size); + + return 0; +} + +uint32_t +backend_buffer_clear(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(enc); + + ggml_backend_buffer_t buffer; + buffer = 
vn_decode_ggml_buffer(dec); + + uint8_t value; + vn_decode_uint8_t(dec, &value); + + buffer->iface.clear(buffer, value); + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h index faa3dacfc2297..76f1bb8a647b8 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.h +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -38,6 +38,8 @@ uint32_t backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_c /* buffer */ uint32_t backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); uint32_t backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_get_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_clear(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); static inline const char *backend_dispatch_command_name(ApirBackendCommandType type) { @@ -62,6 +64,8 @@ static inline const char *backend_dispatch_command_name(ApirBackendCommandType t /* buffer */ case APIR_COMMAND_TYPE_BUFFER_GET_BASE: return "backend_buffer_get_base"; case APIR_COMMAND_TYPE_BUFFER_SET_TENSOR: return "backend_buffer_set_tensor"; + case APIR_COMMAND_TYPE_BUFFER_GET_TENSOR: return "backend_buffer_get_tensor"; + case APIR_COMMAND_TYPE_BUFFER_CLEAR: return "backend_buffer_clear"; default: return "unknown"; } @@ -88,4 +92,6 @@ static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATC /* buffer */ [APIR_COMMAND_TYPE_BUFFER_GET_BASE] = backend_buffer_get_base, [APIR_COMMAND_TYPE_BUFFER_SET_TENSOR] = backend_buffer_set_tensor, + [APIR_COMMAND_TYPE_BUFFER_GET_TENSOR] = backend_buffer_get_tensor, + [APIR_COMMAND_TYPE_BUFFER_CLEAR] = backend_buffer_clear, }; diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 96bbb59fda14c..f3eff8874ed90 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -50,9 +50,11 @@ typedef enum ApirBackendCommandType { /* buffer */ APIR_COMMAND_TYPE_BUFFER_GET_BASE = 13, APIR_COMMAND_TYPE_BUFFER_SET_TENSOR = 14, + APIR_COMMAND_TYPE_BUFFER_GET_TENSOR = 15, + APIR_COMMAND_TYPE_BUFFER_CLEAR = 16, // last command_type index + 1 - APIR_BACKEND_DISPATCH_TABLE_COUNT = 15, + APIR_BACKEND_DISPATCH_TABLE_COUNT = 17, } ApirBackendCommandType; diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h index c8149a5b58a29..82c4091fded09 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -116,6 +116,20 @@ vn_encode(struct vn_cs_encoder *enc, size_t size, const void *data, size_t data_ * typed encode/decode */ +/* uint8_t */ + +static inline void +vn_encode_uint8_t(struct vn_cs_encoder *enc, const uint8_t *val) +{ + vn_encode(enc, sizeof(int), val, sizeof(*val)); +} + +static inline void +vn_decode_uint8_t(struct vn_cs_decoder *dec, uint8_t *val) +{ + vn_decode(dec, sizeof(int), val, sizeof(*val)); +} + /* uint64_t */ static inline size_t diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h index 8a73537a45204..2aa87b62fb338 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h +++ 
b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h @@ -14,16 +14,17 @@ static inline void vn_encode_ggml_tensor(struct vn_cs_encoder *enc, const ggml_tensor *tensor) { size_t tensor_size = sizeof(*tensor); - if (tensor->view_src) { - FATAL("Cannot pass tensors with view_src"); - } if (tensor->extra) { FATAL("Cannot pass tensors with extra"); } if (tensor->src[0] && tensor->buffer) { - // not sure if the buffer needs to be updated inside the src tensors or not - FATAL("Cannot pass tensors with src and buffer"); + static int first = 1; + if (first) { + // not sure if the buffer needs to be updated inside the src tensors or not + WARNING("Cannot pass tensors with src and buffer"); + first = 0; + } } vn_cs_encoder_write(enc, tensor_size, tensor, tensor_size); @@ -37,9 +38,20 @@ vn_encode_ggml_tensor(struct vn_cs_encoder *enc, const ggml_tensor *tensor) { vn_encode_ggml_buffer_handle(enc, &buffer_handle); } + if (tensor->view_src) { + vn_cs_encoder_write(enc, tensor_size, tensor->view_src, tensor_size); + } + for (int i = 0; tensor->src[i]; i++) { - const ggml_tensor *src_tensor = tensor->src[i]; - vn_cs_encoder_write(enc, tensor_size, src_tensor, tensor_size); + const ggml_tensor *tensor_src = tensor->src[i]; + vn_cs_encoder_write(enc, tensor_size, tensor_src, tensor_size); + +#if 0 + if (tensor_src->buffer) { + apir_buffer_handle_t src_buffer_handle = ggml_buffer_to_apir_handle(tensor_src->buffer); + vn_encode_ggml_buffer_handle(enc, &src_buffer_handle); + } +#endif } } @@ -56,9 +68,19 @@ vn_decode_ggml_tensor_inplace(struct vn_cs_decoder *dec) { tensor->buffer = vn_decode_ggml_buffer(dec); } + if (tensor->view_src) { + ggml_tensor *tensor_view_src = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); + tensor->view_src = tensor_view_src; + } + for (int i = 0; tensor->src[i]; i++) { - ggml_tensor *src_tensor = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); - tensor->src[i] = src_tensor; // overwrite op->src[i] pointer with the actual location of the src tensor + ggml_tensor *tensor_src = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); + tensor->src[i] = tensor_src; // overwrite op->src[i] pointer with the actual location of the src tensor +#if 0 + if (tensor_src->buffer) { + tensor_src->buffer = vn_decode_ggml_buffer(dec); + } +#endif } return tensor; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index 847a61297be8b..ed2c749958279 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -44,15 +44,10 @@ static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer } static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { - NOT_IMPLEMENTED; - - STOP_HERE; + IMPLEMENTED_ONCE; + struct virtgpu *gpu = BUFFER_TO_GPU(buffer); - UNUSED(buffer); - UNUSED(tensor); - UNUSED(data); - UNUSED(offset); - UNUSED(size); + apir_buffer_get_tensor(gpu, BUFFER_TO_HANDLE(buffer), tensor, data, offset, size); } @@ -68,34 +63,14 @@ static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer UNUSED(dst); } -static void ggml_remoting_buffer_memset(remoting_buffer& dst, size_t offset, uint32_t c, size_t size) { - BEING_IMPLEMENTED; - - UNUSED(dst); - UNUSED(c); - UNUSED(size); - UNUSED(offset); -} - -static void 
ggml_remoting_buffer_memset_async(remoting_context& ctx, remoting_buffer& dst, size_t offset, uint32_t c, size_t size) { - NOT_IMPLEMENTED; - - STOP_HERE; - - UNUSED(ctx); - UNUSED(dst); - UNUSED(c); - UNUSED(size); - UNUSED(offset); -} - static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - UNUSED(buffer); - UNUSED(value); + IMPLEMENTED; - NOT_IMPLEMENTED; + struct virtgpu *gpu = BUFFER_TO_GPU(buffer); - STOP_HERE; + apir_buffer_clear(gpu, BUFFER_TO_HANDLE(buffer), value); + + return; } static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffer) { @@ -103,7 +78,7 @@ static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffe NOT_IMPLEMENTED; - STOP_HERE; + STOP_HERE; } const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index ef48bd6fae96e..ad8be7d0bc69e 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -5,7 +5,7 @@ static const char * ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) { - IMPLEMENTED; + IMPLEMENTED_ONCE; struct virtgpu *gpu = DEV_TO_GPU(dev); @@ -41,7 +41,7 @@ ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, s static bool ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { - //IMPLEMENTED; + IMPLEMENTED_ONCE; struct virtgpu *gpu = DEV_TO_GPU(dev); @@ -50,24 +50,33 @@ ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tens static bool ggml_backend_remoting_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + IMPLEMENTED_ONCE; + +#if 1 + bool supported = buft->device == dev; + if (!supported) { + //WARNING("%s: unsupported buffer type (%s). 
Double check.", __func__, buft->iface.get_name(buft)); + } + + return supported; +#else UNUSED(dev); UNUSED(buft); - NOT_IMPLEMENTED; - return true; +#endif } static bool ggml_backend_remoting_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { - const int min_batch_size = 32; + IMPLEMENTED_ONCE; - NOT_IMPLEMENTED; + UNUSED(dev); + UNUSED(op); - return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) || - (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID); + // related to supports_buft, need to confirm - UNUSED(dev); + return false; // same as ggml-metal } static void @@ -103,7 +112,7 @@ ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, struct ggml_backe ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { - IMPLEMENTED; + IMPLEMENTED_ONCE; struct virtgpu *gpu = DEV_TO_GPU(dev); diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp index faf051fcc8e3a..a355e9aebbbf9 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp @@ -41,7 +41,7 @@ ggml_backend_remoting_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { static ggml_backend_buffer_t ggml_backend_remoting_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - BEING_IMPLEMENTED; + IMPLEMENTED; struct virtgpu *gpu = BUFT_TO_GPU(buft); struct ggml_backend_remoting_device_context *device_ctx = GET_DEVICE_CONTEXT(); @@ -69,7 +69,7 @@ static const char * ggml_backend_remoting_host_buffer_type_get_name(ggml_backend_buffer_type_t buft) { UNUSED(buft); - IMPLEMENTED; + IMPLEMENTED_ONCE; return "GUEST host buffer"; } @@ -78,18 +78,16 @@ static size_t ggml_backend_remoting_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { UNUSED(buft); - NOT_IMPLEMENTED; - STOP_HERE; + IMPLEMENTED_ONCE; - return 4096; + return 64; // not 100% sure ... 
} static bool ggml_backend_remoting_host_buffer_type_is_host(ggml_backend_buffer_type_t buft) { UNUSED(buft); - IMPLEMENTED; - STOP_HERE; + IMPLEMENTED_ONCE; return true; } diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp index 8b5eb5bbb189b..ca98528fd7e08 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp @@ -54,7 +54,7 @@ static void ggml_backend_remoting_reg_init_devices(ggml_backend_reg_t reg) { IMPLEMENTED; if (devices.size() > 0) { - INFO("%s: already initialized\n", __func__); + INFO("%s: already initialized", __func__); } struct virtgpu *gpu = apir_initialize(); diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp index 4bd321b5fc5c9..97a0e53856df9 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp @@ -3,7 +3,7 @@ static const char * ggml_backend_remoting_get_name(ggml_backend_t backend) { UNUSED(backend); - NOT_IMPLEMENTED; + IMPLEMENTED; return "API Remoting backend"; } @@ -18,6 +18,7 @@ static ggml_status ggml_backend_remoting_graph_compute(ggml_backend_t backend, g UNUSED(backend); UNUSED(cgraph); + NEXT; NOT_IMPLEMENTED; return GGML_STATUS_SUCCESS; diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp index dc991f84c07cc..f6c2ccb33b9e9 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp @@ -21,7 +21,6 @@ apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle) { return (void *) base; } - void apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, ggml_tensor *tensor, const void *data, size_t offset, size_t size) { @@ -55,3 +54,46 @@ apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, return; } + +void +apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, + const ggml_tensor *tensor, void *data, size_t offset, size_t size) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_GET_TENSOR); + + vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); + vn_encode_ggml_tensor(encoder, tensor); + struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, size); + if (!shmem) { + FATAL("Couldn't allocate the guest-host shared buffer :/"); + } + vn_encode_virtgpu_shmem_res_id(encoder, shmem->res_id); + vn_encode_size_t(encoder, &offset); + vn_encode_size_t(encoder, &size); + + REMOTE_CALL(gpu, encoder, decoder); + + memcpy(data, shmem->mmap_ptr, size); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + virtgpu_shmem_destroy(gpu, shmem->shmem); +} + +void +apir_buffer_clear(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, + uint8_t value) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_CLEAR); + + vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); + vn_encode_uint8_t(encoder, &value); + + REMOTE_CALL(gpu, encoder, decoder); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index 2790adbb62454..6511850e666e9 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ 
b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -31,3 +31,7 @@ void *apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_handle_t buffer_hand enum ggml_status apir_buffer_init_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, ggml_tensor *tensor); void apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, ggml_tensor *tensor, const void *data, size_t offset, size_t size); +void apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, + const ggml_tensor *tensor, void *data, size_t offset, size_t size); +void apir_buffer_clear(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, + uint8_t value); From 49bb02063ebb89d20a0c03b876575b818493a005 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 20 May 2025 10:49:32 +0200 Subject: [PATCH 070/117] remoting: add skeleton for graph_compute method --- ggml/src/ggml-remotingbackend/CMakeLists.txt | 1 + .../backend-dispatched-backend.cpp | 21 ++++++++++++++++++ .../ggml-remotingbackend/backend-dispatched.h | 8 +++++++ .../ggml-remotingbackend/backend-internal.h | 1 + .../shared/apir_backend.h | 5 ++++- .../shared/venus_cs_ggml.h | 16 ++++++++++++++ ggml/src/ggml-remotingfrontend/CMakeLists.txt | 1 + .../ggml-backend-device.cpp | 3 --- .../ggml-remotingfrontend/ggml-backend.cpp | 7 +++--- .../src/ggml-remotingfrontend/ggml-remoting.h | 3 +++ .../virtgpu-forward-backend.cpp | 22 +++++++++++++++++++ .../ggml-remotingfrontend/virtgpu-forward.h | 4 ++++ 12 files changed, 85 insertions(+), 7 deletions(-) create mode 100644 ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp create mode 100644 ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp diff --git a/ggml/src/ggml-remotingbackend/CMakeLists.txt b/ggml/src/ggml-remotingbackend/CMakeLists.txt index feca344c90a64..3d7255faf237f 100644 --- a/ggml/src/ggml-remotingbackend/CMakeLists.txt +++ b/ggml/src/ggml-remotingbackend/CMakeLists.txt @@ -6,6 +6,7 @@ message(STATUS "Enable API Remoting backend") ggml_add_backend_library(ggml-remotingbackend backend.cpp backend-dispatched.cpp + backend-dispatched-backend.cpp backend-dispatched-device.cpp backend-dispatched-buffer.cpp backend-dispatched-buffer-type.cpp diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp new file mode 100644 index 0000000000000..9a6fb941469aa --- /dev/null +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp @@ -0,0 +1,21 @@ +#include +#include "backend-internal.h" +#include "backend-dispatched.h" + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" + +uint32_t +backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(enc); + + ggml_cgraph *cgraph = vn_decode_ggml_cgraph(dec); + + ggml_status status = bck->iface.graph_compute(bck, cgraph); + + vn_encode_ggml_status(enc, &status); + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h index 76f1bb8a647b8..5464f56baf152 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.h +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -41,6 +41,9 @@ uint32_t backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decod uint32_t backend_buffer_get_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); uint32_t backend_buffer_clear(struct vn_cs_encoder *enc, 
struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +/* backend */ +uint32_t backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); + static inline const char *backend_dispatch_command_name(ApirBackendCommandType type) { switch (type) { @@ -67,6 +70,8 @@ static inline const char *backend_dispatch_command_name(ApirBackendCommandType t case APIR_COMMAND_TYPE_BUFFER_GET_TENSOR: return "backend_buffer_get_tensor"; case APIR_COMMAND_TYPE_BUFFER_CLEAR: return "backend_buffer_clear"; + /* backend */ + case APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE: return "backend_graph_compute"; default: return "unknown"; } } @@ -94,4 +99,7 @@ static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATC [APIR_COMMAND_TYPE_BUFFER_SET_TENSOR] = backend_buffer_set_tensor, [APIR_COMMAND_TYPE_BUFFER_GET_TENSOR] = backend_buffer_get_tensor, [APIR_COMMAND_TYPE_BUFFER_CLEAR] = backend_buffer_clear, + + /* backend */ + [APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE] = backend_graph_compute, }; diff --git a/ggml/src/ggml-remotingbackend/backend-internal.h b/ggml/src/ggml-remotingbackend/backend-internal.h index 5c29e18d4596a..41bc42dbc0e36 100644 --- a/ggml/src/ggml-remotingbackend/backend-internal.h +++ b/ggml/src/ggml-remotingbackend/backend-internal.h @@ -8,6 +8,7 @@ extern ggml_backend_reg_t reg; extern ggml_backend_dev_t dev; +extern ggml_backend_t bck; #define NOT_IMPLEMENTED \ do { \ diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index f3eff8874ed90..2608909b7541e 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -53,8 +53,11 @@ typedef enum ApirBackendCommandType { APIR_COMMAND_TYPE_BUFFER_GET_TENSOR = 15, APIR_COMMAND_TYPE_BUFFER_CLEAR = 16, + /* backend */ + APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE = 17, + // last command_type index + 1 - APIR_BACKEND_DISPATCH_TABLE_COUNT = 17, + APIR_BACKEND_DISPATCH_TABLE_COUNT = 18, } ApirBackendCommandType; diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h index 2aa87b62fb338..c769d9d860a20 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h @@ -151,3 +151,19 @@ static inline void vn_decode_virtgpu_shmem_res_id(struct vn_cs_decoder *dec, uint32_t *shmem_res_id) { vn_decode_uint32_t(dec, shmem_res_id); } + +/* ggml_cgraph */ + +static inline void +vn_encode_ggml_cgraph(struct vn_cs_encoder *enc, ggml_cgraph *cgraph) { + UNUSED(enc); + UNUSED(cgraph); + +} + +static inline ggml_cgraph * +vn_decode_ggml_cgraph(struct vn_cs_decoder *dec) { + UNUSED(dec); + + return NULL; +} diff --git a/ggml/src/ggml-remotingfrontend/CMakeLists.txt b/ggml/src/ggml-remotingfrontend/CMakeLists.txt index b77a0254a7a6c..a13f48b7ef81b 100644 --- a/ggml/src/ggml-remotingfrontend/CMakeLists.txt +++ b/ggml/src/ggml-remotingfrontend/CMakeLists.txt @@ -16,6 +16,7 @@ ggml_add_backend_library(ggml-remotingfrontend virtgpu-forward-device.cpp virtgpu-forward-buffer-type.cpp virtgpu-forward-buffer.cpp + virtgpu-forward-backend.cpp virtgpu-forward-impl.h ../../include/ggml-remoting-frontend.h ) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index ad8be7d0bc69e..6d51643962d80 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ 
b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -1,8 +1,5 @@ #include "ggml-remoting.h" -#define DEV_TO_GPU(name) \ - ((struct ggml_backend_remoting_device_context *) (name)->context)->gpu - static const char * ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) { IMPLEMENTED_ONCE; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp index 97a0e53856df9..190fde76a30bc 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp @@ -18,10 +18,11 @@ static ggml_status ggml_backend_remoting_graph_compute(ggml_backend_t backend, g UNUSED(backend); UNUSED(cgraph); - NEXT; - NOT_IMPLEMENTED; + struct virtgpu *gpu = DEV_TO_GPU(backend->device); - return GGML_STATUS_SUCCESS; + BEING_IMPLEMENTED; + + return apir_backend_graph_compute(gpu, cgraph); } static ggml_backend_i ggml_backend_remoting_interface = { diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index 8715e60209e8c..d51afbe19dc78 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -10,6 +10,9 @@ #include "ggml-backend.h" #include "virtgpu.h" +#define DEV_TO_GPU(name) \ + ((struct ggml_backend_remoting_device_context *) (name)->context)->gpu + #define BUFFER_TO_HANDLE(name) \ ((struct ggml_backend_remoting_buffer_context *) (name)->context)->handle diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp new file mode 100644 index 0000000000000..4a8214265209c --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp @@ -0,0 +1,22 @@ +#include "virtgpu-forward-impl.h" + +ggml_status +apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph) { + ggml_status status; + UNUSED(cgraph); + + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE); + + vn_encode_ggml_cgraph(encoder, cgraph); + + REMOTE_CALL(gpu, encoder, decoder); + + vn_decode_ggml_status(decoder, &status); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return status; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index 6511850e666e9..d59cd754eb803 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -35,3 +35,7 @@ void apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_han const ggml_tensor *tensor, void *data, size_t offset, size_t size); void apir_buffer_clear(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, uint8_t value); + +/* backend */ + +ggml_status apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph); From 8edd5e6a9af276a20ec39d908a5b13147e2d394c Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 20 May 2025 11:55:16 +0200 Subject: [PATCH 071/117] remoting: continue the compute_graph skeleton --- .../backend-dispatched-backend.cpp | 14 ++++++- .../ggml-remotingbackend/shared/venus_cs.h | 25 ++++++++++++ .../shared/venus_cs_ggml.h | 40 +++++++++++++++++-- .../virtgpu-forward-backend.cpp | 14 ++++++- 4 files changed, 87 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp index 9a6fb941469aa..60f5708528d12 100644 --- 
a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp @@ -11,7 +11,19 @@ backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, stru UNUSED(ctx); UNUSED(enc); - ggml_cgraph *cgraph = vn_decode_ggml_cgraph(dec); + uint32_t shmem_res_id; + vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); + + const void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id); + if (!shmem_data) { + FATAL("Couldn't get the shmem addr from virgl :/"); + } + size_t shmem_size; + vn_decode_size_t(dec, &shmem_size); + + struct vn_cs_decoder secondary_dec = vn_cs_new_decoder((const char *) shmem_data, shmem_size); + + ggml_cgraph *cgraph = vn_decode_ggml_cgraph(dec, &secondary_dec); ggml_status status = bck->iface.graph_compute(bck, cgraph); diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h index 82c4091fded09..a780cb6e40fda 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -20,6 +20,31 @@ struct vn_cs_decoder { const char* end; }; +/* + * new encoder and decoder + */ + +static struct vn_cs_decoder +vn_cs_new_decoder(const char *ptr, size_t size) { + struct vn_cs_decoder dec = { + .cur = ptr, + .end = ptr + size, + }; + + return dec; +} + +static struct vn_cs_encoder +vn_cs_new_encoder(char *ptr, size_t size) { + struct vn_cs_encoder enc = { + .cur = ptr, + .start = ptr, + .end = ptr + size, + }; + + return enc; +} + /* * encode peek */ diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h index c769d9d860a20..3a72b906a2634 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h @@ -154,16 +154,48 @@ vn_decode_virtgpu_shmem_res_id(struct vn_cs_decoder *dec, uint32_t *shmem_res_id /* ggml_cgraph */ +static inline size_t +vn_encode_sizeof_ggml_cgraph(ggml_cgraph *cgraph) { + return sizeof(*cgraph); +} + static inline void -vn_encode_ggml_cgraph(struct vn_cs_encoder *enc, ggml_cgraph *cgraph) { +vn_encode_ggml_cgraph(struct vn_cs_encoder *enc, ggml_cgraph *cgraph, struct vn_cs_encoder *secondary_enc) { UNUSED(enc); UNUSED(cgraph); + if (cgraph->n_leafs) { + FATAL("Cannot pass cgraphs with leaves"); + } + if (cgraph->size) { + FATAL("Cannot pass cgraphs with size"); + } + if (cgraph->grads) { + FATAL("Cannot pass cgraphs with grads"); + } + if (cgraph->grad_accs) { + FATAL("Cannot pass cgraphs with grad_accs"); + } + if (cgraph->visited_hash_set.size || cgraph->visited_hash_set.used || cgraph->visited_hash_set.keys) { + FATAL("Cannot pass cgraphs with visited_hash_set"); + } + + if (!secondary_enc) { + return; + } + + size_t cgraph_size = sizeof(*cgraph); + vn_cs_encoder_write(enc, cgraph_size, cgraph, cgraph_size); } static inline ggml_cgraph * -vn_decode_ggml_cgraph(struct vn_cs_decoder *dec) { - UNUSED(dec); +vn_decode_ggml_cgraph(struct vn_cs_decoder *dec, struct vn_cs_decoder *secondary_dec) { + // it safe to remove the `const` qualifier here, we *do* want to + // modify the shared memory data to fix the `src` pointers. 
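  /*
   * Sketch of the intended flow, inferred from backend_graph_compute() in
   * backend-dispatched-backend.cpp earlier in this patch (an assumption, not a
   * separate API): the host resolves the shmem_res_id it received, wraps the
   * mapping in a decoder, and passes it here as `secondary_dec`:
   *
   *   const void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id);
   *   struct vn_cs_decoder secondary_dec =
   *       vn_cs_new_decoder((const char *) shmem_data, shmem_size);
   *
   * Everything "used in place" below therefore points into that guest-host
   * shared mapping and stays valid only while the mapping does.
   */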
+ ggml_cgraph *cgraph = (ggml_cgraph *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_cgraph)); - return NULL; + if (!secondary_dec) { + return NULL; + } + return cgraph; } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp index 4a8214265209c..1ce0ad0280c29 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp @@ -10,7 +10,17 @@ apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph) { REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE); - vn_encode_ggml_cgraph(encoder, cgraph); + size_t size = vn_encode_sizeof_ggml_cgraph(cgraph); + struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, size); + if (!shmem) { + FATAL("Couldn't allocate the guest-host shared buffer :/"); + } + vn_encode_size_t(encoder, &size); + + char *shmem_data = (char *) shmem->mmap_ptr; + struct vn_cs_encoder secondary_enc = vn_cs_new_encoder(shmem_data, size); + + vn_encode_ggml_cgraph(encoder, cgraph, &secondary_enc); REMOTE_CALL(gpu, encoder, decoder); @@ -18,5 +28,7 @@ apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph) { REMOTE_CALL_FINISH(gpu, encoder, decoder); + virtgpu_shmem_destroy(gpu, shmem->shmem); + return status; } From 372e6d06798e74132bc9d8d4752dd203a82da8f0 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 20 May 2025 15:11:17 +0200 Subject: [PATCH 072/117] Continue the skeleton --- .../backend-dispatched-backend.cpp | 9 ++- .../ggml-remotingbackend/shared/venus_cs.h | 1 + .../shared/venus_cs_ggml.h | 62 ++++++++++++++++--- .../virtgpu-forward-backend.cpp | 8 ++- 4 files changed, 66 insertions(+), 14 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp index 60f5708528d12..72c01c7fa2777 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp @@ -13,19 +13,22 @@ backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, stru uint32_t shmem_res_id; vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); - + INFO("Receive shmem id %d", shmem_res_id); const void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id); if (!shmem_data) { FATAL("Couldn't get the shmem addr from virgl :/"); } size_t shmem_size; vn_decode_size_t(dec, &shmem_size); - + INFO("Receive shmem size %ld", shmem_size); struct vn_cs_decoder secondary_dec = vn_cs_new_decoder((const char *) shmem_data, shmem_size); ggml_cgraph *cgraph = vn_decode_ggml_cgraph(dec, &secondary_dec); - ggml_status status = bck->iface.graph_compute(bck, cgraph); + ggml_status status = GGML_STATUS_SUCCESS; + status = bck->iface.graph_compute(bck, cgraph); + + INFO("Send status %u", status); vn_encode_ggml_status(enc, &status); diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h index a780cb6e40fda..510cbd6fcce66 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -90,6 +90,7 @@ vn_cs_decoder_use_inplace(struct vn_cs_decoder *dec, return addr; } + /* * read/write */ diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h index 3a72b906a2634..e254228017531 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h +++ 
b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h @@ -86,6 +86,23 @@ vn_decode_ggml_tensor_inplace(struct vn_cs_decoder *dec) { return tensor; } +/* tensor array */ + +static inline void +vn_encode_ggml_tensor_array(struct vn_cs_encoder *enc, ggml_tensor **addr, size_t count) +{ + size_t buffer_size = sizeof(*addr) * count; + vn_encode(enc, buffer_size, addr, buffer_size); +} + +static inline ggml_tensor ** +vn_decode_ggml_tensor_array_inplace(struct vn_cs_decoder *dec, size_t count) +{ + size_t buffer_size = sizeof(ggml_tensor*) * count; + + return (ggml_tensor **)(uintptr_t) vn_cs_decoder_use_inplace(dec, buffer_size); +} + /* *** ggml_backend_buffer_type_t *** */ // ggml_backend_buffer_type_t is a POINTER (to a struct). @@ -132,7 +149,7 @@ vn_decode_ggml_buffer(struct vn_cs_decoder *dec) { static inline void vn_encode_ggml_status(struct vn_cs_encoder *enc, const enum ggml_status *status) { - vn_cs_encoder_write(enc, sizeof(*status), &status, sizeof(*status)); + vn_cs_encoder_write(enc, sizeof(*status), status, sizeof(*status)); } static inline void @@ -155,8 +172,29 @@ vn_decode_virtgpu_shmem_res_id(struct vn_cs_decoder *dec, uint32_t *shmem_res_id /* ggml_cgraph */ static inline size_t -vn_encode_sizeof_ggml_cgraph(ggml_cgraph *cgraph) { - return sizeof(*cgraph); +vn_encode_sizeof_ggml_cgraph_data(ggml_cgraph *cgraph) { + /* must match the encoding of vn_encode_ggml_cgraph and vn_encode_ggml_tensor */ + size_t size = 0; + + size += sizeof(ggml_tensor*) * cgraph->n_nodes; + + size_t tensor_size = sizeof(ggml_tensor); + INFO("tensor_size: %lu", tensor_size); + size += tensor_size * cgraph->n_nodes; + + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor *tensor = cgraph->nodes[i]; + if (tensor->buffer) { + size += sizeof(apir_buffer_handle_t); + } + if (tensor->view_src) { + size += tensor_size; + } + for (int j = 0; tensor->src[j]; j++) { + size += tensor_size; + } + } + return size; } static inline void @@ -180,12 +218,15 @@ vn_encode_ggml_cgraph(struct vn_cs_encoder *enc, ggml_cgraph *cgraph, struct vn_ FATAL("Cannot pass cgraphs with visited_hash_set"); } - if (!secondary_enc) { - return; - } - size_t cgraph_size = sizeof(*cgraph); vn_cs_encoder_write(enc, cgraph_size, cgraph, cgraph_size); + + vn_encode_ggml_tensor_array(secondary_enc, cgraph->nodes, cgraph->n_nodes); + + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor *tensor = cgraph->nodes[i]; + vn_encode_ggml_tensor(secondary_enc, tensor); + } } static inline ggml_cgraph * @@ -194,8 +235,11 @@ vn_decode_ggml_cgraph(struct vn_cs_decoder *dec, struct vn_cs_decoder *secondary // modify the shared memory data to fix the `src` pointers. 
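  /*
   * Layout note, inferred from vn_encode_ggml_cgraph() above (a sketch, not a
   * normative wire format): the ggml_cgraph struct itself travels on the
   * primary stream, while the secondary (shmem) stream holds, back to back,
   *
   *   ggml_tensor *nodes[n_nodes];                    // the node pointer array
   *   <node 0 as written by vn_encode_ggml_tensor()>  // struct + buffer handle
   *   <node 1 ...>                                    // + view_src/src copies
   *
   * which is why the decoder below first re-points cgraph->nodes at the
   * in-place array, then overwrites each entry with the address of the
   * corresponding tensor copy decoded from the same stream.
   */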
ggml_cgraph *cgraph = (ggml_cgraph *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_cgraph)); - if (!secondary_dec) { - return NULL; + cgraph->nodes = vn_decode_ggml_tensor_array_inplace(secondary_dec, cgraph->n_nodes); + + for (int i = 0; i < cgraph->n_nodes; i++) { + cgraph->nodes[i] = (ggml_tensor *)(uintptr_t) vn_decode_ggml_tensor_inplace(secondary_dec); } + return cgraph; } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp index 1ce0ad0280c29..8d18c18f8e2b5 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp @@ -2,7 +2,6 @@ ggml_status apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph) { - ggml_status status; UNUSED(cgraph); struct vn_cs_encoder *encoder; @@ -10,11 +9,14 @@ apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph) { REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE); - size_t size = vn_encode_sizeof_ggml_cgraph(cgraph); + size_t size = vn_encode_sizeof_ggml_cgraph_data(cgraph); struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, size); if (!shmem) { FATAL("Couldn't allocate the guest-host shared buffer :/"); } + INFO("Send shmem ID %d", shmem->res_id); + vn_encode_virtgpu_shmem_res_id(encoder, shmem->res_id); + INFO("Send shmem size %lu", size); vn_encode_size_t(encoder, &size); char *shmem_data = (char *) shmem->mmap_ptr; @@ -24,7 +26,9 @@ apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph) { REMOTE_CALL(gpu, encoder, decoder); + ggml_status status = GGML_STATUS_ABORTED; vn_decode_ggml_status(decoder, &status); + INFO("Received status %u", status); REMOTE_CALL_FINISH(gpu, encoder, decoder); From 1a82665297f8d7760f74ec96f3c05292acf04166 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 20 May 2025 15:44:40 +0200 Subject: [PATCH 073/117] remoting: recursively encode/decode the tensors --- .../backend-dispatched-buffer.cpp | 4 +- .../backend-dispatched-device.cpp | 2 +- .../shared/apir_backend.h | 5 + .../ggml-remotingbackend/shared/venus_cs.h | 7 +- .../shared/venus_cs_ggml.h | 107 ++++++++---------- .../src/ggml-remotingfrontend/ggml-remoting.h | 3 + .../virtgpu-forward-buffer.cpp | 4 +- .../virtgpu-forward-device.cpp | 3 +- 8 files changed, 69 insertions(+), 66 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp index 8dfce029af40e..d181937f55256 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp @@ -30,7 +30,7 @@ backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, ggml_tensor *tensor; // safe to remove the const qualifier here - tensor = (ggml_tensor *) (uintptr_t) vn_decode_ggml_tensor_inplace(dec); + tensor = (ggml_tensor *) (uintptr_t) vn_decode_ggml_tensor_inplace(dec, TENSOR_MAX_DEPTH_BUFFER_SET_TENSOR); uint32_t shmem_res_id; vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); @@ -75,7 +75,7 @@ backend_buffer_get_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, ggml_tensor *tensor; // safe to remove the const qualifier here - tensor = (ggml_tensor *) (uintptr_t) vn_decode_ggml_tensor_inplace(dec); + tensor = (ggml_tensor *) (uintptr_t) vn_decode_ggml_tensor_inplace(dec, TENSOR_MAX_DEPTH_BUFFER_GET_TENSOR); uint32_t shmem_res_id; vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); diff 
--git a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp index ba2ec479a95c0..72d159bb676b9 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp @@ -72,7 +72,7 @@ backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, uint32_t backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { UNUSED(ctx); - const ggml_tensor *op = vn_decode_ggml_tensor_inplace(dec); + const ggml_tensor *op = vn_decode_ggml_tensor_inplace(dec, TENSOR_MAX_DEPTH_DEVICE_SUPPORTS_OP); bool supports_op = dev->iface.supports_op(dev, op); diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 2608909b7541e..14b0c21240547 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -70,3 +70,8 @@ struct virgl_apir_context { struct virgl_apir_callbacks iface; }; + +#define TENSOR_MAX_DEPTH_DEVICE_SUPPORTS_OP 2 +#define TENSOR_MAX_DEPTH_BUFFER_GET_TENSOR 2 +#define TENSOR_MAX_DEPTH_BUFFER_SET_TENSOR 2 +#define TENSOR_MAX_DEPTH_CGRAPH_DATA 10 diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h index 510cbd6fcce66..fdfb498576347 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -105,7 +105,7 @@ vn_cs_decoder_read(struct vn_cs_decoder *dec, dec->cur += size; } -static inline void +static inline char * vn_cs_encoder_write(struct vn_cs_encoder *enc, size_t size, const void *val, @@ -114,9 +114,12 @@ vn_cs_encoder_write(struct vn_cs_encoder *enc, assert(val_size <= size); assert(size <= ((size_t) (enc->end - enc->cur))); + char *write_addr = enc->cur; /* we should not rely on the compiler to optimize away memcpy... 
*/ - memcpy(enc->cur, val, val_size); + memcpy(write_addr, val, val_size); enc->cur += size; + + return write_addr; } /* diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h index e254228017531..e4350971c76fa 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h @@ -10,24 +10,37 @@ vn_encode_ggml_buffer_handle(struct vn_cs_encoder *enc, const apir_buffer_handle static inline ggml_backend_buffer_t vn_decode_ggml_buffer(struct vn_cs_decoder *dec); +/* ggml_tensor */ + +static inline size_t +vn_encode_sizeof_ggml_tensor(const ggml_tensor *tensor, int depth_to_go) { + /* must match the encoding vn_encode_ggml_tensor */ + size_t size = 0; + size_t tensor_size = sizeof(ggml_tensor); + + size += tensor_size; // the main tensor + + if (depth_to_go != 0) { + if (tensor->view_src) { + size += vn_encode_sizeof_ggml_tensor(tensor->view_src, depth_to_go-1); + } + + for (int i = 0; tensor->src[i]; i++) { + size += vn_encode_sizeof_ggml_tensor(tensor->src[i], depth_to_go-1); + } + } + return size; +} + static inline void -vn_encode_ggml_tensor(struct vn_cs_encoder *enc, const ggml_tensor *tensor) { +vn_encode_ggml_tensor(struct vn_cs_encoder *enc, const ggml_tensor *tensor, int depth_to_go) { size_t tensor_size = sizeof(*tensor); if (tensor->extra) { FATAL("Cannot pass tensors with extra"); } - if (tensor->src[0] && tensor->buffer) { - static int first = 1; - if (first) { - // not sure if the buffer needs to be updated inside the src tensors or not - WARNING("Cannot pass tensors with src and buffer"); - first = 0; - } - } - - vn_cs_encoder_write(enc, tensor_size, tensor, tensor_size); + ggml_tensor *cs_tensor = (ggml_tensor *) vn_cs_encoder_write(enc, tensor_size, tensor, tensor_size); // tensor->data is a pointer inside the device buffer. No need to touch it // tensor->buffer is a pointer to a buffer. Encoding the buffer handle in sequence. @@ -35,52 +48,40 @@ vn_encode_ggml_tensor(struct vn_cs_encoder *enc, const ggml_tensor *tensor) { if (tensor->buffer) { apir_buffer_handle_t buffer_handle = ggml_buffer_to_apir_handle(tensor->buffer); - vn_encode_ggml_buffer_handle(enc, &buffer_handle); + cs_tensor->buffer = (ggml_backend_buffer *) buffer_handle; } - if (tensor->view_src) { - vn_cs_encoder_write(enc, tensor_size, tensor->view_src, tensor_size); - } - - for (int i = 0; tensor->src[i]; i++) { - const ggml_tensor *tensor_src = tensor->src[i]; - vn_cs_encoder_write(enc, tensor_size, tensor_src, tensor_size); + if (depth_to_go != 0) { + if (tensor->view_src) { + vn_encode_ggml_tensor(enc, tensor->view_src, depth_to_go-1); + } -#if 0 - if (tensor_src->buffer) { - apir_buffer_handle_t src_buffer_handle = ggml_buffer_to_apir_handle(tensor_src->buffer); - vn_encode_ggml_buffer_handle(enc, &src_buffer_handle); + for (int i = 0; tensor->src[i]; i++) { + vn_encode_ggml_tensor(enc, tensor->src[i], depth_to_go-1); } -#endif } } -static inline const ggml_tensor * -vn_decode_ggml_tensor_inplace(struct vn_cs_decoder *dec) { +static inline ggml_tensor * +vn_decode_ggml_tensor_inplace(struct vn_cs_decoder *dec, int depth_to_go) { // it safe to remove the `const` qualifier here, we *do* want to // modify the shared memory data to fix the `src` pointers. ggml_tensor *tensor = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); // tensor->data is a pointer inside the device buffer. No need to touch it - // tensor->buffer is a pointer to a buffer. 
Decode the buffer handle encoded in sequence. - if (tensor->buffer) { - tensor->buffer = vn_decode_ggml_buffer(dec); - } + // tensor->buffer has already been updated to the correct pointer - if (tensor->view_src) { - ggml_tensor *tensor_view_src = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); - tensor->view_src = tensor_view_src; - } + if (depth_to_go != 0) { + if (tensor->view_src) { + ggml_tensor *tensor_view_src = vn_decode_ggml_tensor_inplace(dec, depth_to_go-1); + tensor->view_src = tensor_view_src; + } - for (int i = 0; tensor->src[i]; i++) { - ggml_tensor *tensor_src = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); - tensor->src[i] = tensor_src; // overwrite op->src[i] pointer with the actual location of the src tensor -#if 0 - if (tensor_src->buffer) { - tensor_src->buffer = vn_decode_ggml_buffer(dec); + for (int i = 0; tensor->src[i]; i++) { + ggml_tensor *tensor_src_i = vn_decode_ggml_tensor_inplace(dec, depth_to_go-1); + tensor->src[i] = tensor_src_i; } -#endif } return tensor; @@ -176,24 +177,16 @@ vn_encode_sizeof_ggml_cgraph_data(ggml_cgraph *cgraph) { /* must match the encoding of vn_encode_ggml_cgraph and vn_encode_ggml_tensor */ size_t size = 0; - size += sizeof(ggml_tensor*) * cgraph->n_nodes; + // don't include the `ggml_cgraph`, only it's data - size_t tensor_size = sizeof(ggml_tensor); - INFO("tensor_size: %lu", tensor_size); - size += tensor_size * cgraph->n_nodes; + // include the array of tensors + size += sizeof(ggml_tensor*) * cgraph->n_nodes; + // include the size of all the tensors for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor *tensor = cgraph->nodes[i]; - if (tensor->buffer) { - size += sizeof(apir_buffer_handle_t); - } - if (tensor->view_src) { - size += tensor_size; - } - for (int j = 0; tensor->src[j]; j++) { - size += tensor_size; - } + size += vn_encode_sizeof_ggml_tensor(cgraph->nodes[i], TENSOR_MAX_DEPTH_CGRAPH_DATA); } + return size; } @@ -225,7 +218,7 @@ vn_encode_ggml_cgraph(struct vn_cs_encoder *enc, ggml_cgraph *cgraph, struct vn_ for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor *tensor = cgraph->nodes[i]; - vn_encode_ggml_tensor(secondary_enc, tensor); + vn_encode_ggml_tensor(secondary_enc, tensor, TENSOR_MAX_DEPTH_CGRAPH_DATA); } } @@ -238,7 +231,7 @@ vn_decode_ggml_cgraph(struct vn_cs_decoder *dec, struct vn_cs_decoder *secondary cgraph->nodes = vn_decode_ggml_tensor_array_inplace(secondary_dec, cgraph->n_nodes); for (int i = 0; i < cgraph->n_nodes; i++) { - cgraph->nodes[i] = (ggml_tensor *)(uintptr_t) vn_decode_ggml_tensor_inplace(secondary_dec); + cgraph->nodes[i] = (ggml_tensor *)(uintptr_t) vn_decode_ggml_tensor_inplace(secondary_dec, TENSOR_MAX_DEPTH_CGRAPH_DATA); } return cgraph; diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index d51afbe19dc78..6a8bf2ea75713 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -71,6 +71,9 @@ struct ggml_backend_remoting_buffer_context { static inline apir_buffer_handle_t ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) { struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) buffer->context; + if (!context) { + return 0; + } return context->handle; } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp index f6c2ccb33b9e9..cfe89a19b761e 100644 --- 
a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp @@ -33,7 +33,7 @@ apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_SET_TENSOR); vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); - vn_encode_ggml_tensor(encoder, tensor); + vn_encode_ggml_tensor(encoder, tensor, TENSOR_MAX_DEPTH_BUFFER_SET_TENSOR); struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, size); if (!shmem) { @@ -64,7 +64,7 @@ apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_GET_TENSOR); vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); - vn_encode_ggml_tensor(encoder, tensor); + vn_encode_ggml_tensor(encoder, tensor, TENSOR_MAX_DEPTH_BUFFER_GET_TENSOR); struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, size); if (!shmem) { FATAL("Couldn't allocate the guest-host shared buffer :/"); diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp index 7c241d71a1679..4c7b1da767068 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp @@ -140,8 +140,7 @@ apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op) { REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP); - vn_encode_ggml_tensor(encoder, op); - + vn_encode_ggml_tensor(encoder, op, TENSOR_MAX_DEPTH_DEVICE_SUPPORTS_OP); REMOTE_CALL(gpu, encoder, decoder); From 6ce806bc13b9fe875b9f199a18679d56d30a229e Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 20 May 2025 17:57:42 +0200 Subject: [PATCH 074/117] keep working --- ggml/src/ggml-remotingbackend/CMakeLists.txt | 1 + .../backend-dispatched-backend.cpp | 12 +- .../backend-dispatched-buffer.cpp | 9 +- .../backend-dispatched-device.cpp | 3 +- .../shared/venus_cs_ggml.h | 146 ++++-------------- ggml/src/ggml-remotingfrontend/CMakeLists.txt | 1 + .../ggml-remotingfrontend/ggml-backend.cpp | 4 +- .../virtgpu-forward-buffer.cpp | 6 +- .../virtgpu-forward-device.cpp | 14 +- 9 files changed, 64 insertions(+), 132 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/CMakeLists.txt b/ggml/src/ggml-remotingbackend/CMakeLists.txt index 3d7255faf237f..f9a63ef60a445 100644 --- a/ggml/src/ggml-remotingbackend/CMakeLists.txt +++ b/ggml/src/ggml-remotingbackend/CMakeLists.txt @@ -14,6 +14,7 @@ ggml_add_backend_library(ggml-remotingbackend shared/api_remoting.h shared/apir_backend.h shared/venus_cs.h + shared/venus_cs_ggml-rpc.cpp ) target_compile_options(ggml-remotingbackend PRIVATE -std=c++20) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp index 72c01c7fa2777..61619f4c94f6b 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp @@ -13,22 +13,24 @@ backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, stru uint32_t shmem_res_id; vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); - INFO("Receive shmem id %d", shmem_res_id); + const void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id); if (!shmem_data) { FATAL("Couldn't get the shmem addr from virgl :/"); } size_t shmem_size; vn_decode_size_t(dec, &shmem_size); - INFO("Receive shmem size %ld", shmem_size); + 
struct vn_cs_decoder secondary_dec = vn_cs_new_decoder((const char *) shmem_data, shmem_size); ggml_cgraph *cgraph = vn_decode_ggml_cgraph(dec, &secondary_dec); ggml_status status = GGML_STATUS_SUCCESS; - status = bck->iface.graph_compute(bck, cgraph); - - INFO("Send status %u", status); + if (false) { + status = bck->iface.graph_compute(bck, cgraph); + } else { + WARNING("SKIPPING backend->graph_compute()"); + } vn_encode_ggml_status(enc, &status); diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp index d181937f55256..70d86677d15b1 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp @@ -30,7 +30,7 @@ backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, ggml_tensor *tensor; // safe to remove the const qualifier here - tensor = (ggml_tensor *) (uintptr_t) vn_decode_ggml_tensor_inplace(dec, TENSOR_MAX_DEPTH_BUFFER_SET_TENSOR); + tensor = (ggml_tensor *) (uintptr_t) vn_decode_ggml_tensor(dec); uint32_t shmem_res_id; vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); @@ -73,9 +73,10 @@ backend_buffer_get_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, ggml_backend_buffer_t buffer; buffer = vn_decode_ggml_buffer(dec); - ggml_tensor *tensor; + + const ggml_tensor *tensor; // safe to remove the const qualifier here - tensor = (ggml_tensor *) (uintptr_t) vn_decode_ggml_tensor_inplace(dec, TENSOR_MAX_DEPTH_BUFFER_GET_TENSOR); + tensor = vn_decode_ggml_tensor(dec); uint32_t shmem_res_id; vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); @@ -91,8 +92,6 @@ backend_buffer_get_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, FATAL("Couldn't get the shmem addr from virgl :/"); } - INFO("GET_TENSOR"); - UNUSED(buffer); UNUSED(tensor); buffer->iface.get_tensor(buffer, tensor, shmem_data, offset, size); diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp index 72d159bb676b9..863c2698779e7 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp @@ -72,7 +72,8 @@ backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, uint32_t backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { UNUSED(ctx); - const ggml_tensor *op = vn_decode_ggml_tensor_inplace(dec, TENSOR_MAX_DEPTH_DEVICE_SUPPORTS_OP); + + const ggml_tensor *op = vn_decode_ggml_tensor(dec); bool supports_op = dev->iface.supports_op(dev, op); diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h index e4350971c76fa..61f3a810ebc01 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h @@ -1,6 +1,8 @@ // needs the ggml-backend-impl.h definition // needs venus_cs.h definition +#include "venus_cs_ggml-rpc.h" + // needs // ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer); @@ -10,98 +12,43 @@ vn_encode_ggml_buffer_handle(struct vn_cs_encoder *enc, const apir_buffer_handle static inline ggml_backend_buffer_t vn_decode_ggml_buffer(struct vn_cs_decoder *dec); -/* ggml_tensor */ - -static inline size_t -vn_encode_sizeof_ggml_tensor(const ggml_tensor *tensor, int depth_to_go) { - /* must match the encoding vn_encode_ggml_tensor */ - size_t size = 0; - 
size_t tensor_size = sizeof(ggml_tensor); - - size += tensor_size; // the main tensor - - if (depth_to_go != 0) { - if (tensor->view_src) { - size += vn_encode_sizeof_ggml_tensor(tensor->view_src, depth_to_go-1); - } - - for (int i = 0; tensor->src[i]; i++) { - size += vn_encode_sizeof_ggml_tensor(tensor->src[i], depth_to_go-1); - } - } - return size; -} +/* rpc_tensor */ static inline void -vn_encode_ggml_tensor(struct vn_cs_encoder *enc, const ggml_tensor *tensor, int depth_to_go) { - size_t tensor_size = sizeof(*tensor); - - if (tensor->extra) { - FATAL("Cannot pass tensors with extra"); - } - - ggml_tensor *cs_tensor = (ggml_tensor *) vn_cs_encoder_write(enc, tensor_size, tensor, tensor_size); - - // tensor->data is a pointer inside the device buffer. No need to touch it - // tensor->buffer is a pointer to a buffer. Encoding the buffer handle in sequence. - // (could also make a copy of the tensor, and update locally.) - - if (tensor->buffer) { - apir_buffer_handle_t buffer_handle = ggml_buffer_to_apir_handle(tensor->buffer); - cs_tensor->buffer = (ggml_backend_buffer *) buffer_handle; - } - - if (depth_to_go != 0) { - if (tensor->view_src) { - vn_encode_ggml_tensor(enc, tensor->view_src, depth_to_go-1); - } - - for (int i = 0; tensor->src[i]; i++) { - vn_encode_ggml_tensor(enc, tensor->src[i], depth_to_go-1); - } - } +vn_encode_rcp_tensor(struct vn_cs_encoder *enc, const rpc_tensor *rpc_tensor) { + size_t rpc_tensor_size = sizeof(*rpc_tensor); + vn_encode(enc, rpc_tensor_size, rpc_tensor, rpc_tensor_size); } -static inline ggml_tensor * -vn_decode_ggml_tensor_inplace(struct vn_cs_decoder *dec, int depth_to_go) { - - // it safe to remove the `const` qualifier here, we *do* want to - // modify the shared memory data to fix the `src` pointers. - ggml_tensor *tensor = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); - - // tensor->data is a pointer inside the device buffer. 
No need to touch it - // tensor->buffer has already been updated to the correct pointer +static inline rpc_tensor * +vn_decode_rpc_tensor_inplace(struct vn_cs_decoder *dec) { + size_t rpc_tensor_size = sizeof(rpc_tensor); - if (depth_to_go != 0) { - if (tensor->view_src) { - ggml_tensor *tensor_view_src = vn_decode_ggml_tensor_inplace(dec, depth_to_go-1); - tensor->view_src = tensor_view_src; - } - - for (int i = 0; tensor->src[i]; i++) { - ggml_tensor *tensor_src_i = vn_decode_ggml_tensor_inplace(dec, depth_to_go-1); - tensor->src[i] = tensor_src_i; - } - } - - return tensor; + return (rpc_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, rpc_tensor_size); } -/* tensor array */ +/* ggml_tensor */ static inline void -vn_encode_ggml_tensor_array(struct vn_cs_encoder *enc, ggml_tensor **addr, size_t count) -{ - size_t buffer_size = sizeof(*addr) * count; - vn_encode(enc, buffer_size, addr, buffer_size); +vn_encode_ggml_tensor(struct vn_cs_encoder *enc, const ggml_tensor *tensor) { + rpc_tensor serialized = serialize_tensor(tensor); + + vn_encode_rcp_tensor(enc, &serialized); } -static inline ggml_tensor ** -vn_decode_ggml_tensor_array_inplace(struct vn_cs_decoder *dec, size_t count) -{ - size_t buffer_size = sizeof(ggml_tensor*) * count; +static inline const ggml_tensor * +vn_decode_ggml_tensor(struct vn_cs_decoder *dec) { + const rpc_tensor *rpc_tensor = vn_decode_rpc_tensor_inplace(dec); + struct ggml_init_params params { + /*.mem_size =*/ ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + struct ggml_context * ctx = ggml_init(params); - return (ggml_tensor **)(uintptr_t) vn_cs_decoder_use_inplace(dec, buffer_size); + const ggml_tensor *tensor = deserialize_tensor(ctx, rpc_tensor); + + return tensor; } /* *** ggml_backend_buffer_type_t *** */ @@ -186,7 +133,7 @@ vn_encode_sizeof_ggml_cgraph_data(ggml_cgraph *cgraph) { for (int i = 0; i < cgraph->n_nodes; i++) { size += vn_encode_sizeof_ggml_tensor(cgraph->nodes[i], TENSOR_MAX_DEPTH_CGRAPH_DATA); } - + INFO("SIZEOF(cgraph) --> %lu", size); return size; } @@ -194,45 +141,16 @@ static inline void vn_encode_ggml_cgraph(struct vn_cs_encoder *enc, ggml_cgraph *cgraph, struct vn_cs_encoder *secondary_enc) { UNUSED(enc); UNUSED(cgraph); - - if (cgraph->n_leafs) { - FATAL("Cannot pass cgraphs with leaves"); - } - if (cgraph->size) { - FATAL("Cannot pass cgraphs with size"); - } - if (cgraph->grads) { - FATAL("Cannot pass cgraphs with grads"); - } - if (cgraph->grad_accs) { - FATAL("Cannot pass cgraphs with grad_accs"); - } - if (cgraph->visited_hash_set.size || cgraph->visited_hash_set.used || cgraph->visited_hash_set.keys) { - FATAL("Cannot pass cgraphs with visited_hash_set"); - } - - size_t cgraph_size = sizeof(*cgraph); - vn_cs_encoder_write(enc, cgraph_size, cgraph, cgraph_size); - - vn_encode_ggml_tensor_array(secondary_enc, cgraph->nodes, cgraph->n_nodes); - - for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor *tensor = cgraph->nodes[i]; - vn_encode_ggml_tensor(secondary_enc, tensor, TENSOR_MAX_DEPTH_CGRAPH_DATA); - } + UNUSED(secondary_enc); } static inline ggml_cgraph * vn_decode_ggml_cgraph(struct vn_cs_decoder *dec, struct vn_cs_decoder *secondary_dec) { // it safe to remove the `const` qualifier here, we *do* want to // modify the shared memory data to fix the `src` pointers. 
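  /*
   * The in-place cgraph decode below is being dropped: tensors now go through
   * the ggml-rpc style rpc_tensor records (see serialize_tensor() above), and
   * the following commit rebuilds whole graphs host-side with
   * deserialize_graph() / create_node(). A rough picture of that flattening
   * (a sketch of the scheme, not additional API):
   *
   *   rpc_tensor r = serialize_tensor(t);   // r.id     = (uint64_t) t
   *                                         // r.src[i] = (uint64_t) t->src[i]
   *                                         // r.buffer = BUFFER_TO_HANDLE(t->buffer)
   *   // host side: create_node(r.id, ...) looks the id up in tensor_ptrs and
   *   // recursively re-links src[] and view_src into fresh ggml_tensor objects.
   */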
- ggml_cgraph *cgraph = (ggml_cgraph *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_cgraph)); - - cgraph->nodes = vn_decode_ggml_tensor_array_inplace(secondary_dec, cgraph->n_nodes); - for (int i = 0; i < cgraph->n_nodes; i++) { - cgraph->nodes[i] = (ggml_tensor *)(uintptr_t) vn_decode_ggml_tensor_inplace(secondary_dec, TENSOR_MAX_DEPTH_CGRAPH_DATA); - } + UNUSED(dec); + UNUSED(secondary_dec); - return cgraph; + return NULL; } diff --git a/ggml/src/ggml-remotingfrontend/CMakeLists.txt b/ggml/src/ggml-remotingfrontend/CMakeLists.txt index a13f48b7ef81b..e0b305fc26c3f 100644 --- a/ggml/src/ggml-remotingfrontend/CMakeLists.txt +++ b/ggml/src/ggml-remotingfrontend/CMakeLists.txt @@ -19,6 +19,7 @@ ggml_add_backend_library(ggml-remotingfrontend virtgpu-forward-backend.cpp virtgpu-forward-impl.h ../../include/ggml-remoting-frontend.h + ../ggml-remotingbackend/shared/venus_cs_ggml-rpc.cpp ) target_link_libraries(ggml-remotingfrontend PUBLIC drm) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp index 190fde76a30bc..05383ff99f0a5 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp @@ -20,7 +20,9 @@ static ggml_status ggml_backend_remoting_graph_compute(ggml_backend_t backend, g struct virtgpu *gpu = DEV_TO_GPU(backend->device); - BEING_IMPLEMENTED; + IMPLEMENTED; + + STOP_HERE; return apir_backend_graph_compute(gpu, cgraph); } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp index cfe89a19b761e..1a95f2f4721e5 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp @@ -26,6 +26,7 @@ apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, ggml_tensor *tensor, const void *data, size_t offset, size_t size) { struct vn_cs_encoder *encoder; struct vn_cs_decoder *decoder; + #if 0 INFO("Calling (%p)->set_tensor(tensor=%p, data=%p, offset=%lu, size=%lu", buffer_handle, tensor, data, offset, size); @@ -33,7 +34,7 @@ apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_SET_TENSOR); vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); - vn_encode_ggml_tensor(encoder, tensor, TENSOR_MAX_DEPTH_BUFFER_SET_TENSOR); + vn_encode_ggml_tensor(encoder, tensor); struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, size); if (!shmem) { @@ -64,7 +65,8 @@ apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_GET_TENSOR); vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); - vn_encode_ggml_tensor(encoder, tensor, TENSOR_MAX_DEPTH_BUFFER_GET_TENSOR); + vn_encode_ggml_tensor(encoder, tensor); + struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, size); if (!shmem) { FATAL("Couldn't allocate the guest-host shared buffer :/"); diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp index 4c7b1da767068..5ee2c01dd50ab 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp @@ -135,23 +135,29 @@ apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total) { bool apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op) { +#if 1 + /* ggml-rpc cheats it 
like this */ + /* with the current implementation of serialize_tensor, the src/view aren't properly passed */ + UNUSED(gpu); + UNUSED(op); + + return true; +#else struct vn_cs_encoder *encoder; struct vn_cs_decoder *decoder; - REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP); - vn_encode_ggml_tensor(encoder, op, TENSOR_MAX_DEPTH_DEVICE_SUPPORTS_OP); + vn_encode_ggml_tensor(encoder, op); REMOTE_CALL(gpu, encoder, decoder); bool supports_op; vn_decode_bool_t(decoder, &supports_op); - /* *** */ - REMOTE_CALL_FINISH(gpu, encoder, decoder); return supports_op; +#endif } apir_buffer_type_handle_t From 6fc0c88e029a499dc925aa764fa46071ef17c204 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 21 May 2025 14:11:36 +0200 Subject: [PATCH 075/117] start using the ggml-rpc serialization methods --- ggml/src/ggml-remotingbackend/CMakeLists.txt | 2 +- .../backend-dispatched-backend.cpp | 16 +- .../backend-dispatched-buffer-type.cpp | 4 + .../ggml-remotingbackend/shared/venus_cs.h | 6 + .../shared/venus_cs_ggml-rpc.cpp | 167 ++++++++++++++++++ .../shared/venus_cs_ggml-rpc.h | 43 +++++ .../shared/venus_cs_ggml.h | 48 ++--- .../venus_cs_ggml-rpc-back.cpp | 97 ++++++++++ ggml/src/ggml-remotingfrontend/CMakeLists.txt | 2 +- .../ggml-backend-buffer-type.cpp | 8 +- .../ggml-backend-device.cpp | 2 +- .../ggml-remotingfrontend/ggml-backend.cpp | 6 +- .../src/ggml-remotingfrontend/ggml-remoting.h | 3 + .../venus_cs_ggml-rpc-front.cpp | 84 +++++++++ .../virtgpu-forward-backend.cpp | 20 ++- .../ggml-remotingfrontend/virtgpu-utils.cpp | 5 + .../src/ggml-remotingfrontend/virtgpu-utils.h | 1 + 17 files changed, 461 insertions(+), 53 deletions(-) create mode 100644 ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.cpp create mode 100644 ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.h create mode 100644 ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp create mode 100644 ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp diff --git a/ggml/src/ggml-remotingbackend/CMakeLists.txt b/ggml/src/ggml-remotingbackend/CMakeLists.txt index f9a63ef60a445..7e374d395f68c 100644 --- a/ggml/src/ggml-remotingbackend/CMakeLists.txt +++ b/ggml/src/ggml-remotingbackend/CMakeLists.txt @@ -14,7 +14,7 @@ ggml_add_backend_library(ggml-remotingbackend shared/api_remoting.h shared/apir_backend.h shared/venus_cs.h - shared/venus_cs_ggml-rpc.cpp + venus_cs_ggml-rpc-back.cpp ) target_compile_options(ggml-remotingbackend PRIVATE -std=c++20) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp index 61619f4c94f6b..f34a5b8c4d645 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp @@ -18,19 +18,15 @@ backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, stru if (!shmem_data) { FATAL("Couldn't get the shmem addr from virgl :/"); } - size_t shmem_size; - vn_decode_size_t(dec, &shmem_size); + size_t cgraph_size; + vn_decode_size_t(dec, &cgraph_size); - struct vn_cs_decoder secondary_dec = vn_cs_new_decoder((const char *) shmem_data, shmem_size); + struct vn_cs_decoder secondary_dec = vn_cs_new_decoder((const char *) shmem_data, cgraph_size); - ggml_cgraph *cgraph = vn_decode_ggml_cgraph(dec, &secondary_dec); + ggml_cgraph *cgraph = vn_decode_ggml_cgraph(&secondary_dec, cgraph_size); - ggml_status status = GGML_STATUS_SUCCESS; - if (false) { - status = bck->iface.graph_compute(bck, cgraph); - } else { - 
WARNING("SKIPPING backend->graph_compute()"); - } + ggml_status status; + status = bck->iface.graph_compute(bck, cgraph); vn_encode_ggml_status(enc, &status); diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp index f09592ea5df43..8c3349a367dfc 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp @@ -70,5 +70,9 @@ backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder apir_buffer_handle_t *buffer_handle = (apir_buffer_handle_t *) buffer; vn_encode_ggml_buffer_handle(enc, buffer_handle); + if (buffer) { + track_backend_buffer(buffer); + } + return 0; } diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h index fdfb498576347..93af8fd287c81 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -205,6 +205,12 @@ vn_decode_uint64_t_array(struct vn_cs_decoder *dec, uint64_t *val, uint32_t coun vn_decode(dec, size, val, size); } +static inline const uint64_t * +vn_decode_uint64_t_array_inplace(struct vn_cs_decoder *dec, uint32_t count) +{ + return (uint64_t *)(uintptr_t) vn_cs_decoder_use_inplace(dec, count * sizeof(uint64_t)); +} + /* int32_t */ static inline size_t diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.cpp b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.cpp new file mode 100644 index 0000000000000..196cd70958745 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.cpp @@ -0,0 +1,167 @@ +#include +#include +#include +#include + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "venus_cs_ggml-rpc.h" + +std::unordered_set backend_buffers; + +void +track_backend_buffer(ggml_backend_buffer_t buffer) { + backend_buffers.insert(buffer); +} + +rpc_tensor +serialize_tensor(const ggml_tensor * tensor) { + rpc_tensor result; + result.id = reinterpret_cast(tensor); + result.type = tensor->type; + if (tensor->buffer) { + ggml_backend_buffer_t buffer = tensor->buffer; + + result.buffer = BUFFER_TO_HANDLE(buffer); + } else { + result.buffer = 0; + } + for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) { + result.ne[i] = tensor->ne[i]; + result.nb[i] = tensor->nb[i]; + } + result.op = tensor->op; + for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) { + result.op_params[i] = tensor->op_params[i]; + } + result.flags = tensor->flags; + for (uint32_t i = 0; i < GGML_MAX_SRC; i++) { + result.src[i] = reinterpret_cast(tensor->src[i]); + } + result.view_src = reinterpret_cast(tensor->view_src); + result.view_offs = tensor->view_offs; + result.data = reinterpret_cast(tensor->data); + snprintf(result.name, GGML_MAX_NAME, "%s", tensor->name); + return result; +} + +ggml_tensor * +deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor) { + ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type, + tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); + for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) { + result->nb[i] = tensor->nb[i]; + } + result->buffer = reinterpret_cast(tensor->buffer); + if (result->buffer && backend_buffers.find(result->buffer) == backend_buffers.end()) { + printf("WARNING: BUFFER NOT FOUND | %p\n", (void *)result->buffer); + result->buffer = nullptr; + } + + if (result->buffer) { + // require that the tensor data does not go beyond the buffer 
end + uint64_t tensor_size = (uint64_t) ggml_nbytes(result); + uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer); + uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer); + GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow + GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size); + } + + result->op = (ggml_op) tensor->op; + for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) { + result->op_params[i] = tensor->op_params[i]; + } + result->flags = tensor->flags; + result->data = reinterpret_cast(tensor->data); + ggml_set_name(result, tensor->name); + return result; +} + +void +add_tensor(ggml_tensor * tensor, std::vector & tensors, std::unordered_set & visited) { + if (tensor == nullptr) { + return; + } + if (visited.find(tensor) != visited.end()) { + return; + } + visited.insert(tensor); + for (int i = 0; i < GGML_MAX_SRC; i++) { + add_tensor(tensor->src[i], tensors, visited); + } + add_tensor(tensor->view_src, tensors, visited); + tensors.push_back(serialize_tensor(tensor)); +} + +void +serialize_graph(const ggml_cgraph * cgraph, std::vector & output) { + uint32_t n_nodes = cgraph->n_nodes; + std::vector tensors; + std::unordered_set visited; + for (uint32_t i = 0; i < n_nodes; i++) { + add_tensor(cgraph->nodes[i], tensors, visited); + } + // serialization format: + // | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) | + uint32_t n_tensors = tensors.size(); + int output_size = sizeof(uint32_t) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t) + n_tensors * sizeof(rpc_tensor); + output.resize(output_size, 0); + memcpy(output.data(), &n_nodes, sizeof(n_nodes)); + for (uint32_t i = 0; i < n_nodes; i++) { + memcpy(output.data() + sizeof(n_nodes) + i * sizeof(uint64_t), &cgraph->nodes[i], sizeof(uint64_t)); + } + uint32_t * out_ntensors = (uint32_t *)(output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t)); + *out_ntensors = n_tensors; + rpc_tensor * out_tensors = (rpc_tensor *)(output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t)); + memcpy(out_tensors, tensors.data(), n_tensors * sizeof(rpc_tensor)); +} + +ggml_tensor * +create_node(uint64_t id, + struct ggml_context * ctx, + const std::unordered_map & tensor_ptrs, + std::unordered_map & tensor_map) { + if (id == 0) { + return nullptr; + } + if (tensor_map.find(id) != tensor_map.end()) { + return tensor_map[id]; + } + const rpc_tensor * tensor = tensor_ptrs.at(id); + struct ggml_tensor * result = deserialize_tensor(ctx, tensor); + if (result == nullptr) { + return nullptr; + } + tensor_map[id] = result; + for (int i = 0; i < GGML_MAX_SRC; i++) { + result->src[i] = create_node(tensor->src[i], ctx, tensor_ptrs, tensor_map); + } + result->view_src = create_node(tensor->view_src, ctx, tensor_ptrs, tensor_map); + result->view_offs = tensor->view_offs; + return result; +} + +ggml_cgraph * +deserialize_graph(uint32_t n_nodes, uint32_t n_tensors, const rpc_tensor * tensors, const uint64_t * nodes) { + size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false); + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + struct ggml_context * ctx = ggml_init(params); + struct ggml_cgraph * graph = ggml_new_graph_custom(ctx, n_nodes, false); + graph->n_nodes = n_nodes; + std::unordered_map 
tensor_ptrs; + for (uint32_t i = 0; i < n_tensors; i++) { + tensor_ptrs[tensors[i].id] = &tensors[i]; + } + std::unordered_map tensor_map; + for (uint32_t i = 0; i < n_nodes; i++) { + int64_t id; + memcpy(&id, &nodes[i], sizeof(id)); + graph->nodes[i] = create_node(id, ctx, tensor_ptrs, tensor_map); + } + + return graph; +} diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.h new file mode 100644 index 0000000000000..a50405a479221 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.h @@ -0,0 +1,43 @@ +#include +#include +#include + +// ggml_tensor is serialized into rpc_tensor +struct rpc_tensor { + uint64_t id; + uint32_t type; + uint64_t buffer; + uint32_t ne[GGML_MAX_DIMS]; + uint32_t nb[GGML_MAX_DIMS]; + uint32_t op; + int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)]; + int32_t flags; + uint64_t src[GGML_MAX_SRC]; + uint64_t view_src; + uint64_t view_offs; + uint64_t data; + char name[GGML_MAX_NAME]; + + char padding[4]; +}; + +/* frontend */ + +rpc_tensor serialize_tensor(const ggml_tensor * tensor); + +void serialize_graph(const ggml_cgraph * cgraph, std::vector & output); + +/* backend */ + +void track_backend_buffer(ggml_backend_buffer_t buffer); + +void add_tensor(ggml_tensor * tensor, std::vector & tensors, std::unordered_set & visited); + +ggml_tensor *deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor); + +ggml_tensor *create_node(uint64_t id, + struct ggml_context * ctx, + const std::unordered_map & tensor_ptrs, + std::unordered_map & tensor_map); + +ggml_cgraph *deserialize_graph(uint32_t n_nodes, uint32_t n_tensors, const rpc_tensor * tensors, const uint64_t * nodes); diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h index 61f3a810ebc01..c32ac91650e4d 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h @@ -27,6 +27,13 @@ vn_decode_rpc_tensor_inplace(struct vn_cs_decoder *dec) { return (rpc_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, rpc_tensor_size); } +static inline rpc_tensor * +vn_decode_rpc_tensor_array_inplace(struct vn_cs_decoder *dec, uint32_t n_tensors) { + size_t rpc_tensor_size = sizeof(rpc_tensor) * n_tensors; + + return (rpc_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, rpc_tensor_size); +} + /* ggml_tensor */ static inline void @@ -120,37 +127,30 @@ vn_decode_virtgpu_shmem_res_id(struct vn_cs_decoder *dec, uint32_t *shmem_res_id /* ggml_cgraph */ static inline size_t -vn_encode_sizeof_ggml_cgraph_data(ggml_cgraph *cgraph) { - /* must match the encoding of vn_encode_ggml_cgraph and vn_encode_ggml_tensor */ - size_t size = 0; +vn_serialize_ggml_cgraph(ggml_cgraph *cgraph, std::vector & cgraph_data) { + serialize_graph(cgraph, cgraph_data); - // don't include the `ggml_cgraph`, only it's data - - // include the array of tensors - size += sizeof(ggml_tensor*) * cgraph->n_nodes; - - // include the size of all the tensors - for (int i = 0; i < cgraph->n_nodes; i++) { - size += vn_encode_sizeof_ggml_tensor(cgraph->nodes[i], TENSOR_MAX_DEPTH_CGRAPH_DATA); - } - INFO("SIZEOF(cgraph) --> %lu", size); - return size; + return cgraph_data.size(); } static inline void -vn_encode_ggml_cgraph(struct vn_cs_encoder *enc, ggml_cgraph *cgraph, struct vn_cs_encoder *secondary_enc) { - UNUSED(enc); - UNUSED(cgraph); - UNUSED(secondary_enc); +vn_encode_cgraph_data(struct vn_cs_encoder *enc, std::vector & 
cgraph_data) { + size_t cgraph_size = cgraph_data.size(); + + vn_encode(enc, cgraph_size, cgraph_data.data(), cgraph_size); } static inline ggml_cgraph * -vn_decode_ggml_cgraph(struct vn_cs_decoder *dec, struct vn_cs_decoder *secondary_dec) { - // it safe to remove the `const` qualifier here, we *do* want to - // modify the shared memory data to fix the `src` pointers. +vn_decode_ggml_cgraph(struct vn_cs_decoder *dec, size_t cgraph_size) { + UNUSED(cgraph_size); + + uint32_t n_nodes; + vn_decode_uint32_t(dec, &n_nodes); + const uint64_t * nodes = vn_decode_uint64_t_array_inplace(dec, n_nodes); - UNUSED(dec); - UNUSED(secondary_dec); + uint32_t n_tensors; + vn_decode_uint32_t(dec, &n_tensors); + const rpc_tensor *tensors = vn_decode_rpc_tensor_array_inplace(dec, n_tensors); - return NULL; + return deserialize_graph(n_nodes, n_tensors, tensors, nodes); } diff --git a/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp b/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp new file mode 100644 index 0000000000000..663160f48f061 --- /dev/null +++ b/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp @@ -0,0 +1,97 @@ +#include +#include +#include +#include + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "shared/venus_cs_ggml-rpc.h" + +std::unordered_set backend_buffers; + +void +track_backend_buffer(ggml_backend_buffer_t buffer) { + backend_buffers.insert(buffer); +} + +ggml_tensor * +deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor) { + ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type, + tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); + for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) { + result->nb[i] = tensor->nb[i]; + } + result->buffer = reinterpret_cast(tensor->buffer); + if (result->buffer && backend_buffers.find(result->buffer) == backend_buffers.end()) { + printf("WARNING: BUFFER NOT FOUND | %p\n", (void *)result->buffer); + result->buffer = nullptr; + } + + if (result->buffer) { + // require that the tensor data does not go beyond the buffer end + uint64_t tensor_size = (uint64_t) ggml_nbytes(result); + uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer); + uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer); + GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow + GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size); + } + + result->op = (ggml_op) tensor->op; + for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) { + result->op_params[i] = tensor->op_params[i]; + } + result->flags = tensor->flags; + result->data = reinterpret_cast(tensor->data); + ggml_set_name(result, tensor->name); + return result; +} + +ggml_tensor * +create_node(uint64_t id, + struct ggml_context * ctx, + const std::unordered_map & tensor_ptrs, + std::unordered_map & tensor_map) { + if (id == 0) { + return nullptr; + } + if (tensor_map.find(id) != tensor_map.end()) { + return tensor_map[id]; + } + const rpc_tensor * tensor = tensor_ptrs.at(id); + struct ggml_tensor * result = deserialize_tensor(ctx, tensor); + if (result == nullptr) { + return nullptr; + } + tensor_map[id] = result; + for (int i = 0; i < GGML_MAX_SRC; i++) { + result->src[i] = create_node(tensor->src[i], ctx, tensor_ptrs, tensor_map); + } + result->view_src = create_node(tensor->view_src, ctx, tensor_ptrs, tensor_map); + result->view_offs = tensor->view_offs; + return result; +} + +ggml_cgraph * 
+deserialize_graph(uint32_t n_nodes, uint32_t n_tensors, const rpc_tensor * tensors, const uint64_t * nodes) { + size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false); + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + struct ggml_context * ctx = ggml_init(params); + struct ggml_cgraph * graph = ggml_new_graph_custom(ctx, n_nodes, false); + graph->n_nodes = n_nodes; + std::unordered_map tensor_ptrs; + for (uint32_t i = 0; i < n_tensors; i++) { + tensor_ptrs[tensors[i].id] = &tensors[i]; + } + std::unordered_map tensor_map; + for (uint32_t i = 0; i < n_nodes; i++) { + int64_t id; + memcpy(&id, &nodes[i], sizeof(id)); + graph->nodes[i] = create_node(id, ctx, tensor_ptrs, tensor_map); + } + + return graph; +} diff --git a/ggml/src/ggml-remotingfrontend/CMakeLists.txt b/ggml/src/ggml-remotingfrontend/CMakeLists.txt index e0b305fc26c3f..15b338f730176 100644 --- a/ggml/src/ggml-remotingfrontend/CMakeLists.txt +++ b/ggml/src/ggml-remotingfrontend/CMakeLists.txt @@ -19,7 +19,7 @@ ggml_add_backend_library(ggml-remotingfrontend virtgpu-forward-backend.cpp virtgpu-forward-impl.h ../../include/ggml-remoting-frontend.h - ../ggml-remotingbackend/shared/venus_cs_ggml-rpc.cpp + venus_cs_ggml-rpc-front.cpp ) target_link_libraries(ggml-remotingfrontend PUBLIC drm) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp index 631db50b309cc..6343ce50b88a3 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp @@ -16,12 +16,14 @@ ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, context->gpu = gpu; context->handle = apir_buffer_type_alloc_buffer(gpu, buft, size); - return ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) context, size); + ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) context, size); + + return buffer; } static const char * ggml_backend_remoting_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - //IMPLEMENTED; + IMPLEMENTED_ONCE; struct virtgpu *gpu = BUFT_TO_GPU(buft); @@ -59,7 +61,7 @@ const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = { /* .get_alignment = */ ggml_backend_remoting_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_remoting_buffer_type_get_max_size, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .is_host = */ ggml_backend_remoting_buffer_type_is_host, + /* .is_host = */ NULL, }; /****************************************************************************************/ diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index 6d51643962d80..e1faad1a1f7fd 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -157,7 +157,7 @@ const struct ggml_backend_device_i ggml_backend_remoting_device_interface = { /* .get_props = */ ggml_backend_remoting_device_get_props, /* .init_backend = */ ggml_backend_remoting_device_init, /* .get_buffer_type = */ ggml_backend_remoting_device_get_buffer_type, - /* .get_host_buffer_type = */ ggml_backend_remoting_device_get_host_buffer_type, + /* .get_host_buffer_type = */ NULL, /* .buffer_from_host_ptr = */ NULL, /* .supports_op = */ 
ggml_backend_remoting_device_supports_op, /* .supports_buft = */ ggml_backend_remoting_device_supports_buft, diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp index 05383ff99f0a5..dacf0e3f1a597 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp @@ -3,7 +3,7 @@ static const char * ggml_backend_remoting_get_name(ggml_backend_t backend) { UNUSED(backend); - IMPLEMENTED; + IMPLEMENTED_ONCE; return "API Remoting backend"; } @@ -20,9 +20,7 @@ static ggml_status ggml_backend_remoting_graph_compute(ggml_backend_t backend, g struct virtgpu *gpu = DEV_TO_GPU(backend->device); - IMPLEMENTED; - - STOP_HERE; + IMPLEMENTED_ONCE; return apir_backend_graph_compute(gpu, cgraph); } diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index 6a8bf2ea75713..1bb004a7cc961 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -38,6 +38,9 @@ #define STOP_HERE \ thks_bye() +#define BREAKPOINT \ + breakpoint() + #define IMPLEMENTED \ printf("INFO: ### reached implemented function %s\n", __func__) diff --git a/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp b/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp new file mode 100644 index 0000000000000..d9b43f0222705 --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp @@ -0,0 +1,84 @@ +#include +#include +#include +#include + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" +#include "../ggml-remotingbackend/shared/venus_cs_ggml-rpc.h" + +#include "ggml-remoting.h" + +rpc_tensor +serialize_tensor(const ggml_tensor * tensor) { + rpc_tensor result; + result.id = reinterpret_cast(tensor); + result.type = tensor->type; + if (tensor->buffer) { + ggml_backend_buffer_t buffer = tensor->buffer; + + result.buffer = BUFFER_TO_HANDLE(buffer); + if (result.buffer < 0x600000000000 || result.buffer > 0x700000000000) { + INFO("pass buffer handle %p", result.buffer); + BREAKPOINT; + } + } else { + result.buffer = 0; + } + for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) { + result.ne[i] = tensor->ne[i]; + result.nb[i] = tensor->nb[i]; + } + result.op = tensor->op; + for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) { + result.op_params[i] = tensor->op_params[i]; + } + result.flags = tensor->flags; + for (uint32_t i = 0; i < GGML_MAX_SRC; i++) { + result.src[i] = reinterpret_cast(tensor->src[i]); + } + result.view_src = reinterpret_cast(tensor->view_src); + result.view_offs = tensor->view_offs; + result.data = reinterpret_cast(tensor->data); + snprintf(result.name, GGML_MAX_NAME, "%s", tensor->name); + return result; +} + +void +add_tensor(ggml_tensor * tensor, std::vector & tensors, std::unordered_set & visited) { + if (tensor == nullptr) { + return; + } + if (visited.find(tensor) != visited.end()) { + return; + } + visited.insert(tensor); + for (int i = 0; i < GGML_MAX_SRC; i++) { + add_tensor(tensor->src[i], tensors, visited); + } + add_tensor(tensor->view_src, tensors, visited); + tensors.push_back(serialize_tensor(tensor)); +} + +void +serialize_graph(const ggml_cgraph * cgraph, std::vector & output) { + uint32_t n_nodes = cgraph->n_nodes; + std::vector tensors; + std::unordered_set visited; + for (uint32_t i = 0; i < n_nodes; i++) { + add_tensor(cgraph->nodes[i], tensors, visited); + } + // serialization format: + // | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t) | 
n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) | + uint32_t n_tensors = tensors.size(); + int output_size = sizeof(uint32_t) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t) + n_tensors * sizeof(rpc_tensor); + output.resize(output_size, 0); + memcpy(output.data(), &n_nodes, sizeof(n_nodes)); + for (uint32_t i = 0; i < n_nodes; i++) { + memcpy(output.data() + sizeof(n_nodes) + i * sizeof(uint64_t), &cgraph->nodes[i], sizeof(uint64_t)); + } + uint32_t * out_ntensors = (uint32_t *)(output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t)); + *out_ntensors = n_tensors; + rpc_tensor * out_tensors = (rpc_tensor *)(output.data() + sizeof(n_nodes) + n_nodes * sizeof(uint64_t) + sizeof(uint32_t)); + memcpy(out_tensors, tensors.data(), n_tensors * sizeof(rpc_tensor)); +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp index 8d18c18f8e2b5..51399edfd1dbc 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp @@ -9,26 +9,28 @@ apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph) { REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE); - size_t size = vn_encode_sizeof_ggml_cgraph_data(cgraph); - struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, size); + std::vector cgraph_data; + size_t cgraph_size = vn_serialize_ggml_cgraph(cgraph, cgraph_data); + + struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, cgraph_size); if (!shmem) { - FATAL("Couldn't allocate the guest-host shared buffer :/"); + FATAL("Couldn't allocate the guest-host shared buffer for passing the cgraph :/"); } - INFO("Send shmem ID %d", shmem->res_id); + //INFO("Send shmem ID %d", shmem->res_id); vn_encode_virtgpu_shmem_res_id(encoder, shmem->res_id); - INFO("Send shmem size %lu", size); - vn_encode_size_t(encoder, &size); + //INFO("Send shmem size %lu", cgraph_size); + vn_encode_size_t(encoder, &cgraph_size); char *shmem_data = (char *) shmem->mmap_ptr; - struct vn_cs_encoder secondary_enc = vn_cs_new_encoder(shmem_data, size); + struct vn_cs_encoder secondary_enc = vn_cs_new_encoder(shmem_data, cgraph_size); - vn_encode_ggml_cgraph(encoder, cgraph, &secondary_enc); + vn_encode_cgraph_data(&secondary_enc, cgraph_data); REMOTE_CALL(gpu, encoder, decoder); ggml_status status = GGML_STATUS_ABORTED; vn_decode_ggml_status(decoder, &status); - INFO("Received status %u", status); + //INFO("Received status %u", status); REMOTE_CALL_FINISH(gpu, encoder, decoder); diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp index cedd31ddaaf9c..833f0e4680103 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-utils.cpp @@ -193,3 +193,8 @@ void thks_bye () { exit(0); } } + +void breakpoint() { + // break here + INFO("breakpoint here :)"); +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h index a6bd5df92ea6f..77a79ebb029ca 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h @@ -29,6 +29,7 @@ #define p_atomic_read(_v) __atomic_load_n((_v), __ATOMIC_ACQUIRE) void thks_bye(); +void breakpoint(); inline void INFO(const char *format, ...) 
{ From c927b34323764635c613778abc4d38056d07e2b4 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 21 May 2025 16:52:33 +0200 Subject: [PATCH 076/117] remoting: implement the free_buffer function --- .../backend-dispatched-buffer.cpp | 13 +++++++++++++ .../src/ggml-remotingbackend/backend-dispatched.h | 3 +++ .../ggml-remotingbackend/shared/apir_backend.h | 5 +++-- .../ggml-remotingfrontend/ggml-backend-buffer.cpp | 6 ++++-- .../virtgpu-forward-buffer.cpp | 15 +++++++++++++++ ggml/src/ggml-remotingfrontend/virtgpu-forward.h | 1 + 6 files changed, 39 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp index 70d86677d15b1..782391f8ae4c1 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp @@ -114,3 +114,16 @@ backend_buffer_clear(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struc return 0; } + +uint32_t +backend_buffer_free_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(enc); + + ggml_backend_buffer_t buffer; + buffer = vn_decode_ggml_buffer(dec); + + buffer->iface.free_buffer(buffer); + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h index 5464f56baf152..d8d86fc3f67f5 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.h +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -40,6 +40,7 @@ uint32_t backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder uint32_t backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); uint32_t backend_buffer_get_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); uint32_t backend_buffer_clear(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_buffer_free_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); /* backend */ uint32_t backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); @@ -69,6 +70,7 @@ static inline const char *backend_dispatch_command_name(ApirBackendCommandType t case APIR_COMMAND_TYPE_BUFFER_SET_TENSOR: return "backend_buffer_set_tensor"; case APIR_COMMAND_TYPE_BUFFER_GET_TENSOR: return "backend_buffer_get_tensor"; case APIR_COMMAND_TYPE_BUFFER_CLEAR: return "backend_buffer_clear"; + case APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER: return "backend_buffer_free_buffer"; /* backend */ case APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE: return "backend_graph_compute"; @@ -99,6 +101,7 @@ static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATC [APIR_COMMAND_TYPE_BUFFER_SET_TENSOR] = backend_buffer_set_tensor, [APIR_COMMAND_TYPE_BUFFER_GET_TENSOR] = backend_buffer_get_tensor, [APIR_COMMAND_TYPE_BUFFER_CLEAR] = backend_buffer_clear, + [APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER] = backend_buffer_free_buffer, /* backend */ [APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE] = backend_graph_compute, diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 14b0c21240547..1f39d063f8468 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -52,12 +52,13 @@ typedef enum 
ApirBackendCommandType { APIR_COMMAND_TYPE_BUFFER_SET_TENSOR = 14, APIR_COMMAND_TYPE_BUFFER_GET_TENSOR = 15, APIR_COMMAND_TYPE_BUFFER_CLEAR = 16, + APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER = 17, /* backend */ - APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE = 17, + APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE = 18, // last command_type index + 1 - APIR_BACKEND_DISPATCH_TABLE_COUNT = 18, + APIR_BACKEND_DISPATCH_TABLE_COUNT = 19, } ApirBackendCommandType; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index ed2c749958279..5a5ead36a762e 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -76,9 +76,11 @@ static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uin static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffer) { UNUSED(buffer); - NOT_IMPLEMENTED; + IMPLEMENTED_ONCE; - STOP_HERE; + struct virtgpu *gpu = BUFFER_TO_GPU(buffer); + + apir_buffer_free_buffer(gpu, BUFFER_TO_HANDLE(buffer)); } const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp index 1a95f2f4721e5..83f402bdd0dd4 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp @@ -99,3 +99,18 @@ apir_buffer_clear(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, REMOTE_CALL_FINISH(gpu, encoder, decoder); } + + +void +apir_buffer_free_buffer(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER); + + vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); + + REMOTE_CALL(gpu, encoder, decoder); + + REMOTE_CALL_FINISH(gpu, encoder, decoder); +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index d59cd754eb803..15885dfc12304 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -35,6 +35,7 @@ void apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_han const ggml_tensor *tensor, void *data, size_t offset, size_t size); void apir_buffer_clear(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, uint8_t value); +void apir_buffer_free_buffer(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle); /* backend */ From f29aa560e9b81c066889eb08546e786b2f3dd9a2 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 21 May 2025 17:13:53 +0200 Subject: [PATCH 077/117] remoting: highlight the hot path --- .../src/ggml-remotingfrontend/ggml-backend-device.cpp | 11 ++++++++--- ggml/src/ggml-remotingfrontend/ggml-backend.cpp | 2 +- ggml/src/ggml-remotingfrontend/ggml-remoting.h | 2 +- run.remoting.sh | 3 ++- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index e1faad1a1f7fd..67294fcfdd5de 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -38,16 +38,21 @@ ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, s static bool ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * 
op) { - IMPLEMENTED_ONCE; +#if 1 + UNUSED(dev); + UNUSED(op); + return true; // same as ggml-rpc +#else struct virtgpu *gpu = DEV_TO_GPU(dev); return apir_device_supports_op(gpu, op); +#endif } static bool ggml_backend_remoting_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - IMPLEMENTED_ONCE; + //IMPLEMENTED_ONCE; #if 1 bool supported = buft->device == dev; @@ -66,7 +71,7 @@ ggml_backend_remoting_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_ static bool ggml_backend_remoting_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { - IMPLEMENTED_ONCE; + //IMPLEMENTED_ONCE; UNUSED(dev); UNUSED(op); diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp index dacf0e3f1a597..e4be758af84b3 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp @@ -3,7 +3,7 @@ static const char * ggml_backend_remoting_get_name(ggml_backend_t backend) { UNUSED(backend); - IMPLEMENTED_ONCE; + //IMPLEMENTED_ONCE; return "API Remoting backend"; } diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index 1bb004a7cc961..f5f51335563ea 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -49,7 +49,7 @@ static bool first = true; \ if (first) { \ printf("INFO: ### reached implemented function %s\n", __func__); \ - first = false; \ + first = true; \ } \ } while(0) diff --git a/run.remoting.sh b/run.remoting.sh index b7175a78aab4c..00253e8f818f1 100755 --- a/run.remoting.sh +++ b/run.remoting.sh @@ -7,7 +7,8 @@ else fi MODEL="$HOME/models/llama3.2" -PROMPT="say nothing" +#PROMPT="say nothing" +PROMPT="tell what's Apple metal API" $prefix \ ../build.remoting-frontend/bin/llama-run \ --ngl 99 \ From 14f327ab1489c624920b52083ea5c1d302073023 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 22 May 2025 09:45:51 +0200 Subject: [PATCH 078/117] remoting: fix the warnings and mute the debug logs when not in debug mode --- ggml/src/ggml-remotingbackend/shared/venus_cs.h | 9 +++++++++ .../ggml-remotingfrontend/ggml-backend-buffer.cpp | 2 +- .../ggml-backend-host-buffer-type.cpp | 2 +- ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp | 2 +- ggml/src/ggml-remotingfrontend/ggml-remoting.h | 12 +++++++++++- ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp | 3 +++ ggml/src/ggml-remotingfrontend/virtgpu-utils.h | 5 +++++ ggml/src/ggml-remotingfrontend/virtgpu.cpp | 4 ++++ 8 files changed, 35 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h index 93af8fd287c81..2c8723fbfe1a6 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -165,6 +165,9 @@ static inline size_t vn_sizeof_uint64_t(const uint64_t *val) { assert(sizeof(*val) == 8); +#ifdef NDEBUG + UNUSED(val); +#endif return 8; } @@ -217,6 +220,9 @@ static inline size_t vn_sizeof_int32_t(const int32_t *val) { assert(sizeof(*val) == 4); +#ifdef NDEBUG + UNUSED(val); +#endif return 4; } @@ -327,6 +333,9 @@ static inline size_t vn_sizeof_uint32_t(const uint32_t *val) { assert(sizeof(*val) == 4); +#ifdef NDEBUG + UNUSED(val); +#endif return 4; } diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index 5a5ead36a762e..1f2db27c6c472 100644 --- 
a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -64,7 +64,7 @@ static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer } static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - IMPLEMENTED; + IMPLEMENTED_ONCE; struct virtgpu *gpu = BUFFER_TO_GPU(buffer); diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp index a355e9aebbbf9..20159faf3cae9 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp @@ -16,7 +16,7 @@ ggml_backend_remoting_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { } struct ggml_backend_remoting_device_context *device_ctx = GET_DEVICE_CONTEXT(); - struct vn_renderer_shmem *shmem; + struct vn_renderer_shmem *shmem = nullptr; size_t index; for (size_t i = 0; i < device_ctx->shared_memory.size(); i++) { diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp index ca98528fd7e08..055c9b0e10dbb 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp @@ -124,7 +124,7 @@ ggml_backend_reg_t ggml_backend_remoting_frontend_reg() { /* .context = */ gpu, }; - RMT_LOG_DEBUG("ggml_backend_remoting_frontend_reg() hello :wave:"); + INFO("ggml_backend_remoting_frontend_reg() hello :wave:"); ggml_backend_remoting_reg_init_devices(®); diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index f5f51335563ea..e13d16b4ad799 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -41,17 +41,27 @@ #define BREAKPOINT \ breakpoint() +#ifndef NDEBUG #define IMPLEMENTED \ printf("INFO: ### reached implemented function %s\n", __func__) +#else +#define IMPLEMENTED \ + do {} while(0) +#endif +#ifndef NDEBUG #define IMPLEMENTED_ONCE \ do { \ static bool first = true; \ if (first) { \ printf("INFO: ### reached implemented function %s\n", __func__); \ - first = true; \ + first = false; \ } \ } while(0) +#else +#define IMPLEMENTED_ONCE \ + do {} while(0) +#endif #define RMT_LOG_DEBUG(msg) std::cerr << msg << std::endl diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp index 935b1028d2ab0..617702b8eca0c 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-shm.cpp @@ -43,6 +43,9 @@ virtgpu_ioctl_gem_close(struct virtgpu *gpu, uint32_t gem_handle) const int ret = virtgpu_ioctl(gpu, DRM_IOCTL_GEM_CLOSE, &args); assert(!ret); +#ifdef NDEBUG + UNUSED(ret); +#endif } static void * diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h index 77a79ebb029ca..6b69ebc6329ca 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-utils.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-utils.h @@ -31,6 +31,7 @@ void thks_bye(); void breakpoint(); +#ifndef NDEBUG inline void INFO(const char *format, ...) { va_list argptr; @@ -39,6 +40,10 @@ INFO(const char *format, ...) { fprintf(stderr, "\n"); va_end(argptr); } +#else +inline void +INFO(...) {} +#endif inline void WARNING(const char *format, ...) 
{ diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp index 58d70ddda28ff..be54353ed3b1c 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -71,6 +71,10 @@ create_virtgpu() { result = virtgpu_init_context(gpu); assert(result == APIR_SUCCESS); +#ifdef NDEBUG + UNUSED(result); +#endif + virtgpu_init_shmem_blob_mem(gpu); gpu->reply_shmem = virtgpu_shmem_create(gpu, 16384); From e80e48056c75e7ba128072fd56c7fcf7ac51f0ef Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 22 May 2025 09:46:13 +0200 Subject: [PATCH 079/117] scripts: make it easier to build and run in prod mode --- build.backend.sh | 17 +++++- .../src/ggml-remotingfrontend/virtgpu-shm.cpp | 4 +- ggml/src/ggml-remotingfrontend/virtgpu.cpp | 3 +- prepare.backend.sh | 5 +- run.remoting.sh | 55 +++++++++++++++---- 5 files changed, 67 insertions(+), 17 deletions(-) diff --git a/build.backend.sh b/build.backend.sh index 086f7a4577ddd..863f98e3524a3 100755 --- a/build.backend.sh +++ b/build.backend.sh @@ -4,7 +4,22 @@ rm -f READY_backend FAILED_backend echo "int isatty(int fd) { return 1; }" | gcc -O2 -fpic -shared -ldl -o /tmp/isatty.so -xc - export LD_PRELOAD=/tmp/isatty.so -cmake --build ../build.remoting-backend --parallel 8 --target llama-run "$@" +if [[ "${PERF_MODE:-}" ]]; then + FLAVOR="-prod" +else + FLAVOR="" +fi + +if [[ "$FLAVOR" == "-prod" ]]; then + cat <base.mmap_ptr, shmem->base.mmap_size); - virtgpu_ioctl_gem_close(gpu, shmem->gem_handle); + munmap(shmem->base.mmap_ptr, shmem->base.mmap_size); + virtgpu_ioctl_gem_close(gpu, shmem->gem_handle); } struct vn_renderer_shmem * diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp index be54353ed3b1c..b595bb735a9f9 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -208,8 +208,7 @@ virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev) drmFreeVersion(version); - if (VN_DEBUG(INIT)) - vn_log(gpu->instance, "using DRM device %s", node_path); + INFO(gpu->instance, "using DRM device %s", node_path); return APIR_SUCCESS; } diff --git a/prepare.backend.sh b/prepare.backend.sh index a51f2465b6733..76e30fe31cfa4 100755 --- a/prepare.backend.sh +++ b/prepare.backend.sh @@ -1,5 +1,6 @@ -cmake -S . -B ../build.remoting-backend \ +cmake -S . -B ../build.remoting-backend-prod \ -DGGML_REMOTINGBACKEND=ON \ -DGGML_NATIVE=OFF \ - -DCMAKE_BUILD_TYPE=Debug \ "$@" + +# -DCMAKE_BUILD_TYPE=Debug \ diff --git a/run.remoting.sh b/run.remoting.sh index 00253e8f818f1..9a2a77f054210 100755 --- a/run.remoting.sh +++ b/run.remoting.sh @@ -1,17 +1,52 @@ #! 
/bin/bash -clear +#clear if [[ ${1:-} == "gdb" ]]; then prefix="gdb --args" else prefix="" fi -MODEL="$HOME/models/llama3.2" -#PROMPT="say nothing" -PROMPT="tell what's Apple metal API" -$prefix \ - ../build.remoting-frontend/bin/llama-run \ - --ngl 99 \ - --verbose \ - "$MODEL" \ - "$PROMPT" +if [[ "${PERF_MODE:-}" ]]; then + FLAVOR="-prod" +else + FLAVOR="" +fi + +MODEL=${MODEL:-llama3.2} + +if [[ "$FLAVOR" == "-prod" ]]; then + cat < Date: Thu, 22 May 2025 11:27:53 +0200 Subject: [PATCH 080/117] remotingfrontend: always prepare a shared memory for data --- .../virtgpu-forward-backend.cpp | 18 +++++++--- .../virtgpu-forward-buffer.cpp | 33 ++++++++++++++----- ggml/src/ggml-remotingfrontend/virtgpu.cpp | 11 +++++-- ggml/src/ggml-remotingfrontend/virtgpu.h | 1 + 4 files changed, 47 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp index 51399edfd1dbc..61c7fc7ac9839 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp @@ -12,10 +12,17 @@ apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph) { std::vector cgraph_data; size_t cgraph_size = vn_serialize_ggml_cgraph(cgraph, cgraph_data); - struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, cgraph_size); - if (!shmem) { - FATAL("Couldn't allocate the guest-host shared buffer for passing the cgraph :/"); + struct vn_renderer_shmem *shmem; + if (cgraph_size > gpu->data_shmem->mmap_size) { + shmem = virtgpu_shmem_create(gpu, cgraph_size); + WARNING("%s: 0x%lx | %dkB | %dMB", __func__, cgraph_size, (int)cgraph_size/1024, (int)cgraph_size/1024/1024); + if (!shmem) { + FATAL("Couldn't allocate the guest-host shared buffer :/"); + } + } else { + shmem = gpu->data_shmem; } + //INFO("Send shmem ID %d", shmem->res_id); vn_encode_virtgpu_shmem_res_id(encoder, shmem->res_id); //INFO("Send shmem size %lu", cgraph_size); @@ -34,7 +41,8 @@ apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph) { REMOTE_CALL_FINISH(gpu, encoder, decoder); - virtgpu_shmem_destroy(gpu, shmem->shmem); - + if (shmem != gpu->data_shmem) { + virtgpu_shmem_destroy(gpu, shmem->shmem); + } return status; } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp index 83f402bdd0dd4..f7c88a3634e87 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp @@ -36,9 +36,15 @@ apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); vn_encode_ggml_tensor(encoder, tensor); - struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, size); - if (!shmem) { - FATAL("Couldn't allocate the guest-host shared buffer :/"); + struct vn_renderer_shmem *shmem; + if (size > gpu->data_shmem->mmap_size) { + shmem = virtgpu_shmem_create(gpu, size); + WARNING("%s: 0x%lx | %dkB | %dMB", __func__, size, (int)size/1024, (int)size/1024/1024); + if (!shmem) { + FATAL("Couldn't allocate the guest-host shared buffer :/"); + } + } else { + shmem = gpu->data_shmem; } memcpy(shmem->mmap_ptr, data, size); @@ -51,7 +57,9 @@ apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, REMOTE_CALL_FINISH(gpu, encoder, decoder); - virtgpu_shmem_destroy(gpu, shmem->shmem); + if (shmem != gpu->data_shmem) { + virtgpu_shmem_destroy(gpu, 
shmem->shmem); + } return; } @@ -67,10 +75,17 @@ apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); vn_encode_ggml_tensor(encoder, tensor); - struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, size); - if (!shmem) { - FATAL("Couldn't allocate the guest-host shared buffer :/"); + struct vn_renderer_shmem *shmem; + if (size > gpu->data_shmem->mmap_size) { + shmem = virtgpu_shmem_create(gpu, size); + WARNING("%s: 0x%lx | %dkB | %dMB", __func__, size, (int)size/1024, (int)size/1024/1024); + if (!shmem) { + FATAL("Couldn't allocate the guest-host shared buffer :/"); + } + } else { + shmem = gpu->data_shmem; } + vn_encode_virtgpu_shmem_res_id(encoder, shmem->res_id); vn_encode_size_t(encoder, &offset); vn_encode_size_t(encoder, &size); @@ -81,7 +96,9 @@ apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, REMOTE_CALL_FINISH(gpu, encoder, decoder); - virtgpu_shmem_destroy(gpu, shmem->shmem); + if (shmem != gpu->data_shmem) { + virtgpu_shmem_destroy(gpu, shmem->shmem); + } } void diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp index b595bb735a9f9..ec9813815cc90 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -77,10 +77,15 @@ create_virtgpu() { virtgpu_init_shmem_blob_mem(gpu); - gpu->reply_shmem = virtgpu_shmem_create(gpu, 16384); + gpu->reply_shmem = virtgpu_shmem_create(gpu, 0x4000); + gpu->data_shmem = virtgpu_shmem_create(gpu, 0x13b0000); // 19MiB if (!gpu->reply_shmem) { - FATAL("%s: failed to create the reply shared memory page :/", __func__); + FATAL("%s: failed to create the shared reply memory pages :/", __func__); + } + + if (!gpu->data_shmem) { + FATAL("%s: failed to create the shared data memory pages :/", __func__); } struct vn_cs_encoder *encoder; @@ -208,7 +213,7 @@ virtgpu_open_device(struct virtgpu *gpu, const drmDevicePtr dev) drmFreeVersion(version); - INFO(gpu->instance, "using DRM device %s", node_path); + INFO("using DRM device %s", node_path); return APIR_SUCCESS; } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.h b/ggml/src/ggml-remotingfrontend/virtgpu.h index 5ab934ec7fb78..26933c8a6eda4 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu.h @@ -88,6 +88,7 @@ struct virtgpu { /* KP */ struct vn_renderer_shmem *reply_shmem; + struct vn_renderer_shmem *data_shmem; }; From 03935ee27c0cfc0bf0ca1a40bbe86939ee1e01f5 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 22 May 2025 16:40:21 +0200 Subject: [PATCH 081/117] remoting: release device buffers on exit --- .../backend-dispatched-buffer-type.cpp | 1 + .../backend-dispatched-buffer.cpp | 5 +++++ .../ggml-remotingbackend/backend-dispatched.cpp | 4 ++++ ggml/src/ggml-remotingbackend/backend.cpp | 10 ++++++++++ .../ggml-remotingbackend/shared/apir_backend.h | 2 +- .../shared/venus_cs_ggml-rpc.h | 2 ++ .../venus_cs_ggml-rpc-back.cpp | 16 ++++++++++++++++ .../virtgpu-forward-buffer.cpp | 2 +- ggml/src/ggml-remotingfrontend/virtgpu.cpp | 2 +- 9 files changed, 41 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp index 8c3349a367dfc..a796e9c1114a7 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp @@ -66,6 +66,7 @@ 
backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder size_t size; vn_decode_size_t(dec, &size); + WARNING("NEED TO ALLOCATE FROM PTR INSTEAD"); ggml_backend_buffer_t buffer = buft->iface.alloc_buffer(buft, size); apir_buffer_handle_t *buffer_handle = (apir_buffer_handle_t *) buffer; vn_encode_ggml_buffer_handle(enc, buffer_handle); diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp index 782391f8ae4c1..ea9f31ad1a634 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp @@ -123,6 +123,11 @@ backend_buffer_free_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, ggml_backend_buffer_t buffer; buffer = vn_decode_ggml_buffer(dec); + if (!untrack_backend_buffer(buffer)) { + WARNING("%s: unknown buffer %p", (void *) buffer); + return 1; + } + buffer->iface.free_buffer(buffer); return 0; diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp index 73be488e6c0f7..6781e108200c2 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp @@ -35,5 +35,9 @@ uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_ba return APIR_BACKEND_INITIALIZE_BACKEND_FAILED; } + size_t free, total; + dev->iface.get_memory(dev, &free, &total); + WARNING("%s: free memory: %ld MB\n", __func__, (size_t) free/1024/1024); + return APIR_BACKEND_INITIALIZE_SUCCESSS; } diff --git a/ggml/src/ggml-remotingbackend/backend.cpp b/ggml/src/ggml-remotingbackend/backend.cpp index c9d784941d514..22a60681d4447 100644 --- a/ggml/src/ggml-remotingbackend/backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend.cpp @@ -18,6 +18,16 @@ static void *backend_library_handle = NULL; extern "C" { void apir_backend_deinit(void) { + auto buffers = get_track_backend_buffers(); + for (const auto& buffer: buffers) { + untrack_backend_buffer(buffer); + buffer->iface.free_buffer(buffer); + } + + size_t free, total; + dev->iface.get_memory(dev, &free, &total); + WARNING("%s: free memory: %ld MB\n", __func__, (size_t) free/1024/1024); + if (backend_library_handle) { INFO("%s: The GGML backend library was loaded. 
Unloading it.", __func__); dlclose(backend_library_handle); diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 1f39d063f8468..8ab79b4cbe39f 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -1,6 +1,6 @@ #pragma once -#define APIR_LIBRARY_PATH "/Users/kevinpouget/remoting/llama_cpp/build.remoting-backend/bin/libggml-remotingbackend.dylib" +#define APIR_LIBRARY_PATH "/Users/kevinpouget/remoting/llama_cpp/build.remoting-backend-prod/bin/libggml-remotingbackend.dylib" #define APIR_INITIALIZE_FCT_NAME "apir_backend_initialize" #define APIR_DEINIT_FCT_NAME "apir_backend_deinit" #define APIR_DISPATCH_FCT_NAME "apir_backend_dispatcher" diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.h index a50405a479221..96402287af7fc 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml-rpc.h @@ -30,6 +30,8 @@ void serialize_graph(const ggml_cgraph * cgraph, std::vector & output); /* backend */ void track_backend_buffer(ggml_backend_buffer_t buffer); +bool untrack_backend_buffer(ggml_backend_buffer_t buffer); +std::unordered_set get_track_backend_buffers(); void add_tensor(ggml_tensor * tensor, std::vector & tensors, std::unordered_set & visited); diff --git a/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp b/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp index 663160f48f061..58a142ae93d5b 100644 --- a/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp +++ b/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp @@ -14,6 +14,22 @@ track_backend_buffer(ggml_backend_buffer_t buffer) { backend_buffers.insert(buffer); } +bool +untrack_backend_buffer(ggml_backend_buffer_t buffer) { + auto it = backend_buffers.find(buffer); + if (it == backend_buffers.end()) { + return false; + } + + backend_buffers.erase(it); + return true; +} + +std::unordered_set +get_track_backend_buffers() { + return backend_buffers; +} + ggml_tensor * deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor) { ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type, diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp index f7c88a3634e87..7452dd48ad4ea 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp @@ -39,7 +39,7 @@ apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, struct vn_renderer_shmem *shmem; if (size > gpu->data_shmem->mmap_size) { shmem = virtgpu_shmem_create(gpu, size); - WARNING("%s: 0x%lx | %dkB | %dMB", __func__, size, (int)size/1024, (int)size/1024/1024); + //WARNING("%s: 0x%lx | %dkB | %dMB", __func__, size, (int)size/1024, (int)size/1024/1024); if (!shmem) { FATAL("Couldn't allocate the guest-host shared buffer :/"); } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp index ec9813815cc90..39ed7b3a99f95 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp @@ -78,7 +78,7 @@ create_virtgpu() { virtgpu_init_shmem_blob_mem(gpu); gpu->reply_shmem = virtgpu_shmem_create(gpu, 0x4000); - gpu->data_shmem = virtgpu_shmem_create(gpu, 0x13b0000); // 19MiB + gpu->data_shmem = 
virtgpu_shmem_create(gpu, 0x1830000); // 24MiB if (!gpu->reply_shmem) { FATAL("%s: failed to create the shared reply memory pages :/", __func__); From 67d405d73da1d24d652e7d50ffb62d92381ea3d1 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 26 May 2025 10:02:52 +0200 Subject: [PATCH 082/117] remoting: refactor the buffer context --- .../ggml-remotingbackend/backend-convert.h | 6 ++- .../backend-dispatched-buffer-type.cpp | 26 ++++++++++--- .../shared/apir_backend.h | 11 +++++- .../ggml-remotingbackend/shared/venus_cs.h | 10 ++--- .../shared/venus_cs_ggml.h | 9 +++-- .../ggml-backend-buffer-type.cpp | 2 +- .../ggml-backend-buffer.cpp | 10 ++--- .../src/ggml-remotingfrontend/ggml-remoting.h | 18 +++------ .../venus_cs_ggml-rpc-front.cpp | 2 +- .../virtgpu-forward-buffer-type.cpp | 31 ++++++++++----- .../virtgpu-forward-buffer.cpp | 38 +++++++++++++------ .../ggml-remotingfrontend/virtgpu-forward.h | 14 +++---- 12 files changed, 112 insertions(+), 65 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-convert.h b/ggml/src/ggml-remotingbackend/backend-convert.h index e7d875cde7ee8..4b56a222f02da 100644 --- a/ggml/src/ggml-remotingbackend/backend-convert.h +++ b/ggml/src/ggml-remotingbackend/backend-convert.h @@ -1,7 +1,9 @@ #include "shared/apir_backend.h" -static inline apir_buffer_handle_t +#define BUFFER_TO_HOST_HANDLE(name) ggml_buffer_to_apir_handle(name) + +static inline apir_buffer_host_handle_t ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) { // in the backend, the buffer handle is the buffer pointer - return (apir_buffer_handle_t) buffer; + return (apir_buffer_host_handle_t) buffer; } diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp index a796e9c1114a7..0f577da1f7711 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp @@ -60,16 +60,32 @@ backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec uint32_t backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { UNUSED(ctx); +#if APIR_ALLOC_FROM_HOST_PTR + uint32_t shmem_res_id; + vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); + + void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id); + if (!shmem_data) { + FATAL("Couldn't get the shmem addr from virgl :/"); + } +#else ggml_backend_buffer_type_t buft; buft = vn_decode_ggml_buft(dec); - +#endif size_t size; vn_decode_size_t(dec, &size); - WARNING("NEED TO ALLOCATE FROM PTR INSTEAD"); - ggml_backend_buffer_t buffer = buft->iface.alloc_buffer(buft, size); - apir_buffer_handle_t *buffer_handle = (apir_buffer_handle_t *) buffer; - vn_encode_ggml_buffer_handle(enc, buffer_handle); + ggml_backend_buffer_t buffer; +#if APIR_ALLOC_FROM_HOST_PTR + WARNING("USING FROM_HOST_PTR\n\n"); + buffer = dev->iface.buffer_from_host_ptr(dev, shmem_data, size, size); +#else + WARNING("USING ALLOC_BUFFER"); + buffer = buft->iface.alloc_buffer(buft, size); + WARNING("USING ALLOC_BUFFER--> %p", buffer); +#endif + + vn_encode_ggml_buffer(enc, buffer); if (buffer) { track_backend_buffer(buffer); diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 8ab79b4cbe39f..c9d1b71af95b9 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -14,8 +14,17 @@ 
#define APIR_BACKEND_FORWARD_INDEX_INVALID 6 +#define APIR_ALLOC_FROM_HOST_PTR 0 + typedef uintptr_t apir_buffer_type_handle_t; -typedef uintptr_t apir_buffer_handle_t; +typedef uintptr_t apir_buffer_host_handle_t; + +typedef struct { + apir_buffer_host_handle_t host_handle; +#if APIR_ALLOC_FROM_HOST_PTR + struct vn_renderer_shmem *shmem; +#endif +} apir_buffer_context_t; typedef uint32_t (*apir_backend_initialize_t)(void); typedef void (*apir_backend_deinit_t)(void); diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h index 2c8723fbfe1a6..81dd5b8fb17ca 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -525,18 +525,18 @@ vn_decode_apir_buffer_type_handle_t(struct vn_cs_decoder *dec, apir_buffer_type_ vn_decode(dec, sizeof(apir_buffer_type_handle_t), val, sizeof(apir_buffer_type_handle_t)); } -/* apir_buffer_handle_t */ +/* apir_buffer_host_handle_t */ static inline void -vn_encode_apir_buffer_handle_t(struct vn_cs_encoder *enc, const apir_buffer_handle_t *val) +vn_encode_apir_buffer_host_handle_t(struct vn_cs_encoder *enc, const apir_buffer_host_handle_t *val) { - vn_encode(enc, sizeof(apir_buffer_handle_t), val, sizeof(apir_buffer_handle_t)); + vn_encode(enc, sizeof(apir_buffer_host_handle_t), val, sizeof(apir_buffer_host_handle_t)); } static inline void -vn_decode_apir_buffer_handle_t(struct vn_cs_decoder *dec, apir_buffer_handle_t *val) +vn_decode_apir_buffer_host_handle_t(struct vn_cs_decoder *dec, apir_buffer_host_handle_t *val) { - vn_decode(dec, sizeof(apir_buffer_handle_t), val, sizeof(apir_buffer_handle_t)); + vn_decode(dec, sizeof(apir_buffer_host_handle_t), val, sizeof(apir_buffer_host_handle_t)); } /* uintptr_t */ diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h index c32ac91650e4d..e0844113d9eb0 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h @@ -4,10 +4,10 @@ #include "venus_cs_ggml-rpc.h" // needs -// ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer); +// ggml_buffer_to_apir_host_handle(ggml_backend_buffer_t buffer); static inline void -vn_encode_ggml_buffer_handle(struct vn_cs_encoder *enc, const apir_buffer_handle_t *handle); +vn_encode_ggml_buffer_host_handle(struct vn_cs_encoder *enc, const apir_buffer_host_handle_t *handle); static inline ggml_backend_buffer_t vn_decode_ggml_buffer(struct vn_cs_decoder *dec); @@ -86,8 +86,9 @@ vn_decode_ggml_buft(struct vn_cs_decoder *dec) { // same logic as for ggml_backend_buffer_type_t static inline void -vn_encode_ggml_buffer_handle(struct vn_cs_encoder *enc, const apir_buffer_handle_t *handle) { - vn_cs_encoder_write(enc, sizeof(*handle), &handle, sizeof(*handle)); +vn_encode_ggml_buffer(struct vn_cs_encoder *enc, const ggml_backend_buffer_t buffer) { + apir_buffer_host_handle_t handle = BUFFER_TO_HOST_HANDLE(buffer); + vn_cs_encoder_write(enc, sizeof(handle), &handle, sizeof(handle)); } static inline ggml_backend_buffer_t diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp index 6343ce50b88a3..775238d501374 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp @@ -14,7 +14,7 @@ ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, } context->gpu = gpu; - 
context->handle = apir_buffer_type_alloc_buffer(gpu, buft, size); + context->apir_context = apir_buffer_type_alloc_buffer(gpu, buft, size); ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) context, size); diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index 1f2db27c6c472..99bdbdaca2275 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -8,7 +8,7 @@ static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer struct virtgpu *gpu = BUFFER_TO_GPU(buffer); - return apir_buffer_get_base(gpu, BUFFER_TO_HANDLE(buffer)); + return apir_buffer_get_base(gpu, BUFFER_TO_APIR_CONTEXT(buffer)); } static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { @@ -38,7 +38,7 @@ static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer } INFO("\n"); #endif - apir_buffer_set_tensor(gpu, BUFFER_TO_HANDLE(buffer), tensor, data, offset, size); + apir_buffer_set_tensor(gpu, BUFFER_TO_APIR_CONTEXT(buffer), tensor, data, offset, size); return; } @@ -47,7 +47,7 @@ static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer IMPLEMENTED_ONCE; struct virtgpu *gpu = BUFFER_TO_GPU(buffer); - apir_buffer_get_tensor(gpu, BUFFER_TO_HANDLE(buffer), tensor, data, offset, size); + apir_buffer_get_tensor(gpu, BUFFER_TO_APIR_CONTEXT(buffer), tensor, data, offset, size); } @@ -68,7 +68,7 @@ static void ggml_backend_remoting_buffer_clear(ggml_backend_buffer_t buffer, uin struct virtgpu *gpu = BUFFER_TO_GPU(buffer); - apir_buffer_clear(gpu, BUFFER_TO_HANDLE(buffer), value); + apir_buffer_clear(gpu, BUFFER_TO_APIR_CONTEXT(buffer), value); return; } @@ -80,7 +80,7 @@ static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffe struct virtgpu *gpu = BUFFER_TO_GPU(buffer); - apir_buffer_free_buffer(gpu, BUFFER_TO_HANDLE(buffer)); + apir_buffer_free_buffer(gpu, BUFFER_TO_APIR_CONTEXT(buffer)); } const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index e13d16b4ad799..0ffee92f0ec8a 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -13,8 +13,11 @@ #define DEV_TO_GPU(name) \ ((struct ggml_backend_remoting_device_context *) (name)->context)->gpu -#define BUFFER_TO_HANDLE(name) \ - ((struct ggml_backend_remoting_buffer_context *) (name)->context)->handle +#define BUFFER_TO_APIR_CONTEXT(name) \ + &((struct ggml_backend_remoting_buffer_context *) (name)->context)->apir_context + +#define BUFFER_TO_HOST_HANDLE(name) \ + ((struct ggml_backend_remoting_buffer_context *) (name)->context)->apir_context.host_handle #define GET_DEVICE_CONTEXT() \ (struct ggml_backend_remoting_device_context *) ggml_backend_remoting_get_device(0)->context \ @@ -76,20 +79,11 @@ struct ggml_backend_remoting_device_context { }; struct ggml_backend_remoting_buffer_context { - apir_buffer_handle_t handle; + apir_buffer_context_t apir_context; struct virtgpu *gpu; }; -static inline apir_buffer_handle_t ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) { - struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) buffer->context; - - if 
(!context) { - return 0; - } - return context->handle; -} - extern const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface; extern const struct ggml_backend_device_i ggml_backend_remoting_device_interface; extern const ggml_backend_buffer_type_i ggml_backend_remoting_host_buffer_type_interface; diff --git a/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp b/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp index d9b43f0222705..bc4b96b84f365 100644 --- a/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp +++ b/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp @@ -17,7 +17,7 @@ serialize_tensor(const ggml_tensor * tensor) { if (tensor->buffer) { ggml_backend_buffer_t buffer = tensor->buffer; - result.buffer = BUFFER_TO_HANDLE(buffer); + result.buffer = BUFFER_TO_HOST_HANDLE(buffer); if (result.buffer < 0x600000000000 || result.buffer > 0x700000000000) { INFO("pass buffer handle %p", result.buffer); BREAKPOINT; diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp index 645780715a133..f43c1851da797 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp @@ -8,7 +8,7 @@ apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME); apir_buffer_type_handle_t handle = (apir_buffer_type_handle_t) buft->context; - vn_encode_apir_buffer_handle_t(encoder, &handle); + vn_encode_apir_buffer_type_handle_t(encoder, &handle); REMOTE_CALL(gpu, encoder, decoder); @@ -36,7 +36,7 @@ apir_buffer_type_get_alignment(struct virtgpu *gpu, ggml_backend_buffer_type_t b REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT); apir_buffer_type_handle_t handle = (apir_buffer_type_handle_t) buft->context; - vn_encode_apir_buffer_handle_t(encoder, &handle); + vn_encode_apir_buffer_type_handle_t(encoder, &handle); REMOTE_CALL(gpu, encoder, decoder); @@ -58,7 +58,7 @@ apir_buffer_type_get_max_size(struct virtgpu *gpu, ggml_backend_buffer_type_t bu REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE); apir_buffer_type_handle_t handle = (apir_buffer_type_handle_t) buft->context; - vn_encode_apir_buffer_handle_t(encoder, &handle); + vn_encode_apir_buffer_type_handle_t(encoder, &handle); REMOTE_CALL(gpu, encoder, decoder); @@ -80,7 +80,7 @@ apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST); apir_buffer_type_handle_t handle = (apir_buffer_type_handle_t) buft->context; - vn_encode_apir_buffer_handle_t(encoder, &handle); + vn_encode_apir_buffer_type_handle_t(encoder, &handle); REMOTE_CALL(gpu, encoder, decoder); @@ -94,26 +94,37 @@ apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { return is_host; } -apir_buffer_handle_t +apir_buffer_context_t apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_backend_buffer_type_t buft, size_t size) { struct vn_cs_encoder *encoder; struct vn_cs_decoder *decoder; + apir_buffer_context_t buffer_context; INFO("%s: allocate device memory (%lu)", __func__, size); REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER); - apir_buffer_type_handle_t buft_handle = (apir_buffer_type_handle_t) buft->context; - vn_encode_apir_buffer_handle_t(encoder, &buft_handle); +#if 
APIR_ALLOC_FROM_HOST_PTR + UNUSED(buft); + + buffer_context.shmem = virtgpu_shmem_create(gpu, size); + //WARNING("%s: 0x%lx | %dkB | %dMB", __func__, size, (int)size/1024, (int)size/1024/1024); + if (!buffer_context.shmem) { + FATAL("Couldn't allocate the guest-host shared buffer :/"); + } + vn_encode_virtgpu_shmem_res_id(encoder, buffer_context.shmem->res_id); +#else + apir_buffer_type_handle_t buft_handle = (apir_buffer_type_handle_t) buft->context; + vn_encode_apir_buffer_type_handle_t(encoder, &buft_handle); +#endif vn_encode_size_t(encoder, &size); REMOTE_CALL(gpu, encoder, decoder); - apir_buffer_handle_t buffer_handle; - vn_decode_apir_buffer_handle_t(decoder, &buffer_handle); + vn_decode_apir_buffer_host_handle_t(decoder, &buffer_context.host_handle); REMOTE_CALL_FINISH(gpu, encoder, decoder); - return buffer_handle; + return buffer_context; } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp index 7452dd48ad4ea..18b010583fa6d 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp @@ -1,13 +1,13 @@ #include "virtgpu-forward-impl.h" void * -apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle) { +apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_context_t *buffer_context) { struct vn_cs_encoder *encoder; struct vn_cs_decoder *decoder; REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_GET_BASE); - vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); + vn_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); REMOTE_CALL(gpu, encoder, decoder); @@ -22,18 +22,18 @@ apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle) { } void -apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, +apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, ggml_tensor *tensor, const void *data, size_t offset, size_t size) { struct vn_cs_encoder *encoder; struct vn_cs_decoder *decoder; #if 0 INFO("Calling (%p)->set_tensor(tensor=%p, data=%p, offset=%lu, size=%lu", - buffer_handle, tensor, data, offset, size); + buffer_context->host_handle, tensor, data, offset, size); #endif REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_SET_TENSOR); - vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); + vn_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); vn_encode_ggml_tensor(encoder, tensor); struct vn_renderer_shmem *shmem; @@ -64,15 +64,26 @@ apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, return; } +#if APIR_ALLOC_FROM_HOST_PTR void -apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, +apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, + const ggml_tensor *tensor, void *data, size_t offset, size_t size) { + UNUSED(gpu); + UNUSED(tensor); + char *buffer_base_addr = (char *) buffer_context->shmem->mmap_ptr; + + memcpy(data, buffer_base_addr+offset, size); +} +#else +void +apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, const ggml_tensor *tensor, void *data, size_t offset, size_t size) { struct vn_cs_encoder *encoder; struct vn_cs_decoder *decoder; REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_GET_TENSOR); - vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); + vn_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); 
vn_encode_ggml_tensor(encoder, tensor); struct vn_renderer_shmem *shmem; @@ -100,16 +111,17 @@ apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, virtgpu_shmem_destroy(gpu, shmem->shmem); } } +#endif void -apir_buffer_clear(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, +apir_buffer_clear(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, uint8_t value) { struct vn_cs_encoder *encoder; struct vn_cs_decoder *decoder; REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_CLEAR); - vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); + vn_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); vn_encode_uint8_t(encoder, &value); REMOTE_CALL(gpu, encoder, decoder); @@ -119,15 +131,17 @@ apir_buffer_clear(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, void -apir_buffer_free_buffer(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle) { +apir_buffer_free_buffer(struct virtgpu *gpu, apir_buffer_context_t *buffer_context) { struct vn_cs_encoder *encoder; struct vn_cs_decoder *decoder; REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER); - vn_encode_apir_buffer_handle_t(encoder, &buffer_handle); + vn_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); REMOTE_CALL(gpu, encoder, decoder); - +#if APIR_ALLOC_FROM_HOST_PTR + virtgpu_shmem_destroy(gpu, buffer_context->shmem->shmem); +#endif REMOTE_CALL_FINISH(gpu, encoder, decoder); } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index 15885dfc12304..0429c5b757a18 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -23,19 +23,19 @@ const char *apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_t size_t apir_buffer_type_get_alignment(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); size_t apir_buffer_type_get_max_size(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); bool apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); -apir_buffer_handle_t apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_backend_buffer_type_t buffer_buft, size_t size); +apir_buffer_context_t apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_backend_buffer_type_t buffer_buft, size_t size); /* buffer */ -void *apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle); -enum ggml_status apir_buffer_init_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, ggml_tensor *tensor); -void apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, +void *apir_buffer_get_base(struct virtgpu *gpu, apir_buffer_context_t *buffer_context); +enum ggml_status apir_buffer_init_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, ggml_tensor *tensor); +void apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, ggml_tensor *tensor, const void *data, size_t offset, size_t size); -void apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, +void apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, const ggml_tensor *tensor, void *data, size_t offset, size_t size); -void apir_buffer_clear(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle, +void apir_buffer_clear(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, uint8_t value); -void apir_buffer_free_buffer(struct virtgpu *gpu, apir_buffer_handle_t buffer_handle); +void 
apir_buffer_free_buffer(struct virtgpu *gpu, apir_buffer_context_t *buffer_context); /* backend */ From c5d44f95c00614fab4a79cd5c4d016ee7cdcdfa5 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 26 May 2025 11:40:49 +0200 Subject: [PATCH 083/117] remoting: exchange more data --- .../ggml-remotingbackend/backend-convert.h | 6 ++++++ .../backend-dispatched-buffer-type.cpp | 15 +++++++------ .../backend-dispatched-device.cpp | 3 +-- .../shared/apir_backend.h | 5 +++-- .../ggml-remotingbackend/shared/venus_cs.h | 10 ++++----- .../shared/venus_cs_ggml.h | 18 ++++++++++++---- .../ggml-backend-device.cpp | 2 +- .../src/ggml-remotingfrontend/ggml-remoting.h | 6 ++++++ .../virtgpu-forward-buffer-type.cpp | 21 +++++++++---------- .../virtgpu-forward-device.cpp | 6 +++--- .../ggml-remotingfrontend/virtgpu-forward.h | 2 +- 11 files changed, 59 insertions(+), 35 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-convert.h b/ggml/src/ggml-remotingbackend/backend-convert.h index 4b56a222f02da..b45c2784160ac 100644 --- a/ggml/src/ggml-remotingbackend/backend-convert.h +++ b/ggml/src/ggml-remotingbackend/backend-convert.h @@ -7,3 +7,9 @@ ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) { // in the backend, the buffer handle is the buffer pointer return (apir_buffer_host_handle_t) buffer; } + +static inline apir_buffer_type_host_handle_t +ggml_buffer_type_to_apir_handle(ggml_backend_buffer_type_t buft) { + // in the backend, the buffer handle is the buffer pointer + return (apir_buffer_type_host_handle_t) buft; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp index 0f577da1f7711..9ff2e79831f87 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp @@ -10,7 +10,7 @@ uint32_t backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { UNUSED(ctx); ggml_backend_buffer_type_t buft; - buft = vn_decode_ggml_buft(dec); + buft = vn_decode_ggml_buffer_type(dec); const char *string = buft->iface.get_name(buft); @@ -25,7 +25,7 @@ uint32_t backend_buffer_type_get_alignment(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { UNUSED(ctx); ggml_backend_buffer_type_t buft; - buft = vn_decode_ggml_buft(dec); + buft = vn_decode_ggml_buffer_type(dec); size_t value = buft->iface.get_alignment(buft); vn_encode_size_t(enc, &value); @@ -37,7 +37,7 @@ uint32_t backend_buffer_type_get_max_size(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { UNUSED(ctx); ggml_backend_buffer_type_t buft; - buft = vn_decode_ggml_buft(dec); + buft = vn_decode_ggml_buffer_type(dec); size_t value = buft->iface.get_max_size(buft); vn_encode_size_t(enc, &value); @@ -49,7 +49,7 @@ uint32_t backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { UNUSED(ctx); ggml_backend_buffer_type_t buft; - buft = vn_decode_ggml_buft(dec); + buft = vn_decode_ggml_buffer_type(dec); bool is_host = buft->iface.is_host(buft); vn_encode_bool_t(enc, &is_host); @@ -70,7 +70,7 @@ backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder } #else ggml_backend_buffer_type_t buft; - buft = vn_decode_ggml_buft(dec); + buft = vn_decode_ggml_buffer_type(dec); #endif size_t size; vn_decode_size_t(dec, &size); @@ -78,7 +78,10 @@ 
backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder ggml_backend_buffer_t buffer; #if APIR_ALLOC_FROM_HOST_PTR WARNING("USING FROM_HOST_PTR\n\n"); - buffer = dev->iface.buffer_from_host_ptr(dev, shmem_data, size, size); + #define MAX_TENSOR_SIZE 323205120 + buffer = dev->iface.buffer_from_host_ptr(dev, shmem_data, size, MAX_TENSOR_SIZE); + + vn_encode_ggml_buffer_type(enc, buffer->buft); #else WARNING("USING ALLOC_BUFFER"); buffer = buft->iface.alloc_buffer(buft, size); diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp index 863c2698779e7..18f0e0a81b6a6 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp @@ -89,8 +89,7 @@ backend_device_get_buffer_type(struct vn_cs_encoder *enc, struct vn_cs_decoder * ggml_backend_buffer_type_t bufft = dev->iface.get_buffer_type(dev); - apir_buffer_type_handle_t buft_handle = (apir_buffer_type_handle_t) bufft; - vn_encode_apir_buffer_type_handle_t(enc, &buft_handle); + vn_encode_ggml_buffer_type(enc, bufft); return 0; } diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index c9d1b71af95b9..6449ccc109146 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -14,15 +14,16 @@ #define APIR_BACKEND_FORWARD_INDEX_INVALID 6 -#define APIR_ALLOC_FROM_HOST_PTR 0 +#define APIR_ALLOC_FROM_HOST_PTR 1 -typedef uintptr_t apir_buffer_type_handle_t; +typedef uintptr_t apir_buffer_type_host_handle_t; typedef uintptr_t apir_buffer_host_handle_t; typedef struct { apir_buffer_host_handle_t host_handle; #if APIR_ALLOC_FROM_HOST_PTR struct vn_renderer_shmem *shmem; + apir_buffer_type_host_handle_t buft_host_handle; #endif } apir_buffer_context_t; diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs.h b/ggml/src/ggml-remotingbackend/shared/venus_cs.h index 81dd5b8fb17ca..e67c99a46b5b6 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs.h @@ -511,18 +511,18 @@ vn_decode_bool_t(struct vn_cs_decoder *dec, bool *val) vn_decode(dec, sizeof(int), val, sizeof(bool)); } -/* apir_buffer_type_handle_t */ +/* apir_buffer_type_host_handle_t */ static inline void -vn_encode_apir_buffer_type_handle_t(struct vn_cs_encoder *enc, const apir_buffer_type_handle_t *val) +vn_encode_apir_buffer_type_host_handle_t(struct vn_cs_encoder *enc, const apir_buffer_type_host_handle_t *val) { - vn_encode(enc, sizeof(apir_buffer_type_handle_t), val, sizeof(apir_buffer_type_handle_t)); + vn_encode(enc, sizeof(apir_buffer_type_host_handle_t), val, sizeof(apir_buffer_type_host_handle_t)); } static inline void -vn_decode_apir_buffer_type_handle_t(struct vn_cs_decoder *dec, apir_buffer_type_handle_t *val) +vn_decode_apir_buffer_type_host_handle_t(struct vn_cs_decoder *dec, apir_buffer_type_host_handle_t *val) { - vn_decode(dec, sizeof(apir_buffer_type_handle_t), val, sizeof(apir_buffer_type_handle_t)); + vn_decode(dec, sizeof(apir_buffer_type_host_handle_t), val, sizeof(apir_buffer_type_host_handle_t)); } /* apir_buffer_host_handle_t */ diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h index e0844113d9eb0..71e15f847e851 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h @@ 
-67,19 +67,29 @@ vn_decode_ggml_tensor(struct vn_cs_decoder *dec) { static inline void -vn_encode_apir_buffer_type_handle_t(struct vn_cs_encoder *enc, apir_buffer_type_handle_t *handle) { - vn_cs_encoder_write(enc, sizeof(*handle), handle, sizeof(*handle)); +vn_encode_ggml_buffer_type(struct vn_cs_encoder *enc, ggml_backend_buffer_type_t buft) { + apir_buffer_type_host_handle_t handle = ggml_buffer_type_to_apir_handle(buft); + vn_cs_encoder_write(enc, sizeof(handle), &handle, sizeof(handle)); } static inline ggml_backend_buffer_type_t -vn_decode_ggml_buft(struct vn_cs_decoder *dec) { - apir_buffer_type_handle_t handle; +vn_decode_ggml_buffer_type(struct vn_cs_decoder *dec) { + apir_buffer_type_host_handle_t handle; vn_cs_decoder_read(dec, sizeof(handle), &handle, sizeof(handle)); return (ggml_backend_buffer_type_t) handle; } +static inline apir_buffer_type_host_handle_t +vn_decode_apir_buffer_type_host_handle(struct vn_cs_decoder *dec) { + apir_buffer_type_host_handle_t handle; + + vn_cs_decoder_read(dec, sizeof(handle), &handle, sizeof(handle)); + + return handle; +} + /* *** ggml_backend_type_t *** */ // ggml_backend_buffer_t is a POINTER. diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index 67294fcfdd5de..092c05b9e43f3 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -118,7 +118,7 @@ ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { struct virtgpu *gpu = DEV_TO_GPU(dev); - apir_buffer_type_handle_t ctx = apir_device_get_buffer_type(gpu); + apir_buffer_type_host_handle_t ctx = apir_device_get_buffer_type(gpu); static struct ggml_backend_buffer_type buft { /* .iface = */ ggml_backend_remoting_buffer_type_interface, diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index 0ffee92f0ec8a..71708e75a5d4a 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -22,6 +22,12 @@ #define GET_DEVICE_CONTEXT() \ (struct ggml_backend_remoting_device_context *) ggml_backend_remoting_get_device(0)->context \ +static inline apir_buffer_type_host_handle_t +ggml_buffer_type_to_apir_handle(ggml_backend_buffer_type_t buft) { + // in the backend, the buffer handle is the buffer pointer + return (apir_buffer_type_host_handle_t) buft->context; +} + #define NOT_IMPLEMENTED \ do { \ static bool first = true; \ diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp index f43c1851da797..4f7aac1360124 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp @@ -7,8 +7,7 @@ apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME); - apir_buffer_type_handle_t handle = (apir_buffer_type_handle_t) buft->context; - vn_encode_apir_buffer_type_handle_t(encoder, &handle); + vn_encode_ggml_buffer_type(encoder, buft); REMOTE_CALL(gpu, encoder, decoder); @@ -35,8 +34,7 @@ apir_buffer_type_get_alignment(struct virtgpu *gpu, ggml_backend_buffer_type_t b REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT); - apir_buffer_type_handle_t handle = (apir_buffer_type_handle_t) buft->context; - vn_encode_apir_buffer_type_handle_t(encoder, &handle); + 
vn_encode_ggml_buffer_type(encoder, buft); REMOTE_CALL(gpu, encoder, decoder); @@ -57,8 +55,7 @@ apir_buffer_type_get_max_size(struct virtgpu *gpu, ggml_backend_buffer_type_t bu REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE); - apir_buffer_type_handle_t handle = (apir_buffer_type_handle_t) buft->context; - vn_encode_apir_buffer_type_handle_t(encoder, &handle); + vn_encode_ggml_buffer_type(encoder, buft); REMOTE_CALL(gpu, encoder, decoder); @@ -79,8 +76,7 @@ apir_buffer_type_is_host(struct virtgpu *gpu, ggml_backend_buffer_type_t buft) { REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST); - apir_buffer_type_handle_t handle = (apir_buffer_type_handle_t) buft->context; - vn_encode_apir_buffer_type_handle_t(encoder, &handle); + vn_encode_ggml_buffer_type(encoder, buft); REMOTE_CALL(gpu, encoder, decoder); @@ -115,15 +111,18 @@ apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_backend_buffer_type_t bu vn_encode_virtgpu_shmem_res_id(encoder, buffer_context.shmem->res_id); #else - apir_buffer_type_handle_t buft_handle = (apir_buffer_type_handle_t) buft->context; - vn_encode_apir_buffer_type_handle_t(encoder, &buft_handle); + vn_encode_ggml_buffer_type(encoder, buft); #endif vn_encode_size_t(encoder, &size); REMOTE_CALL(gpu, encoder, decoder); - vn_decode_apir_buffer_host_handle_t(decoder, &buffer_context.host_handle); +#if APIR_ALLOC_FROM_HOST_PTR + buffer_context.buft_host_handle = vn_decode_apir_buffer_type_host_handle(decoder); +#endif + vn_decode_apir_buffer_host_handle_t(decoder, &buffer_context.host_handle); + REMOTE_CALL_FINISH(gpu, encoder, decoder); return buffer_context; diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp index 5ee2c01dd50ab..ffc6febf4cab0 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp @@ -160,7 +160,7 @@ apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op) { #endif } -apir_buffer_type_handle_t +apir_buffer_type_host_handle_t apir_device_get_buffer_type(struct virtgpu *gpu) { struct vn_cs_encoder *encoder; struct vn_cs_decoder *decoder; @@ -169,8 +169,8 @@ apir_device_get_buffer_type(struct virtgpu *gpu) { REMOTE_CALL(gpu, encoder, decoder); - apir_buffer_type_handle_t buft_handle; - vn_decode_apir_buffer_type_handle_t(decoder, &buft_handle); + apir_buffer_type_host_handle_t buft_handle; + vn_decode_apir_buffer_type_host_handle_t(decoder, &buft_handle); REMOTE_CALL_FINISH(gpu, encoder, decoder); diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index 0429c5b757a18..1f03f8bf31725 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -11,7 +11,7 @@ const char *apir_device_get_description(struct virtgpu *gpu); uint32_t apir_device_get_type(struct virtgpu *gpu); void apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total); bool apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op); -apir_buffer_type_handle_t apir_device_get_buffer_type(struct virtgpu *gpu); +apir_buffer_type_host_handle_t apir_device_get_buffer_type(struct virtgpu *gpu); void apir_device_get_props(struct virtgpu *gpu, bool *async, bool *host_buffer, From 83596a25899800d89152b244c462d52cca172936 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 26 May 2025 11:41:27 +0200 Subject: [PATCH 084/117] 
podman_compile: pass the PERF_MODE flag to the container --- podman_compile.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/podman_compile.sh b/podman_compile.sh index 4793b4ce20fa2..ec243f75ee89f 100755 --- a/podman_compile.sh +++ b/podman_compile.sh @@ -29,6 +29,7 @@ podman run \ --cgroupns host \ --security-opt label=disable \ --env HOME="$HOME" \ +--env PERF_MODE="${PERF_MODE:-}" \ -v "$HOME":"$HOME":Z \ -w "$PWD" \ -it --rm \ From 6b4bc18b295ee892a30228566392e33083a14644 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 27 May 2025 15:06:05 +0200 Subject: [PATCH 085/117] examples: run: run: measure the generation throughput --- examples/run/run.cpp | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/examples/run/run.cpp b/examples/run/run.cpp index 68e94b0b3c3f8..7c830255d0cee 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -965,6 +965,36 @@ static void print_word_and_concatenate_to_response(const std::string & piece, st response += piece; } +static long long timer_start = 0; +static long long timer_total = 0; +static long long timer_count = 0; + +static inline void start_timer(void) { + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); // Use CLOCK_MONOTONIC for elapsed time + timer_start = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec; +} + +static inline void stop_timer(void) { + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); // Use CLOCK_MONOTONIC for elapsed time + long long timer_end = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec; + + timer_total += (timer_end - timer_start); + timer_count += 1; +} + +static void show_timer(void) { + //printe("[%15lld] ns\n", timer_total); + long long ms = timer_total/1000000; + long long itl = ms/timer_count; + float speed = 1/((float)itl) * 1000; + printe("INFO: generate: [%7lld] ms for %lld invokations | ITL %lldms | throughput = %.2f t/s\n", timer_total/1000000, timer_count, itl, speed); + + printe("INFO: generate: [%7lld] s\n", timer_total/1000000/1000); +} + + // helper function to evaluate a prompt and generate a response static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) { const llama_vocab * vocab = llama_model_get_vocab(llama_data.model.get()); @@ -974,10 +1004,15 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str return 1; } + int cr = atexit(show_timer); + assert(cr == 0); + // prepare a batch for the prompt llama_batch batch = llama_batch_get_one(tokens.data(), tokens.size()); llama_token new_token_id; + while (true) { + start_timer(); check_context_size(llama_data.context, batch); if (llama_decode(llama_data.context.get(), batch)) { printe("failed to decode\n"); @@ -999,6 +1034,7 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str // prepare the next batch with the sampled token batch = llama_batch_get_one(&new_token_id, 1); + stop_timer(); } printf(LOG_COL_DEFAULT); From 9ab699dbd803af48ad3d54457737be3e9ecf6bd1 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 27 May 2025 15:07:02 +0200 Subject: [PATCH 086/117] examples: run: run: stop after 25 tokens --- examples/run/run.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/examples/run/run.cpp b/examples/run/run.cpp index 7c830255d0cee..42db6ef659980 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -1011,7 +1011,14 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str llama_batch batch = llama_batch_get_one(tokens.data(), 
tokens.size()); llama_token new_token_id; + int count = 0; while (true) { +#if 0 + if (count > 25) { + printe("WARNING: stopping after %d tokens", count); + break; + } +#endif start_timer(); check_context_size(llama_data.context, batch); if (llama_decode(llama_data.context.get(), batch)) { @@ -1035,6 +1042,7 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str // prepare the next batch with the sampled token batch = llama_batch_get_one(&new_token_id, 1); stop_timer(); + count += 1; } printf(LOG_COL_DEFAULT); From da8bdd487e4e054fe77fdd228f584c557efc7ee8 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 27 May 2025 15:08:50 +0200 Subject: [PATCH 087/117] remoting: add basic timing measurements --- .../backend-dispatched-backend.cpp | 6 ++++ .../backend-dispatched.cpp | 4 +++ ggml/src/ggml-remotingbackend/backend.cpp | 9 ++++++ .../shared/apir_backend.h | 31 ++++++++++++++++--- .../ggml-backend-reg.cpp | 12 +++++++ .../virtgpu-forward-backend.cpp | 15 +++++++-- .../ggml-remotingfrontend/virtgpu-forward.h | 2 ++ 7 files changed, 73 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp index f34a5b8c4d645..cf416156c483b 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp @@ -6,11 +6,15 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" +#include "shared/apir_backend.h" + uint32_t backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { UNUSED(ctx); UNUSED(enc); + start_timer(); + uint32_t shmem_res_id; vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); @@ -30,5 +34,7 @@ backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, stru vn_encode_ggml_status(enc, &status); + stop_timer(); + return 0; } diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp index 6781e108200c2..6038698fa9c05 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp @@ -12,6 +12,10 @@ ggml_backend_reg_t reg = NULL; ggml_backend_dev_t dev = NULL; ggml_backend_t bck = NULL; +long long timer_start = 0; +long long timer_total = 0; +long long timer_count = 0; + uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_backend_init_fct_p) { if (reg != NULL) { FATAL("%s: already initialized :/", __func__); diff --git a/ggml/src/ggml-remotingbackend/backend.cpp b/ggml/src/ggml-remotingbackend/backend.cpp index 22a60681d4447..6eab34acfccdc 100644 --- a/ggml/src/ggml-remotingbackend/backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend.cpp @@ -28,6 +28,10 @@ extern "C" { dev->iface.get_memory(dev, &free, &total); WARNING("%s: free memory: %ld MB\n", __func__, (size_t) free/1024/1024); + show_timer(); + + /* *** */ + if (backend_library_handle) { INFO("%s: The GGML backend library was loaded. 
Unloading it.", __func__); dlclose(backend_library_handle); @@ -91,6 +95,11 @@ extern "C" { return APIR_BACKEND_FORWARD_INDEX_INVALID; } +#if 0 + static long long count = 0; + INFO("[%lld] Calling %s", count, backend_dispatch_command_name((ApirBackendCommandType) cmd_type)); + count += 1; +#endif backend_dispatch_t forward_fct = apir_backend_dispatch_table[cmd_type]; uint32_t ret = forward_fct(enc, dec, ctx); diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 6449ccc109146..1e9c2e9356936 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -82,7 +82,30 @@ struct virgl_apir_context { struct virgl_apir_callbacks iface; }; -#define TENSOR_MAX_DEPTH_DEVICE_SUPPORTS_OP 2 -#define TENSOR_MAX_DEPTH_BUFFER_GET_TENSOR 2 -#define TENSOR_MAX_DEPTH_BUFFER_SET_TENSOR 2 -#define TENSOR_MAX_DEPTH_CGRAPH_DATA 10 +extern long long timer_start; +extern long long timer_total; +extern long long timer_count; + +static inline void start_timer(void) { + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); // Use CLOCK_MONOTONIC for elapsed time + timer_start = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec; +} + +static inline void stop_timer(void) { + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); // Use CLOCK_MONOTONIC for elapsed time + long long timer_end = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec; + + timer_total += (timer_end - timer_start); + timer_count += 1; +} + +static inline void show_timer(void) { + long long ms = timer_total/1000000; + long long itl = ms/timer_count; + float speed = 1/((float)itl) * 1000; + + INFO("compute_graph: [%9ld] ms for %ld invokations | ITL %lldms | throughput = %.2f t/s\n", timer_total/1000000, timer_count, itl, speed); + INFO("compute_graph: [%9ld] s", (ms)/1000); +} diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp index 055c9b0e10dbb..4b9888ca66386 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp @@ -111,6 +111,15 @@ static const struct ggml_backend_reg_i ggml_backend_remoting_reg_i = { /* .get_proc_address = */ NULL, }; +long long timer_start = 0; +long long timer_total = 0; +long long timer_count = 0; + +// needed because `show_timer` is inline +static void showTime() { + show_timer(); +} + ggml_backend_reg_t ggml_backend_remoting_frontend_reg() { struct virtgpu *gpu = apir_initialize(); if (!gpu) { @@ -128,5 +137,8 @@ ggml_backend_reg_t ggml_backend_remoting_frontend_reg() { ggml_backend_remoting_reg_init_devices(®); + int cr = atexit(showTime); + assert(cr == 0); + return ® } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp index 61c7fc7ac9839..e467bcd722d0a 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp @@ -1,9 +1,16 @@ #include "virtgpu-forward-impl.h" +static long long current_time_ms() { + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); // Use CLOCK_MONOTONIC for elapsed time + return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec; +} + ggml_status apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph) { - UNUSED(cgraph); - + + start_timer(); + struct vn_cs_encoder *encoder; struct vn_cs_decoder *decoder; @@ -44,5 +51,9 @@ apir_backend_graph_compute(struct 
virtgpu *gpu, ggml_cgraph *cgraph) { if (shmem != gpu->data_shmem) { virtgpu_shmem_destroy(gpu, shmem->shmem); } + + stop_timer(); + return status; } + diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index 1f03f8bf31725..239295aa3ac78 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -2,6 +2,8 @@ #include "ggml-impl.h" #include "ggml-alloc.h" +#include "virtgpu-utils.h" + #include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/apir_backend.h" /* device */ From 55ce372d2a167125ac4173ce1aaa31d97052b62a Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 27 May 2025 15:09:24 +0200 Subject: [PATCH 088/117] remoting: cleanup the logs --- .../backend-dispatched-buffer-type.cpp | 5 +---- .../ggml-remotingbackend/backend-dispatched-buffer.cpp | 2 -- ggml/src/ggml-remotingbackend/backend.cpp | 8 ++++++++ ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp | 8 ++++++++ ggml/src/ggml-remotingfrontend/ggml-backend.cpp | 3 --- 5 files changed, 17 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp index 9ff2e79831f87..405685b91527f 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp @@ -77,17 +77,14 @@ backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder ggml_backend_buffer_t buffer; #if APIR_ALLOC_FROM_HOST_PTR - WARNING("USING FROM_HOST_PTR\n\n"); #define MAX_TENSOR_SIZE 323205120 buffer = dev->iface.buffer_from_host_ptr(dev, shmem_data, size, MAX_TENSOR_SIZE); vn_encode_ggml_buffer_type(enc, buffer->buft); #else - WARNING("USING ALLOC_BUFFER"); buffer = buft->iface.alloc_buffer(buft, size); - WARNING("USING ALLOC_BUFFER--> %p", buffer); #endif - + vn_encode_ggml_buffer(enc, buffer); if (buffer) { diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp index ea9f31ad1a634..b755e9a946fa9 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp @@ -15,8 +15,6 @@ backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, st uintptr_t base = (uintptr_t) buffer->iface.get_base(buffer); vn_encode_uintptr_t(enc, &base); - //INFO("%s: send base %p\n", __func__, (void *) base); - return 0; } diff --git a/ggml/src/ggml-remotingbackend/backend.cpp b/ggml/src/ggml-remotingbackend/backend.cpp index 6eab34acfccdc..5ec77d96257d1 100644 --- a/ggml/src/ggml-remotingbackend/backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend.cpp @@ -69,6 +69,14 @@ extern "C" { return APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS; } + INFO("#"); +#if APIR_ALLOC_FROM_HOST_PTR + INFO("# USING ALLOC_FROM_HOST_PTR"); +#else + INFO("# USING ALLOC_BUFFER"); +#endif + INFO("#"); + return backend_dispatch_initialize(ggml_backend_reg_fct, ggml_backend_init_fct); } diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp index 4b9888ca66386..6dd8ad8919b94 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp @@ -135,6 +135,14 @@ ggml_backend_reg_t ggml_backend_remoting_frontend_reg() { 
INFO("ggml_backend_remoting_frontend_reg() hello :wave:"); + INFO("#"); +#if APIR_ALLOC_FROM_HOST_PTR + INFO("# USING ALLOC_FROM_HOST_PTR"); +#else + INFO("# USING ALLOC_BUFFER"); +#endif + INFO("#"); + ggml_backend_remoting_reg_init_devices(®); int cr = atexit(showTime); diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp index e4be758af84b3..00144d0ed166c 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp @@ -15,9 +15,6 @@ static void ggml_backend_remoting_free(ggml_backend_t backend) { } static ggml_status ggml_backend_remoting_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { - UNUSED(backend); - UNUSED(cgraph); - struct virtgpu *gpu = DEV_TO_GPU(backend->device); IMPLEMENTED_ONCE; From 559626523fe7ef7760ddeae75baa964837f1a84b Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 27 May 2025 15:09:40 +0200 Subject: [PATCH 089/117] ggml: src: ggml-remotingfrontend/ggml-backend-reg: call the initialization functions only once --- ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp index 6dd8ad8919b94..6d6896b063048 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp @@ -133,6 +133,12 @@ ggml_backend_reg_t ggml_backend_remoting_frontend_reg() { /* .context = */ gpu, }; + static bool initialized = false; + if (initialized) { + return ® + } + initialized = true; + INFO("ggml_backend_remoting_frontend_reg() hello :wave:"); INFO("#"); From 4f9a2d48a4f65f94c668341b18c98b419788bb54 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 27 May 2025 15:10:27 +0200 Subject: [PATCH 090/117] disable APIR_ALLOC_FROM_HOST_PTR --- ggml/src/ggml-remotingbackend/shared/apir_backend.h | 2 +- ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 1e9c2e9356936..0a627ea63b74d 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -14,7 +14,7 @@ #define APIR_BACKEND_FORWARD_INDEX_INVALID 6 -#define APIR_ALLOC_FROM_HOST_PTR 1 +#define APIR_ALLOC_FROM_HOST_PTR 0 typedef uintptr_t apir_buffer_type_host_handle_t; typedef uintptr_t apir_buffer_host_handle_t; diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp index 18b010583fa6d..dd3f7a5cc0bc5 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp @@ -64,7 +64,7 @@ apir_buffer_set_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_contex return; } -#if APIR_ALLOC_FROM_HOST_PTR +#if false void apir_buffer_get_tensor(struct virtgpu *gpu, apir_buffer_context_t *buffer_context, const ggml_tensor *tensor, void *data, size_t offset, size_t size) { From 4fa0b0a1398f50acaf21bd98f0612c719d07d006 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 27 May 2025 16:19:49 +0200 Subject: [PATCH 091/117] remoting: cache the buffer_get_base result --- .../ggml-backend-buffer-type.cpp | 1 + .../ggml-remotingfrontend/ggml-backend-buffer.cpp | 12 +++++++++--- ggml/src/ggml-remotingfrontend/ggml-remoting.h 
| 2 ++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp index 775238d501374..880f982c6c961 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp @@ -15,6 +15,7 @@ ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, context->gpu = gpu; context->apir_context = apir_buffer_type_alloc_buffer(gpu, buft, size); + context->base = NULL; ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) context, size); diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index 99bdbdaca2275..f3f47b325f14a 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -4,11 +4,17 @@ ((struct ggml_backend_remoting_buffer_context *) (name)->context)->gpu static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) { - //IMPLEMENTED; + IMPLEMENTED_ONCE; - struct virtgpu *gpu = BUFFER_TO_GPU(buffer); + struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) buffer->context; + if (context->base) { + return context->base; + } + + context->base = apir_buffer_get_base(BUFFER_TO_GPU(buffer), + BUFFER_TO_APIR_CONTEXT(buffer)); - return apir_buffer_get_base(gpu, BUFFER_TO_APIR_CONTEXT(buffer)); + return context->base; } static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index 71708e75a5d4a..05797775cf081 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -88,6 +88,8 @@ struct ggml_backend_remoting_buffer_context { apir_buffer_context_t apir_context; struct virtgpu *gpu; + + void *base; }; extern const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface; From 609c74391068401aa10835cd4a85ec6be61b9e00 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 28 May 2025 14:05:44 +0200 Subject: [PATCH 092/117] examples: run: run: improve the timing measurement --- examples/run/run.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/examples/run/run.cpp b/examples/run/run.cpp index 42db6ef659980..4bd97da0f1d0d 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -985,13 +985,11 @@ static inline void stop_timer(void) { } static void show_timer(void) { - //printe("[%15lld] ns\n", timer_total); - long long ms = timer_total/1000000; - long long itl = ms/timer_count; - float speed = 1/((float)itl) * 1000; - printe("INFO: generate: [%7lld] ms for %lld invokations | ITL %lldms | throughput = %.2f t/s\n", timer_total/1000000, timer_count, itl, speed); + double ms = timer_total/1000000; + double itl = ms/timer_count; + double speed = 1/itl * 1000; - printe("INFO: generate: [%7lld] s\n", timer_total/1000000/1000); + printe("LLAMA generate [%9.0f] ms for %4lld invocations | ITL %2.2f ms | throughput = %4.2f t/s\n", ms, timer_count, itl, speed); } From 6ea6a298c676221a26b9a73df4572e944ad7cb4c Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 28 May 2025 14:06:07 +0200 Subject: [PATCH 093/117] examples: 
run: run: remove the stop after 25 tokens --- examples/run/run.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/examples/run/run.cpp b/examples/run/run.cpp index 4bd97da0f1d0d..2107b37af476f 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -1009,14 +1009,7 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str llama_batch batch = llama_batch_get_one(tokens.data(), tokens.size()); llama_token new_token_id; - int count = 0; while (true) { -#if 0 - if (count > 25) { - printe("WARNING: stopping after %d tokens", count); - break; - } -#endif start_timer(); check_context_size(llama_data.context, batch); if (llama_decode(llama_data.context.get(), batch)) { @@ -1040,7 +1033,6 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str // prepare the next batch with the sampled token batch = llama_batch_get_one(&new_token_id, 1); stop_timer(); - count += 1; } printf(LOG_COL_DEFAULT); From aac3ca898e94dcd50781398dfd84452c02f24b68 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 28 May 2025 14:06:53 +0200 Subject: [PATCH 094/117] remoting: improve the timing measurement --- ggml/src/ggml-metal/ggml-metal.m | 46 +++++++++++++++++++ .../backend-dispatched-backend.cpp | 6 ++- .../backend-dispatched-buffer.cpp | 11 +++++ .../backend-dispatched.cpp | 4 +- ggml/src/ggml-remotingbackend/backend.cpp | 7 +-- .../shared/apir_backend.h | 35 ++++++++------ .../ggml-backend-buffer.cpp | 12 +++++ .../ggml-backend-reg.cpp | 8 ++-- .../ggml-remotingfrontend/ggml-backend.cpp | 10 +++- .../virtgpu-forward-backend.cpp | 8 +--- 10 files changed, 113 insertions(+), 34 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 97f426cbd3e13..777c868e949aa 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -4485,9 +4485,53 @@ static void ggml_metal_encode_node( } } +long long timer_start; +long long timer_total; +long long timer_count; + +static inline void start_timer(void) { + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); // Use CLOCK_MONOTONIC for elapsed time + timer_start = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec; +} + +static inline void stop_timer(void) { + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); // Use CLOCK_MONOTONIC for elapsed time + long long timer_end = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec; + + timer_total += (timer_end - timer_start); + timer_count += 1; +} + +static void show_timer(void) { + double ms = timer_total/1000000; + double itl = ms/timer_count; + double speed = 1/itl * 1000; + + printf("METAL compute_graph: [%9.0f] ms for %lld invokations | ITL %.2f ms | throughput = %.2f t/s\n",ms, timer_count, itl, speed); + + timer_start = 0; + timer_total = 1; // to avoid re-registering + timer_count = 0; +} + +static void show_timer_signal(int sig) { + GGML_UNUSED(sig); + show_timer(); +} + static enum ggml_status ggml_metal_graph_compute( ggml_backend_t backend, struct ggml_cgraph * gf) { + + if (timer_total == 0) { + signal(SIGUSR1, show_timer_signal); // kill -USR1 $(cat /tmp/krunkit.pid) + atexit(show_timer); + } + + start_timer(); + struct ggml_backend_metal_context * ctx = backend->context; struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; @@ -4615,6 +4659,8 @@ static enum ggml_status ggml_metal_graph_compute( } } + stop_timer(); + return GGML_STATUS_SUCCESS; } diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp 
b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp index cf416156c483b..6e600843a48db 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp @@ -8,12 +8,14 @@ #include "shared/apir_backend.h" +struct timer_data graph_compute_timer = {0, 0, 0, "compute_timer"}; + uint32_t backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { UNUSED(ctx); UNUSED(enc); - start_timer(); + start_timer(&graph_compute_timer); uint32_t shmem_res_id; vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); @@ -34,7 +36,7 @@ backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, stru vn_encode_ggml_status(enc, &status); - stop_timer(); + stop_timer(&graph_compute_timer); return 0; } diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp index b755e9a946fa9..fc1ccaef6748d 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer.cpp @@ -6,6 +6,9 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" +struct timer_data get_tensor_timer = {0, 0, 0, "get_tensor"}; +struct timer_data set_tensor_timer = {0, 0, 0, "set_tensor"}; + uint32_t backend_buffer_get_base(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { UNUSED(ctx); @@ -23,6 +26,8 @@ backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, UNUSED(ctx); UNUSED(enc); + start_timer(&set_tensor_timer); + ggml_backend_buffer_t buffer; buffer = vn_decode_ggml_buffer(dec); @@ -60,6 +65,8 @@ backend_buffer_set_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, buffer->iface.set_tensor(buffer, tensor, shmem_data, offset, size); + stop_timer(&set_tensor_timer); + return 0; } @@ -68,6 +75,8 @@ backend_buffer_get_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, UNUSED(ctx); UNUSED(enc); + start_timer(&get_tensor_timer); + ggml_backend_buffer_t buffer; buffer = vn_decode_ggml_buffer(dec); @@ -94,6 +103,8 @@ backend_buffer_get_tensor(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, UNUSED(tensor); buffer->iface.get_tensor(buffer, tensor, shmem_data, offset, size); + stop_timer(&get_tensor_timer); + return 0; } diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp index 6038698fa9c05..d90424a3d714f 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.cpp @@ -31,9 +31,9 @@ uint32_t backend_dispatch_initialize(void *ggml_backend_reg_fct_p, void *ggml_ba dev = reg->iface.get_device(reg, 0); } - ggml_backend_t (* ggml_backend_fct)(void) = (ggml_backend_t (*)()) ggml_backend_init_fct_p; + ggml_backend_t (* ggml_backend_fct)(int) = (ggml_backend_t (*)(int)) ggml_backend_init_fct_p; - bck = ggml_backend_fct(); + bck = ggml_backend_fct(0); if (!bck) { ERROR("%s: backend initialization failed :/", __func__); return APIR_BACKEND_INITIALIZE_BACKEND_FAILED; diff --git a/ggml/src/ggml-remotingbackend/backend.cpp b/ggml/src/ggml-remotingbackend/backend.cpp index 5ec77d96257d1..5bc6c923f405a 100644 --- a/ggml/src/ggml-remotingbackend/backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend.cpp @@ -28,8 +28,9 @@ extern "C" { dev->iface.get_memory(dev, &free, &total); WARNING("%s: free memory: %ld MB\n", __func__, (size_t) free/1024/1024); - 
show_timer(); - + show_timer(&graph_compute_timer); + show_timer(&set_tensor_timer); + show_timer(&get_tensor_timer); /* *** */ if (backend_library_handle) { @@ -43,7 +44,7 @@ extern "C" { uint32_t apir_backend_initialize() { const char* dlsym_error; - INFO("%s: hello :wave: \\o/", __func__); + INFO("%s: hello " GGML_BACKEND_REG_FCT_NAME " :wave: \\o/", __func__); backend_library_handle = dlopen(GGML_BACKEND_LIBRARY_PATH, RTLD_LAZY); diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 0a627ea63b74d..ad1747b17d182 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -82,30 +82,37 @@ struct virgl_apir_context { struct virgl_apir_callbacks iface; }; -extern long long timer_start; -extern long long timer_total; -extern long long timer_count; +struct timer_data { + long long start; + long long total; + long long count; + const char *name; +}; + +extern struct timer_data graph_compute_timer; +extern struct timer_data get_tensor_timer; +extern struct timer_data set_tensor_timer; -static inline void start_timer(void) { +static inline void start_timer(struct timer_data *timer) { struct timespec ts; clock_gettime(CLOCK_REALTIME, &ts); // Use CLOCK_MONOTONIC for elapsed time - timer_start = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec; + timer->start = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec; } -static inline void stop_timer(void) { +static inline void stop_timer(struct timer_data *timer) { struct timespec ts; clock_gettime(CLOCK_REALTIME, &ts); // Use CLOCK_MONOTONIC for elapsed time long long timer_end = (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec; - timer_total += (timer_end - timer_start); - timer_count += 1; + timer->total += (timer_end - timer->start); + timer->count += 1; } -static inline void show_timer(void) { - long long ms = timer_total/1000000; - long long itl = ms/timer_count; - float speed = 1/((float)itl) * 1000; +static inline void show_timer(struct timer_data *timer) { + double ms = timer->total/1000000; + double itl = ms/timer->count; + double speed = 1/itl * 1000; - INFO("compute_graph: [%9ld] ms for %ld invokations | ITL %lldms | throughput = %.2f t/s\n", timer_total/1000000, timer_count, itl, speed); - INFO("compute_graph: [%9ld] s", (ms)/1000); + INFO("%14s [%9.0f] ms for %4ld invocations | ITL %2.2f ms | throughput = %4.2f t/s", + timer->name, ms, timer->count, itl, speed); } diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index f3f47b325f14a..d056249bdf681 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -3,6 +3,9 @@ #define BUFFER_TO_GPU(name) \ ((struct ggml_backend_remoting_buffer_context *) (name)->context)->gpu +struct timer_data get_tensor_timer = {0, 0, 0, "get_tensor"}; +struct timer_data set_tensor_timer = {0, 0, 0, "set_tensor"}; + static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) { IMPLEMENTED_ONCE; @@ -32,6 +35,8 @@ static void ggml_backend_remoting_buffer_memset_tensor(ggml_backend_buffer_t buf static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { IMPLEMENTED_ONCE; + start_timer(&set_tensor_timer); + struct virtgpu *gpu = BUFFER_TO_GPU(buffer); #if 0 INFO("%s: data=%p, offset=%lu, size=%lu\n", __func__, data, offset, 
size); @@ -46,14 +51,21 @@ static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer #endif apir_buffer_set_tensor(gpu, BUFFER_TO_APIR_CONTEXT(buffer), tensor, data, offset, size); + stop_timer(&set_tensor_timer); + return; } static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { IMPLEMENTED_ONCE; + + start_timer(&get_tensor_timer); + struct virtgpu *gpu = BUFFER_TO_GPU(buffer); apir_buffer_get_tensor(gpu, BUFFER_TO_APIR_CONTEXT(buffer), tensor, data, offset, size); + + stop_timer(&get_tensor_timer); } diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp index 6d6896b063048..e9b22071af224 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp @@ -111,13 +111,11 @@ static const struct ggml_backend_reg_i ggml_backend_remoting_reg_i = { /* .get_proc_address = */ NULL, }; -long long timer_start = 0; -long long timer_total = 0; -long long timer_count = 0; -// needed because `show_timer` is inline static void showTime() { - show_timer(); + show_timer(&graph_compute_timer); + show_timer(&get_tensor_timer); + show_timer(&set_tensor_timer); } ggml_backend_reg_t ggml_backend_remoting_frontend_reg() { diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp index 00144d0ed166c..14f95ec88ff02 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend.cpp @@ -14,12 +14,20 @@ static void ggml_backend_remoting_free(ggml_backend_t backend) { delete backend; } +struct timer_data graph_compute_timer = {0, 0, 0, "compute_timer"}; + static ggml_status ggml_backend_remoting_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { struct virtgpu *gpu = DEV_TO_GPU(backend->device); IMPLEMENTED_ONCE; - return apir_backend_graph_compute(gpu, cgraph); + start_timer(&graph_compute_timer); + + ggml_status status = apir_backend_graph_compute(gpu, cgraph); + + stop_timer(&graph_compute_timer); + + return status; } static ggml_backend_i ggml_backend_remoting_interface = { diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp index e467bcd722d0a..82b51838997c6 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-backend.cpp @@ -8,9 +8,6 @@ static long long current_time_ms() { ggml_status apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph) { - - start_timer(); - struct vn_cs_encoder *encoder; struct vn_cs_decoder *decoder; @@ -51,9 +48,6 @@ apir_backend_graph_compute(struct virtgpu *gpu, ggml_cgraph *cgraph) { if (shmem != gpu->data_shmem) { virtgpu_shmem_destroy(gpu, shmem->shmem); } - - stop_timer(); - + return status; } - From b4837da71785859eb1390da7a5bde235cb7325c1 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 28 May 2025 14:07:08 +0200 Subject: [PATCH 095/117] remoting: allow compiling to Vulkan --- ggml/src/ggml-remotingbackend/backend.cpp | 8 ++++++++ prepare.backend.sh | 11 ++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-remotingbackend/backend.cpp b/ggml/src/ggml-remotingbackend/backend.cpp index 5bc6c923f405a..fa2344ea8f676 100644 --- a/ggml/src/ggml-remotingbackend/backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend.cpp @@ -10,9 
+10,17 @@
 #include "shared/apir_backend.h"
 #include "shared/venus_cs.h"
 
+#define USE_METAL 1
+
+#if USE_METAL
 #define GGML_BACKEND_LIBRARY_PATH "/Users/kevinpouget/remoting/llama_cpp/build.remoting-backend/bin/libggml-metal.dylib"
 #define GGML_BACKEND_REG_FCT_NAME "ggml_backend_metal_reg"
 #define GGML_BACKEND_INIT_FCT_NAME "ggml_backend_metal_init"
+#else
+#define GGML_BACKEND_LIBRARY_PATH "/Users/kevinpouget/remoting/llama_cpp/build.remoting-backend/bin/libggml-vulkan.dylib"
+#define GGML_BACKEND_REG_FCT_NAME "ggml_backend_vk_reg"
+#define GGML_BACKEND_INIT_FCT_NAME "ggml_backend_vk_init"
+#endif
 
 static void *backend_library_handle = NULL;
 
diff --git a/prepare.backend.sh b/prepare.backend.sh
index 76e30fe31cfa4..caed8223382e9 100755
--- a/prepare.backend.sh
+++ b/prepare.backend.sh
@@ -1,6 +1,15 @@
-cmake -S . -B ../build.remoting-backend-prod \
+if [[ "${PERF_MODE:-}" ]]; then
+    FLAVOR="-prod"
+else
+    FLAVOR=""
+fi
+
+cmake -S . -B ../build.remoting-backend$FLAVOR \
     -DGGML_REMOTINGBACKEND=ON \
     -DGGML_NATIVE=OFF \
+    -DGGML_METAL=ON \
+    -DGGML_VULKAN=OFF -DVulkan_INCLUDE_DIR=/opt/homebrew/include/ -DVulkan_LIBRARY=/opt/homebrew/lib/libMoltenVK.dylib \
     "$@"
 
 # -DCMAKE_BUILD_TYPE=Debug \
+#

From ecb7a235736c878a5f0a3642139ba1d9ff532cc9 Mon Sep 17 00:00:00 2001
From: Kevin Pouget
Date: Wed, 28 May 2025 14:07:23 +0200
Subject: [PATCH 096/117] ggml: src: ggml-remotingfrontend/virtgpu: reduce the response time wait delay

---
 ggml/src/ggml-remotingfrontend/virtgpu.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.cpp b/ggml/src/ggml-remotingfrontend/virtgpu.cpp
index 39ed7b3a99f95..66bbf17ac6d63 100644
--- a/ggml/src/ggml-remotingfrontend/virtgpu.cpp
+++ b/ggml/src/ggml-remotingfrontend/virtgpu.cpp
@@ -481,7 +481,7 @@ remote_call(
    */
 
   while (std::atomic_load_explicit(atomic_reply_notif, std::memory_order_acquire) == 0) {
-    int64_t base_sleep_us = 160;
+    int64_t base_sleep_us = 15;
     os_time_sleep(base_sleep_us);
   }
 
From 3f3624411ac1c50172bf87ddc08afd27664b7e0c Mon Sep 17 00:00:00 2001
From: Kevin Pouget
Date: Mon, 2 Jun 2025 14:03:23 +0200
Subject: [PATCH 097/117] remoting: experiment with buffer_from_ptr

---
 .../backend-dispatched-buffer-type.cpp        | 16 +----
 .../backend-dispatched-device.cpp             | 31 +++++++++
 .../ggml-remotingbackend/backend-dispatched.h |  3 +
 ggml/src/ggml-remotingbackend/backend.cpp     |  8 ---
 .../shared/apir_backend.h                     | 30 ++++-----
 .../venus_cs_ggml-rpc-back.cpp                | 11 +++-
 .../ggml-backend-buffer-type.cpp              | 14 ++++
 .../ggml-backend-buffer.cpp                   | 44 +++++++++++++
 .../ggml-backend-device.cpp                   | 66 ++++++++++++++-----
 .../ggml-backend-reg.cpp                      |  8 ---
 .../src/ggml-remotingfrontend/ggml-remoting.h |  7 ++
 .../venus_cs_ggml-rpc-front.cpp               |  2 +
 .../virtgpu-forward-buffer-type.cpp           | 19 +-----
 .../virtgpu-forward-buffer.cpp                |  4 +-
 .../virtgpu-forward-device.cpp                | 37 +++++++++++
 .../ggml-remotingfrontend/virtgpu-forward.h   |  4 +-
 16 files changed, 215 insertions(+), 89 deletions(-)

diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp
index 405685b91527f..f925d1e066fc0 100644
--- a/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp
+++ b/ggml/src/ggml-remotingbackend/backend-dispatched-buffer-type.cpp
@@ -60,30 +60,16 @@ backend_buffer_type_is_host(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec
 uint32_t
 backend_buffer_type_alloc_buffer(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { 
UNUSED(ctx); -#if APIR_ALLOC_FROM_HOST_PTR - uint32_t shmem_res_id; - vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); - void *shmem_data = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id); - if (!shmem_data) { - FATAL("Couldn't get the shmem addr from virgl :/"); - } -#else ggml_backend_buffer_type_t buft; buft = vn_decode_ggml_buffer_type(dec); -#endif + size_t size; vn_decode_size_t(dec, &size); ggml_backend_buffer_t buffer; -#if APIR_ALLOC_FROM_HOST_PTR - #define MAX_TENSOR_SIZE 323205120 - buffer = dev->iface.buffer_from_host_ptr(dev, shmem_data, size, MAX_TENSOR_SIZE); - vn_encode_ggml_buffer_type(enc, buffer->buft); -#else buffer = buft->iface.alloc_buffer(buft, size); -#endif vn_encode_ggml_buffer(enc, buffer); diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp index 18f0e0a81b6a6..5bf0788ccf864 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp @@ -109,3 +109,34 @@ backend_device_get_props(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, s return 0; } + +uint32_t +backend_device_buffer_from_ptr(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { + UNUSED(ctx); + UNUSED(dec); + + uint32_t shmem_res_id; + vn_decode_virtgpu_shmem_res_id(dec, &shmem_res_id); + + void *shmem_ptr = ctx->iface.get_shmem_ptr(ctx->virgl_ctx, shmem_res_id); + if (!shmem_ptr) { + FATAL("Couldn't get the shmem addr from virgl :/"); + } + + size_t size; + vn_decode_size_t(dec, &size); + size_t max_tensor_size; + vn_decode_size_t(dec, &max_tensor_size); + + ggml_backend_buffer_t buffer; + buffer = dev->iface.buffer_from_host_ptr(dev, shmem_ptr, size, max_tensor_size); + + vn_encode_ggml_buffer(enc, buffer); + vn_encode_ggml_buffer_type(enc, buffer->buft); + + if (buffer) { + track_backend_buffer(buffer); + } + + return 0; +} diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched.h b/ggml/src/ggml-remotingbackend/backend-dispatched.h index d8d86fc3f67f5..3c164b532ac95 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched.h +++ b/ggml/src/ggml-remotingbackend/backend-dispatched.h @@ -27,6 +27,7 @@ uint32_t backend_device_get_memory(struct vn_cs_encoder *enc, struct vn_cs_decod uint32_t backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); uint32_t backend_device_get_buffer_type(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); uint32_t backend_device_get_props(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); +uint32_t backend_device_buffer_from_ptr(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); /* buffer-type */ uint32_t backend_buffer_type_get_name(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx); @@ -57,6 +58,7 @@ static inline const char *backend_dispatch_command_name(ApirBackendCommandType t case APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP: return "backend_device_supports_op"; case APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE: return "backend_get_buffer_type"; case APIR_COMMAND_TYPE_DEVICE_GET_PROPS: return "backend_get_props"; + case APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR: return "backend_buffer_from_ptr"; /* buffer-type */ case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME: return "backend_buffer_type_get_name"; @@ -88,6 +90,7 @@ static const backend_dispatch_t 
apir_backend_dispatch_table[APIR_BACKEND_DISPATC [APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP] = backend_device_supports_op, [APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE] = backend_device_get_buffer_type, [APIR_COMMAND_TYPE_DEVICE_GET_PROPS] = backend_device_get_props, + [APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR] = backend_device_buffer_from_ptr, /* buffer-type */ [APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME] = backend_buffer_type_get_name, diff --git a/ggml/src/ggml-remotingbackend/backend.cpp b/ggml/src/ggml-remotingbackend/backend.cpp index fa2344ea8f676..f5a10c234644a 100644 --- a/ggml/src/ggml-remotingbackend/backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend.cpp @@ -78,14 +78,6 @@ extern "C" { return APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS; } - INFO("#"); -#if APIR_ALLOC_FROM_HOST_PTR - INFO("# USING ALLOC_FROM_HOST_PTR"); -#else - INFO("# USING ALLOC_BUFFER"); -#endif - INFO("#"); - return backend_dispatch_initialize(ggml_backend_reg_fct, ggml_backend_init_fct); } diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index ad1747b17d182..efd0803a929d5 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -14,17 +14,14 @@ #define APIR_BACKEND_FORWARD_INDEX_INVALID 6 -#define APIR_ALLOC_FROM_HOST_PTR 0 - typedef uintptr_t apir_buffer_type_host_handle_t; typedef uintptr_t apir_buffer_host_handle_t; typedef struct { apir_buffer_host_handle_t host_handle; -#if APIR_ALLOC_FROM_HOST_PTR + struct vn_renderer_shmem *shmem; apir_buffer_type_host_handle_t buft_host_handle; -#endif } apir_buffer_context_t; typedef uint32_t (*apir_backend_initialize_t)(void); @@ -49,26 +46,27 @@ typedef enum ApirBackendCommandType { APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP = 5, APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE = 6, APIR_COMMAND_TYPE_DEVICE_GET_PROPS = 7, + APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR = 8, /* buffer-type */ - APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME = 8, - APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT = 9, - APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE = 10, - APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST = 11, - APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER = 12, + APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME = 9, + APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT = 10, + APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE = 11, + APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST = 12, + APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER = 13, /* buffer */ - APIR_COMMAND_TYPE_BUFFER_GET_BASE = 13, - APIR_COMMAND_TYPE_BUFFER_SET_TENSOR = 14, - APIR_COMMAND_TYPE_BUFFER_GET_TENSOR = 15, - APIR_COMMAND_TYPE_BUFFER_CLEAR = 16, - APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER = 17, + APIR_COMMAND_TYPE_BUFFER_GET_BASE = 14, + APIR_COMMAND_TYPE_BUFFER_SET_TENSOR = 15, + APIR_COMMAND_TYPE_BUFFER_GET_TENSOR = 16, + APIR_COMMAND_TYPE_BUFFER_CLEAR = 17, + APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER = 18, /* backend */ - APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE = 18, + APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE = 19, // last command_type index + 1 - APIR_BACKEND_DISPATCH_TABLE_COUNT = 19, + APIR_BACKEND_DISPATCH_TABLE_COUNT = 20, } ApirBackendCommandType; diff --git a/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp b/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp index 58a142ae93d5b..30ae511aa95e8 100644 --- a/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp +++ b/ggml/src/ggml-remotingbackend/venus_cs_ggml-rpc-back.cpp @@ -43,13 +43,18 @@ deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor) { result->buffer 
= nullptr; } + uint64_t tensor_data = tensor->data; if (result->buffer) { // require that the tensor data does not go beyond the buffer end uint64_t tensor_size = (uint64_t) ggml_nbytes(result); uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer); uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer); - GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow - GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size); + + // tensor->data is serialized as an offset to the buffer base address + tensor_data += buffer_start; + + GGML_ASSERT(tensor_data + tensor_size >= tensor_data); // check for overflow + GGML_ASSERT(tensor_data >= buffer_start && tensor_data + tensor_size <= buffer_start + buffer_size); } result->op = (ggml_op) tensor->op; @@ -57,7 +62,7 @@ deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor) { result->op_params[i] = tensor->op_params[i]; } result->flags = tensor->flags; - result->data = reinterpret_cast(tensor->data); + result->data = reinterpret_cast(tensor_data); ggml_set_name(result, tensor->name); return result; } diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp index 880f982c6c961..5e67e82874e4d 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp @@ -16,8 +16,13 @@ ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, context->gpu = gpu; context->apir_context = apir_buffer_type_alloc_buffer(gpu, buft, size); context->base = NULL; + context->is_from_ptr = false; + ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) context, size); + INFO("##"); + INFO("## %s(%llx) --> %p", __func__, size, buffer); + INFO("##\n"); return buffer; } @@ -65,4 +70,13 @@ const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = { /* .is_host = */ NULL, }; +const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_from_ptr_type_interface = { + /* .get_name = */ ggml_backend_remoting_buffer_type_get_name, + /* .alloc_buffer = */ NULL, + /* .get_alignment = */ ggml_backend_remoting_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_remoting_buffer_type_get_max_size, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .is_host = */ NULL, +}; + /****************************************************************************************/ diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index d056249bdf681..67dd06843495d 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -6,6 +6,9 @@ struct timer_data get_tensor_timer = {0, 0, 0, "get_tensor"}; struct timer_data set_tensor_timer = {0, 0, 0, "set_tensor"}; +struct timer_data get_tensor_from_ptr_timer = {0, 0, 0, "get_tensor_from_ptr"}; +struct timer_data set_tensor_from_ptr_timer = {0, 0, 0, "set_tensor_from_ptr"}; + static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) { IMPLEMENTED_ONCE; @@ -68,6 +71,31 @@ static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer stop_timer(&get_tensor_timer); } +static void ggml_backend_remoting_buffer_set_tensor_from_ptr(ggml_backend_buffer_t buffer, ggml_tensor * tensor, 
const void * data, size_t offset, size_t size) { + IMPLEMENTED_ONCE; + + start_timer(&set_tensor_from_ptr_timer); + + UNUSED(buffer); + + memcpy((char *)tensor->data + offset, data, size); + + stop_timer(&set_tensor_from_ptr_timer); + + return; +} + +static void ggml_backend_remoting_buffer_get_tensor_from_ptr(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + IMPLEMENTED_ONCE; + + UNUSED(buffer); + + start_timer(&get_tensor_from_ptr_timer); + + memcpy(data, (const char *)tensor->data + offset, size); + + stop_timer(&get_tensor_from_ptr_timer); +} static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { NOT_IMPLEMENTED; @@ -99,6 +127,10 @@ static void ggml_backend_remoting_buffer_free_buffer(ggml_backend_buffer_t buffe struct virtgpu *gpu = BUFFER_TO_GPU(buffer); apir_buffer_free_buffer(gpu, BUFFER_TO_APIR_CONTEXT(buffer)); + + struct ggml_backend_remoting_buffer_context *context = BUFFER_TO_GGML_CONTEXT(buffer); + free(context); + buffer->context = NULL; } const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { @@ -112,3 +144,15 @@ const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { /* .clear = */ ggml_backend_remoting_buffer_clear, /* .reset = */ NULL, }; + +const ggml_backend_buffer_i ggml_backend_remoting_buffer_from_ptr_interface = { + /* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer, + /* .get_base = */ ggml_backend_remoting_buffer_get_base, + /* .init_tensor = */ NULL, + /* .memset_tensor = */ ggml_backend_remoting_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor_from_ptr, + /* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor_from_ptr, + /* .cpy_tensor = */ ggml_backend_remoting_buffer_cpy_tensor, + /* .clear = */ ggml_backend_remoting_buffer_clear, + /* .reset = */ NULL, +}; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index 092c05b9e43f3..bc40f9dbb2238 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -103,7 +103,7 @@ ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, struct ggml_backe // the API Remoting frontend props->caps.async = false; props->caps.host_buffer = false; - props->caps.buffer_from_host_ptr = false; + props->caps.buffer_from_host_ptr = true; props->caps.events = false; #endif @@ -129,29 +129,59 @@ ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { return &buft; } -static ggml_backend_buffer_t ggml_backend_remoting_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { - UNUSED(dev); - UNUSED(ptr); - UNUSED(size); - UNUSED(max_tensor_size); +static ggml_backend_buffer_type_t +ggml_backend_remoting_device_get_buffer_from_ptr_type(ggml_backend_dev_t dev) { + IMPLEMENTED_ONCE; - NOT_IMPLEMENTED; - STOP_HERE; + struct virtgpu *gpu = DEV_TO_GPU(dev); - return nullptr; + apir_buffer_type_host_handle_t ctx = apir_device_get_buffer_type(gpu); + + static struct ggml_backend_buffer_type buft { + /* .iface = */ ggml_backend_remoting_buffer_from_ptr_type_interface, + /* .device = */ dev, + /* .context = */ (void *) ctx, + }; + + return &buft; } -static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_host_buffer_type(ggml_backend_dev_t dev) { +static ggml_backend_buffer_t 
+ggml_backend_remoting_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { - static struct ggml_backend_buffer_type host_bufft = { - /* .iface = */ ggml_backend_remoting_host_buffer_type_interface, - /* .device = */ dev, - /* .context = */ nullptr, - }; + struct virtgpu *gpu = DEV_TO_GPU(dev); - //IMPLEMENTED; + struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) malloc(sizeof(*context)); + if (!context) { + FATAL("Couldn't allocate the buffer context ..."); + } + + UNUSED(ptr); + context->gpu = gpu; + context->apir_context = apir_device_buffer_from_ptr(gpu, size, max_tensor_size); + context->base = ptr; + context->is_from_ptr = true; + + ggml_backend_buffer_t buffer = ggml_backend_buffer_init(ggml_backend_remoting_device_get_buffer_from_ptr_type(dev), ggml_backend_remoting_buffer_from_ptr_interface, (void *) context, size); + + INFO("#"); + INFO("# %s(%p, %llx) --> %p", __func__, ptr, size, buffer); + INFO("#\n"); + + return buffer; +} + +static ggml_backend_buffer_type_t +ggml_backend_remoting_device_get_host_buffer_type(ggml_backend_dev_t dev) { + IMPLEMENTED_ONCE; + + static struct ggml_backend_buffer_type host_bufft = { + /* .iface = */ ggml_backend_remoting_host_buffer_type_interface, + /* .device = */ dev, + /* .context = */ nullptr, + }; - return &host_bufft; + return &host_bufft; } const struct ggml_backend_device_i ggml_backend_remoting_device_interface = { @@ -163,7 +193,7 @@ const struct ggml_backend_device_i ggml_backend_remoting_device_interface = { /* .init_backend = */ ggml_backend_remoting_device_init, /* .get_buffer_type = */ ggml_backend_remoting_device_get_buffer_type, /* .get_host_buffer_type = */ NULL, - /* .buffer_from_host_ptr = */ NULL, + /* .buffer_from_host_ptr = */ ggml_backend_remoting_device_buffer_from_ptr, /* .supports_op = */ ggml_backend_remoting_device_supports_op, /* .supports_buft = */ ggml_backend_remoting_device_supports_buft, /* .offload_op = */ ggml_backend_remoting_device_offload_op, diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp index e9b22071af224..d0132370d9f91 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-reg.cpp @@ -139,14 +139,6 @@ ggml_backend_reg_t ggml_backend_remoting_frontend_reg() { INFO("ggml_backend_remoting_frontend_reg() hello :wave:"); - INFO("#"); -#if APIR_ALLOC_FROM_HOST_PTR - INFO("# USING ALLOC_FROM_HOST_PTR"); -#else - INFO("# USING ALLOC_BUFFER"); -#endif - INFO("#"); - ggml_backend_remoting_reg_init_devices(®); int cr = atexit(showTime); diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index 05797775cf081..4da3b9432f1f8 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -13,6 +13,9 @@ #define DEV_TO_GPU(name) \ ((struct ggml_backend_remoting_device_context *) (name)->context)->gpu +#define BUFFER_TO_GGML_CONTEXT(name) \ + ((struct ggml_backend_remoting_buffer_context *) (name)->context) + #define BUFFER_TO_APIR_CONTEXT(name) \ &((struct ggml_backend_remoting_buffer_context *) (name)->context)->apir_context @@ -90,12 +93,16 @@ struct ggml_backend_remoting_buffer_context { struct virtgpu *gpu; void *base; + + bool is_from_ptr; }; extern const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface; extern const struct ggml_backend_device_i 
ggml_backend_remoting_device_interface; extern const ggml_backend_buffer_type_i ggml_backend_remoting_host_buffer_type_interface; extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface; +extern const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_from_ptr_type_interface; +extern const ggml_backend_buffer_i ggml_backend_remoting_buffer_from_ptr_interface; ggml_backend_dev_t ggml_backend_remoting_get_device(size_t device); ggml_backend_buffer_type_t ggml_backend_remoting_host_buffer_type(); diff --git a/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp b/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp index bc4b96b84f365..67b8c37748aa8 100644 --- a/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp +++ b/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp @@ -40,6 +40,8 @@ serialize_tensor(const ggml_tensor * tensor) { result.view_src = reinterpret_cast(tensor->view_src); result.view_offs = tensor->view_offs; result.data = reinterpret_cast(tensor->data); + // tensor->data is serialized as an offset to the buffer base address + result.data -= reinterpret_cast(BUFFER_TO_GGML_CONTEXT(tensor->buffer)->base); snprintf(result.name, GGML_MAX_NAME, "%s", tensor->name); return result; } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp index 4f7aac1360124..e991c0bef324d 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer-type.cpp @@ -100,29 +100,14 @@ apir_buffer_type_alloc_buffer(struct virtgpu *gpu, ggml_backend_buffer_type_t bu REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER); -#if APIR_ALLOC_FROM_HOST_PTR - UNUSED(buft); - - buffer_context.shmem = virtgpu_shmem_create(gpu, size); - //WARNING("%s: 0x%lx | %dkB | %dMB", __func__, size, (int)size/1024, (int)size/1024/1024); - if (!buffer_context.shmem) { - FATAL("Couldn't allocate the guest-host shared buffer :/"); - } - - vn_encode_virtgpu_shmem_res_id(encoder, buffer_context.shmem->res_id); -#else vn_encode_ggml_buffer_type(encoder, buft); -#endif + vn_encode_size_t(encoder, &size); REMOTE_CALL(gpu, encoder, decoder); -#if APIR_ALLOC_FROM_HOST_PTR - buffer_context.buft_host_handle = vn_decode_apir_buffer_type_host_handle(decoder); -#endif - vn_decode_apir_buffer_host_handle_t(decoder, &buffer_context.host_handle); - + REMOTE_CALL_FINISH(gpu, encoder, decoder); return buffer_context; diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp index dd3f7a5cc0bc5..04041ab5feb37 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-buffer.cpp @@ -140,8 +140,6 @@ apir_buffer_free_buffer(struct virtgpu *gpu, apir_buffer_context_t *buffer_conte vn_encode_apir_buffer_host_handle_t(encoder, &buffer_context->host_handle); REMOTE_CALL(gpu, encoder, decoder); -#if APIR_ALLOC_FROM_HOST_PTR - virtgpu_shmem_destroy(gpu, buffer_context->shmem->shmem); -#endif + REMOTE_CALL_FINISH(gpu, encoder, decoder); } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp index ffc6febf4cab0..9a2b6d7c501b4 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp @@ -200,3 +200,40 @@ apir_device_get_props(struct virtgpu 
*gpu, return; } + +apir_buffer_context_t +apir_device_buffer_from_ptr(struct virtgpu *gpu, + size_t size, + size_t max_tensor_size) { + struct vn_cs_encoder *encoder; + struct vn_cs_decoder *decoder; + apir_buffer_context_t buffer_context; + + BEING_IMPLEMENTED; + + REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR); + + /* *** */ + + buffer_context.shmem = virtgpu_shmem_create(gpu, size); + if (!buffer_context.shmem) { + FATAL("Couldn't allocate the guest-host shared buffer :/"); + } + + vn_encode_virtgpu_shmem_res_id(encoder, buffer_context.shmem->res_id); + + vn_encode_size_t(encoder, &size); + vn_encode_size_t(encoder, &max_tensor_size); + + REMOTE_CALL(gpu, encoder, decoder); + + vn_decode_apir_buffer_host_handle_t(decoder, &buffer_context.host_handle); + + buffer_context.buft_host_handle = vn_decode_apir_buffer_type_host_handle(decoder); + + /* *** */ + + REMOTE_CALL_FINISH(gpu, encoder, decoder); + + return buffer_context; +} diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index 239295aa3ac78..bbe94f14300ef 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -19,7 +19,9 @@ void apir_device_get_props(struct virtgpu *gpu, bool *host_buffer, bool *buffer_from_host_ptr, bool *events); - +apir_buffer_context_t apir_device_buffer_from_ptr(struct virtgpu *gpu, + size_t size, + size_t max_tensor_size); /* buffer-type */ const char *apir_buffer_type_get_name(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); size_t apir_buffer_type_get_alignment(struct virtgpu *gpu, ggml_backend_buffer_type_t buft); From b20672bfd6d59168e0d0935e87b01868adee626d Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 2 Jun 2025 14:53:25 +0200 Subject: [PATCH 098/117] remoting: remove from_ptr code --- .../ggml-backend-buffer-type.cpp | 11 ---- .../ggml-backend-buffer.cpp | 41 --------------- .../ggml-backend-device.cpp | 52 ++----------------- 3 files changed, 5 insertions(+), 99 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp index 5e67e82874e4d..1cef55b620811 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp @@ -16,8 +16,6 @@ ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, context->gpu = gpu; context->apir_context = apir_buffer_type_alloc_buffer(gpu, buft, size); context->base = NULL; - context->is_from_ptr = false; - ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) context, size); INFO("##"); @@ -70,13 +68,4 @@ const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = { /* .is_host = */ NULL, }; -const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_from_ptr_type_interface = { - /* .get_name = */ ggml_backend_remoting_buffer_type_get_name, - /* .alloc_buffer = */ NULL, - /* .get_alignment = */ ggml_backend_remoting_buffer_type_get_alignment, - /* .get_max_size = */ ggml_backend_remoting_buffer_type_get_max_size, - /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .is_host = */ NULL, -}; - /****************************************************************************************/ diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index 
67dd06843495d..d35d5c9b66cd7 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -6,9 +6,6 @@ struct timer_data get_tensor_timer = {0, 0, 0, "get_tensor"}; struct timer_data set_tensor_timer = {0, 0, 0, "set_tensor"}; -struct timer_data get_tensor_from_ptr_timer = {0, 0, 0, "get_tensor_from_ptr"}; -struct timer_data set_tensor_from_ptr_timer = {0, 0, 0, "set_tensor_from_ptr"}; - static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) { IMPLEMENTED_ONCE; @@ -71,32 +68,6 @@ static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer stop_timer(&get_tensor_timer); } -static void ggml_backend_remoting_buffer_set_tensor_from_ptr(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - IMPLEMENTED_ONCE; - - start_timer(&set_tensor_from_ptr_timer); - - UNUSED(buffer); - - memcpy((char *)tensor->data + offset, data, size); - - stop_timer(&set_tensor_from_ptr_timer); - - return; -} - -static void ggml_backend_remoting_buffer_get_tensor_from_ptr(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { - IMPLEMENTED_ONCE; - - UNUSED(buffer); - - start_timer(&get_tensor_from_ptr_timer); - - memcpy(data, (const char *)tensor->data + offset, size); - - stop_timer(&get_tensor_from_ptr_timer); -} - static bool ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { NOT_IMPLEMENTED; @@ -144,15 +115,3 @@ const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { /* .clear = */ ggml_backend_remoting_buffer_clear, /* .reset = */ NULL, }; - -const ggml_backend_buffer_i ggml_backend_remoting_buffer_from_ptr_interface = { - /* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer, - /* .get_base = */ ggml_backend_remoting_buffer_get_base, - /* .init_tensor = */ NULL, - /* .memset_tensor = */ ggml_backend_remoting_buffer_memset_tensor, - /* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor_from_ptr, - /* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor_from_ptr, - /* .cpy_tensor = */ ggml_backend_remoting_buffer_cpy_tensor, - /* .clear = */ ggml_backend_remoting_buffer_clear, - /* .reset = */ NULL, -}; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index bc40f9dbb2238..190d0d77d6551 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -102,12 +102,12 @@ ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, struct ggml_backe // ignore the actual backend answers and set it as we provide it in // the API Remoting frontend props->caps.async = false; - props->caps.host_buffer = false; - props->caps.buffer_from_host_ptr = true; + props->caps.host_buffer = true; + props->caps.buffer_from_host_ptr = false; props->caps.events = false; #endif - INFO("%s: async=%d, host_buffer=%d!, buffer_from_host_ptr=%d!, events=%d", + INFO("%s: async=%d, host_buffer=%d, buffer_from_host_ptr=%d, events=%d", __func__, props->caps.async, props->caps.host_buffer, props->caps.buffer_from_host_ptr, props->caps.events); } @@ -129,48 +129,6 @@ ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { return &buft; } -static ggml_backend_buffer_type_t -ggml_backend_remoting_device_get_buffer_from_ptr_type(ggml_backend_dev_t dev) { - IMPLEMENTED_ONCE; - - struct 
virtgpu *gpu = DEV_TO_GPU(dev); - - apir_buffer_type_host_handle_t ctx = apir_device_get_buffer_type(gpu); - - static struct ggml_backend_buffer_type buft { - /* .iface = */ ggml_backend_remoting_buffer_from_ptr_type_interface, - /* .device = */ dev, - /* .context = */ (void *) ctx, - }; - - return &buft; -} - -static ggml_backend_buffer_t -ggml_backend_remoting_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { - - struct virtgpu *gpu = DEV_TO_GPU(dev); - - struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) malloc(sizeof(*context)); - if (!context) { - FATAL("Couldn't allocate the buffer context ..."); - } - - UNUSED(ptr); - context->gpu = gpu; - context->apir_context = apir_device_buffer_from_ptr(gpu, size, max_tensor_size); - context->base = ptr; - context->is_from_ptr = true; - - ggml_backend_buffer_t buffer = ggml_backend_buffer_init(ggml_backend_remoting_device_get_buffer_from_ptr_type(dev), ggml_backend_remoting_buffer_from_ptr_interface, (void *) context, size); - - INFO("#"); - INFO("# %s(%p, %llx) --> %p", __func__, ptr, size, buffer); - INFO("#\n"); - - return buffer; -} - static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_host_buffer_type(ggml_backend_dev_t dev) { IMPLEMENTED_ONCE; @@ -192,8 +150,8 @@ const struct ggml_backend_device_i ggml_backend_remoting_device_interface = { /* .get_props = */ ggml_backend_remoting_device_get_props, /* .init_backend = */ ggml_backend_remoting_device_init, /* .get_buffer_type = */ ggml_backend_remoting_device_get_buffer_type, - /* .get_host_buffer_type = */ NULL, - /* .buffer_from_host_ptr = */ ggml_backend_remoting_device_buffer_from_ptr, + /* .get_host_buffer_type = */ ggml_backend_remoting_device_get_host_buffer_type, + /* .buffer_from_host_ptr = */ NULL, /* .supports_op = */ ggml_backend_remoting_device_supports_op, /* .supports_buft = */ ggml_backend_remoting_device_supports_buft, /* .offload_op = */ ggml_backend_remoting_device_offload_op, From f0127dc942e23368e8d746fc6fcb2931a89c7263 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 2 Jun 2025 15:59:21 +0200 Subject: [PATCH 099/117] remoting: try host_pointer --- .../backend-dispatched-device.cpp | 1 + .../ggml-backend-buffer-type.cpp | 4 ++- .../ggml-backend-device.cpp | 4 +-- .../ggml-backend-host-buffer-type.cpp | 26 +++++++++---------- .../src/ggml-remotingfrontend/ggml-remoting.h | 1 + .../virtgpu-forward-device.cpp | 2 +- 6 files changed, 20 insertions(+), 18 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp index 5bf0788ccf864..13d32194b1668 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp @@ -131,6 +131,7 @@ backend_device_buffer_from_ptr(struct vn_cs_encoder *enc, struct vn_cs_decoder * ggml_backend_buffer_t buffer; buffer = dev->iface.buffer_from_host_ptr(dev, shmem_ptr, size, max_tensor_size); + INFO("HOST HANDLE is %p (size=%llx)", (void*)buffer, size); vn_encode_ggml_buffer(enc, buffer); vn_encode_ggml_buffer_type(enc, buffer->buft); diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp index 1cef55b620811..d462b23a0ad85 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp @@ -16,10 +16,12 @@ 
ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, context->gpu = gpu; context->apir_context = apir_buffer_type_alloc_buffer(gpu, buft, size); context->base = NULL; + context->is_host_buffer = false; + context->is_from_ptr = false; ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) context, size); INFO("##"); - INFO("## %s(%llx) --> %p", __func__, size, buffer); + INFO("## %s(%llx) --> %p <---------------", __func__, size, buffer); INFO("##\n"); return buffer; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index 190d0d77d6551..07c65276146f1 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -102,8 +102,8 @@ ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, struct ggml_backe // ignore the actual backend answers and set it as we provide it in // the API Remoting frontend props->caps.async = false; - props->caps.host_buffer = true; - props->caps.buffer_from_host_ptr = false; + props->caps.host_buffer = false; + props->caps.buffer_from_host_ptr = true; props->caps.events = false; #endif diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp index 20159faf3cae9..c09c80d6472f5 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-host-buffer-type.cpp @@ -42,25 +42,23 @@ ggml_backend_remoting_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { static ggml_backend_buffer_t ggml_backend_remoting_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { IMPLEMENTED; - struct virtgpu *gpu = BUFT_TO_GPU(buft); - - struct ggml_backend_remoting_device_context *device_ctx = GET_DEVICE_CONTEXT(); - size += 32; // Behave like the CPU buffer type (dixit ggml-vulkan) - - struct vn_renderer_shmem *shmem = virtgpu_shmem_create(gpu, size); + struct virtgpu *gpu = BUFT_TO_GPU(buft); - if (!shmem) { - FATAL("Couldn't allocate the guest-host shared host buffer :/"); + struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) malloc(sizeof(*context)); + if (!context) { + FATAL("Couldn't allocate the buffer context ..."); } - void *ptr = shmem->mmap_ptr; - - device_ctx->shared_memory.push_back(std::make_tuple(ptr, size, shmem)); + context->gpu = gpu; + context->apir_context = apir_device_buffer_from_ptr(gpu, size, size); + context->base = context->apir_context.shmem->mmap_ptr; + context->is_host_buffer = true; - ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); - buffer->buft = buft; - buffer->iface.free_buffer = ggml_backend_remoting_host_buffer_free_buffer; + ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) context, size); + INFO("##"); + INFO("## %s(%llx) --> %p <======================", __func__, size, buffer); + INFO("##\n"); return buffer; } diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index 4da3b9432f1f8..18b880c740564 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -94,6 +94,7 @@ struct ggml_backend_remoting_buffer_context { void *base; + bool is_host_buffer; bool is_from_ptr; }; diff --git 
a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp index 9a2b6d7c501b4..0d74b55c2083c 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp @@ -228,7 +228,7 @@ apir_device_buffer_from_ptr(struct virtgpu *gpu, REMOTE_CALL(gpu, encoder, decoder); vn_decode_apir_buffer_host_handle_t(decoder, &buffer_context.host_handle); - + INFO("HOST HANDLE is %p (size=%llx)", (void*)buffer_context.host_handle, size); buffer_context.buft_host_handle = vn_decode_apir_buffer_type_host_handle(decoder); /* *** */ From 11e2ebaf641b7870633af1ee246e3f6dbb5ec5a0 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 2 Jun 2025 16:07:11 +0200 Subject: [PATCH 100/117] remoting: try from_host_ptr --- .../ggml-backend-buffer-type.cpp | 9 ++++ .../ggml-backend-buffer.cpp | 41 ++++++++++++++++ .../ggml-backend-device.cpp | 49 +++++++++++++++++-- 3 files changed, 95 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp index d462b23a0ad85..86ee8a8bf0f3b 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp @@ -70,4 +70,13 @@ const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = { /* .is_host = */ NULL, }; +const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_from_ptr_type_interface = { + /* .get_name = */ ggml_backend_remoting_buffer_type_get_name, + /* .alloc_buffer = */ NULL, + /* .get_alignment = */ ggml_backend_remoting_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_remoting_buffer_type_get_max_size, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .is_host = */ NULL, +}; + /****************************************************************************************/ diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index d35d5c9b66cd7..67dd06843495d 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -6,6 +6,9 @@ struct timer_data get_tensor_timer = {0, 0, 0, "get_tensor"}; struct timer_data set_tensor_timer = {0, 0, 0, "set_tensor"}; +struct timer_data get_tensor_from_ptr_timer = {0, 0, 0, "get_tensor_from_ptr"}; +struct timer_data set_tensor_from_ptr_timer = {0, 0, 0, "set_tensor_from_ptr"}; + static void * ggml_backend_remoting_buffer_get_base(ggml_backend_buffer_t buffer) { IMPLEMENTED_ONCE; @@ -68,6 +71,32 @@ static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer stop_timer(&get_tensor_timer); } +static void ggml_backend_remoting_buffer_set_tensor_from_ptr(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + IMPLEMENTED_ONCE; + + start_timer(&set_tensor_from_ptr_timer); + + UNUSED(buffer); + + memcpy((char *)tensor->data + offset, data, size); + + stop_timer(&set_tensor_from_ptr_timer); + + return; +} + +static void ggml_backend_remoting_buffer_get_tensor_from_ptr(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + IMPLEMENTED_ONCE; + + UNUSED(buffer); + + start_timer(&get_tensor_from_ptr_timer); + + memcpy(data, (const char *)tensor->data + offset, size); + + stop_timer(&get_tensor_from_ptr_timer); +} + static bool 
ggml_backend_remoting_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { NOT_IMPLEMENTED; @@ -115,3 +144,15 @@ const ggml_backend_buffer_i ggml_backend_remoting_buffer_interface = { /* .clear = */ ggml_backend_remoting_buffer_clear, /* .reset = */ NULL, }; + +const ggml_backend_buffer_i ggml_backend_remoting_buffer_from_ptr_interface = { + /* .free_buffer = */ ggml_backend_remoting_buffer_free_buffer, + /* .get_base = */ ggml_backend_remoting_buffer_get_base, + /* .init_tensor = */ NULL, + /* .memset_tensor = */ ggml_backend_remoting_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_remoting_buffer_set_tensor_from_ptr, + /* .get_tensor = */ ggml_backend_remoting_buffer_get_tensor_from_ptr, + /* .cpy_tensor = */ ggml_backend_remoting_buffer_cpy_tensor, + /* .clear = */ ggml_backend_remoting_buffer_clear, + /* .reset = */ NULL, +}; diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index 07c65276146f1..dfe1e992c9dac 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -103,11 +103,11 @@ ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, struct ggml_backe // the API Remoting frontend props->caps.async = false; props->caps.host_buffer = false; - props->caps.buffer_from_host_ptr = true; + props->caps.buffer_from_host_ptr = false; props->caps.events = false; #endif - INFO("%s: async=%d, host_buffer=%d, buffer_from_host_ptr=%d, events=%d", + INFO("%s: async=%d, host_buffer=%d!, buffer_from_host_ptr=%d!, events=%d", __func__, props->caps.async, props->caps.host_buffer, props->caps.buffer_from_host_ptr, props->caps.events); } @@ -129,6 +129,47 @@ ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { return &buft; } +static ggml_backend_buffer_type_t +ggml_backend_remoting_device_get_buffer_from_ptr_type(ggml_backend_dev_t dev) { + IMPLEMENTED_ONCE; + + struct virtgpu *gpu = DEV_TO_GPU(dev); + + apir_buffer_type_host_handle_t ctx = apir_device_get_buffer_type(gpu); + + static struct ggml_backend_buffer_type buft { + /* .iface = */ ggml_backend_remoting_buffer_from_ptr_type_interface, + /* .device = */ dev, + /* .context = */ (void *) ctx, + }; + + return &buft; +} + +static ggml_backend_buffer_t +ggml_backend_remoting_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { + + struct virtgpu *gpu = DEV_TO_GPU(dev); + + struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) malloc(sizeof(*context)); + if (!context) { + FATAL("Couldn't allocate the buffer context ..."); + } + + context->gpu = gpu; + context->apir_context = apir_device_buffer_from_ptr(gpu, size, max_tensor_size); + context->base = ptr; + context->is_from_ptr = true; + + ggml_backend_buffer_t buffer = ggml_backend_buffer_init(ggml_backend_remoting_device_get_buffer_from_ptr_type(dev), ggml_backend_remoting_buffer_from_ptr_interface, (void *) context, size); + + INFO("#"); + INFO("# %s(%p, %llx) --> %p", __func__, ptr, size, buffer); + INFO("#\n"); + + return buffer; +} + static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_host_buffer_type(ggml_backend_dev_t dev) { IMPLEMENTED_ONCE; @@ -150,8 +191,8 @@ const struct ggml_backend_device_i ggml_backend_remoting_device_interface = { /* .get_props = */ ggml_backend_remoting_device_get_props, /* .init_backend = */ ggml_backend_remoting_device_init, /* 
.get_buffer_type = */ ggml_backend_remoting_device_get_buffer_type, - /* .get_host_buffer_type = */ ggml_backend_remoting_device_get_host_buffer_type, - /* .buffer_from_host_ptr = */ NULL, + /* .get_host_buffer_type = */ NULL, + /* .buffer_from_host_ptr = */ ggml_backend_remoting_device_buffer_from_ptr, /* .supports_op = */ ggml_backend_remoting_device_supports_op, /* .supports_buft = */ ggml_backend_remoting_device_supports_buft, /* .offload_op = */ ggml_backend_remoting_device_offload_op, From efe68cace1c9b3671ca74aa9dcd4e78c028452d6 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 2 Jun 2025 16:29:47 +0200 Subject: [PATCH 101/117] remoting: make alloc_memory + alloc_from_host_ptr work :) --- .../backend-dispatched-device.cpp | 1 - .../ggml-backend-buffer-type.cpp | 15 ++++++++++++--- .../ggml-remotingfrontend/ggml-backend-buffer.cpp | 15 ++++++++++++--- .../virtgpu-forward-device.cpp | 3 --- 4 files changed, 24 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp index 13d32194b1668..5bf0788ccf864 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp @@ -131,7 +131,6 @@ backend_device_buffer_from_ptr(struct vn_cs_encoder *enc, struct vn_cs_decoder * ggml_backend_buffer_t buffer; buffer = dev->iface.buffer_from_host_ptr(dev, shmem_ptr, size, max_tensor_size); - INFO("HOST HANDLE is %p (size=%llx)", (void*)buffer, size); vn_encode_ggml_buffer(enc, buffer); vn_encode_ggml_buffer_type(enc, buffer->buft); diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp index 86ee8a8bf0f3b..70fc829c24fa4 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp @@ -14,10 +14,19 @@ ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, } context->gpu = gpu; - context->apir_context = apir_buffer_type_alloc_buffer(gpu, buft, size); - context->base = NULL; + + const int USE_FROM_PTR = true; + + if (USE_FROM_PTR) { + context->apir_context = apir_device_buffer_from_ptr(gpu, size, size); + context->base = context->apir_context.shmem->mmap_ptr; + context->is_from_ptr = true; + } else { + context->apir_context = apir_buffer_type_alloc_buffer(gpu, buft, size); + context->is_from_ptr = false; + context->base = NULL; + } context->is_host_buffer = false; - context->is_from_ptr = false; ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_remoting_buffer_interface, (void *) context, size); INFO("##"); diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp index 67dd06843495d..e720efcf47c69 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer.cpp @@ -52,7 +52,12 @@ static void ggml_backend_remoting_buffer_set_tensor(ggml_backend_buffer_t buffer } INFO("\n"); #endif - apir_buffer_set_tensor(gpu, BUFFER_TO_APIR_CONTEXT(buffer), tensor, data, offset, size); + struct ggml_backend_remoting_buffer_context *context = BUFFER_TO_GGML_CONTEXT(buffer); + if (context->is_from_ptr) { + memcpy((char *)tensor->data + offset, data, size); + } else { + apir_buffer_set_tensor(gpu, BUFFER_TO_APIR_CONTEXT(buffer), tensor, data, offset, size); + } stop_timer(&set_tensor_timer); @@ -65,8 +70,12 
@@ static void ggml_backend_remoting_buffer_get_tensor(ggml_backend_buffer_t buffer start_timer(&get_tensor_timer); struct virtgpu *gpu = BUFFER_TO_GPU(buffer); - - apir_buffer_get_tensor(gpu, BUFFER_TO_APIR_CONTEXT(buffer), tensor, data, offset, size); + struct ggml_backend_remoting_buffer_context *context = BUFFER_TO_GGML_CONTEXT(buffer); + if (context->is_from_ptr) { + memcpy(data, (const char *)tensor->data + offset, size); + } else { + apir_buffer_get_tensor(gpu, BUFFER_TO_APIR_CONTEXT(buffer), tensor, data, offset, size); + } stop_timer(&get_tensor_timer); } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp index 0d74b55c2083c..06ad6d445de4c 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp @@ -209,8 +209,6 @@ apir_device_buffer_from_ptr(struct virtgpu *gpu, struct vn_cs_decoder *decoder; apir_buffer_context_t buffer_context; - BEING_IMPLEMENTED; - REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR); /* *** */ @@ -228,7 +226,6 @@ apir_device_buffer_from_ptr(struct virtgpu *gpu, REMOTE_CALL(gpu, encoder, decoder); vn_decode_apir_buffer_host_handle_t(decoder, &buffer_context.host_handle); - INFO("HOST HANDLE is %p (size=%llx)", (void*)buffer_context.host_handle, size); buffer_context.buft_host_handle = vn_decode_apir_buffer_type_host_handle(decoder); /* *** */ From 3769bb4171fa53ecba56121b54e8266c0cc879fb Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 11 Jun 2025 09:51:05 +0200 Subject: [PATCH 102/117] build.backend: export SDKROOT to please apple compiler ... --- build.backend.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/build.backend.sh b/build.backend.sh index 863f98e3524a3..dc0b6007e3123 100755 --- a/build.backend.sh +++ b/build.backend.sh @@ -10,6 +10,8 @@ else FLAVOR="" fi +export SDKROOT=$(xcrun --sdk macosx --show-sdk-path) + if [[ "$FLAVOR" == "-prod" ]]; then cat < Date: Wed, 11 Jun 2025 09:52:53 +0200 Subject: [PATCH 103/117] prepare.backend.sh: more flags --- prepare.backend.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/prepare.backend.sh b/prepare.backend.sh index caed8223382e9..8bc5be19e9343 100755 --- a/prepare.backend.sh +++ b/prepare.backend.sh @@ -8,6 +8,8 @@ cmake -S . -B ../build.remoting-backend$FLAVOR \ -DGGML_REMOTINGBACKEND=ON \ -DGGML_NATIVE=OFF \ -DGGML_METAL=ON \ + -DGGML_BACKEND_DL=OFF \ + -DLLAMA_CURL=OFF \ -DGGML_VULKAN=OFF -DVulkan_INCLUDE_DIR=/opt/homebrew/include/ -DVulkan_LIBRARY=/opt/homebrew/lib/libMoltenVK.dylib \ "$@" From 7ef077e303ca3dbc44411067703d38a15dd235e1 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 11 Jun 2025 09:53:32 +0200 Subject: [PATCH 104/117] run.vulkan.sh: more flexible --- run.vulkan.sh | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/run.vulkan.sh b/run.vulkan.sh index 1cd38ea58ef52..a84d4831d478f 100755 --- a/run.vulkan.sh +++ b/run.vulkan.sh @@ -1,10 +1,23 @@ #! 
/bin/bash - -if [[ ${1:-} == "gdb" ]]; then +if [[ ${1:-} == "strace" ]]; then + prefix="strace" +elif [[ ${1:-} == "gdb" ]]; then prefix="gdb --args" else prefix="" fi -export VN_DEBUG=init -$prefix ../build.vulkan/bin/llama-run --ngl 99 --verbose ~/models/llama3.2 "say nothing" +rm -f /usr/lib64/libvulkan_virtio.so + +ICD_DIR=/Users/kevinpouget/.local/share/vulkan/icd.d + +USE_WORK_MESA=1 +if [[ "$USE_WORK_MESA" == 1 ]]; then + export VK_ICD_FILENAMES=$ICD_DIR/virtio_icd.aarch64.json +else + export VK_ICD_FILENAMES=$ICD_DIR/virtio_icd.good.aarch64.json +fi + +# init result vtest wsi no_abort log_ctx_info cache no_sparse no_gpl +export VN_DEBUG=vtest +$prefix ../build.vulkan/bin/llama-run --verbose ~/models/llama3.2 "say nothing" From 6d98572ea3b3b2a2824270b44634e49fd2159cf6 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 11 Jun 2025 09:53:44 +0200 Subject: [PATCH 105/117] run.remoting.sh: more flexible --- run.remoting.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/run.remoting.sh b/run.remoting.sh index 9a2a77f054210..9a8ce4d34c74a 100755 --- a/run.remoting.sh +++ b/run.remoting.sh @@ -1,6 +1,8 @@ #! /bin/bash #clear -if [[ ${1:-} == "gdb" ]]; then +if [[ ${1:-} == "strace" ]]; then + prefix="strace" +elif [[ ${1:-} == "gdb" ]]; then prefix="gdb --args" else prefix="" @@ -41,7 +43,7 @@ if [[ "$bench" == yes ]]; then --n-gpu-layers 99 else PROMPT="say nothing" - PROMPT="tell what's Apple metal API" + #PROMPT="tell what's Apple metal API" $prefix \ $LLAMA_BUILD_DIR/bin/llama-run \ --ngl 99 \ From 50326201f9fba299d6c11d5ea3cad026b9110239 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 11 Jun 2025 09:54:00 +0200 Subject: [PATCH 106/117] prepare.vulkan.sh: more details --- prepare.vulkan.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/prepare.vulkan.sh b/prepare.vulkan.sh index 29d0794ebe4e3..7bacf9b21a9ca 100644 --- a/prepare.vulkan.sh +++ b/prepare.vulkan.sh @@ -1 +1,6 @@ -cmake -S . -B ../build.vulkan -DGGML_VULKAN=ON -DGGML_NATIVE=OFF -DGGML_METAL=OFF +cmake -S . 
\ + -B ../build.vulkan \ + -DGGML_VULKAN=ON \ + -DGGML_NATIVE=OFF \ + -DGGML_METAL=OFF \ + -DCMAKE_BUILD_TYPE=Debug From eeba619c63bfce4093fc66dac093f06e99ed8664 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 11 Jun 2025 09:55:45 +0200 Subject: [PATCH 107/117] ggml: src: ggml-remotingfrontend/virtgpu: don't include virglrenderer_hw.h --- ggml/src/ggml-remotingfrontend/virtgpu.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.h b/ggml/src/ggml-remotingfrontend/virtgpu.h index 26933c8a6eda4..32ad51237037c 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu.h @@ -18,9 +18,18 @@ #define VIRGL_RENDERER_UNSTABLE_APIS 1 #include "drm-uapi/virtgpu_drm.h" -#include "virglrenderer_hw.h" #include "venus_hw.h" +// must match https://gitlab.freedesktop.org/kpouget/virglrenderer/-/blob/main/src/virglrenderer_hw.h?ref_type=heads +enum virgl_renderer_capset { + VIRGL_RENDERER_CAPSET_VIRGL = 1, + VIRGL_RENDERER_CAPSET_VIRGL2 = 2, + /* 3 is reserved for gfxstream */ + VIRGL_RENDERER_CAPSET_VENUS = 4, + /* 5 is reserved for cross-domain */ + VIRGL_RENDERER_CAPSET_DRM = 6, +}; + /* from src/virtio/vulkan/vn_renderer_virtgpu.c */ #define VIRTGPU_PCI_VENDOR_ID 0x1af4 #define VIRTGPU_PCI_DEVICE_ID 0x1050 From 66b34d685eea20ada00ad6f2a61d36fd5cea1939 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 11 Jun 2025 09:58:42 +0200 Subject: [PATCH 108/117] ggml: src: ggml-remotingfrontend/virtgpu: don't use absolute paths in include --- ggml/src/ggml-remotingfrontend/virtgpu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu.h b/ggml/src/ggml-remotingfrontend/virtgpu.h index 32ad51237037c..9d8668c3d070e 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu.h @@ -11,8 +11,8 @@ #include "virtgpu-forward.h" #include "virtgpu-utils.h" -#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/api_remoting.h" -#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/venus_cs.h" +#include "../ggml-remotingbackend/shared/api_remoting.h" +#include "../ggml-remotingbackend/shared/venus_cs.h" #include "virtgpu-shm.h" From 5b5ffec30bcd2592a8621aa67db82472b573b19e Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 11 Jun 2025 14:13:02 +0200 Subject: [PATCH 109/117] remoting: rewrite to avoid hard-coded paths --- ggml/src/ggml-remotingbackend/backend.cpp | 45 +- .../shared/api_remoting.h | 1 - .../shared/apir_backend.h | 16 +- ggml/src/ggml-remotingfrontend/CMakeLists.txt | 10 +- .../include/drm-uapi/drm.h | 1408 +++++++++++++++++ .../include/drm-uapi/virtgpu_drm.h | 276 ++++ .../ggml-remotingfrontend/include/venus_hw.h | 74 + .../virtgpu-forward-impl.h | 4 +- .../ggml-remotingfrontend/virtgpu-forward.h | 2 +- 9 files changed, 1794 insertions(+), 42 deletions(-) create mode 100644 ggml/src/ggml-remotingfrontend/include/drm-uapi/drm.h create mode 100644 ggml/src/ggml-remotingfrontend/include/drm-uapi/virtgpu_drm.h create mode 100644 ggml/src/ggml-remotingfrontend/include/venus_hw.h diff --git a/ggml/src/ggml-remotingbackend/backend.cpp b/ggml/src/ggml-remotingbackend/backend.cpp index f5a10c234644a..95dee556cff3f 100644 --- a/ggml/src/ggml-remotingbackend/backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend.cpp @@ -10,17 +10,10 @@ #include "shared/apir_backend.h" #include "shared/venus_cs.h" -#define USE_METAL 1 - -#if USE_METAL 
-#define GGML_BACKEND_LIBRARY_PATH "/Users/kevinpouget/remoting/llama_cpp/build.remoting-backend/bin/libggml-metal.dylib" -#define GGML_BACKEND_REG_FCT_NAME "ggml_backend_metal_reg" -#define GGML_BACKEND_INIT_FCT_NAME "ggml_backend_metal_init" -#else -#define GGML_BACKEND_LIBRARY_PATH "/Users/kevinpouget/remoting/llama_cpp/build.remoting-backend/bin/libggml-vulkan.dylib" -#define GGML_BACKEND_REG_FCT_NAME "ggml_backend_vk_reg" -#define GGML_BACKEND_INIT_FCT_NAME "ggml_backend_vk_init" -#endif +#define GGML_BACKEND_LIBRARY_PATH_ENV "APIR_LLAMA_CPP_GGML_LIBRARY_PATH" +#define GGML_BACKEND_LIBRARY_REG_ENV "APIR_LLAMA_CPP_GGML_LIBRARY_REG" +#define GGML_BACKEND_LIBRARY_INIT_ENV "APIR_LLAMA_CPP_GGML_LIBRARY_INIT" + static void *backend_library_handle = NULL; @@ -52,9 +45,19 @@ extern "C" { uint32_t apir_backend_initialize() { const char* dlsym_error; - INFO("%s: hello " GGML_BACKEND_REG_FCT_NAME " :wave: \\o/", __func__); + const char* library_name = getenv(GGML_BACKEND_LIBRARY_PATH_ENV); + const char* library_reg = getenv(GGML_BACKEND_LIBRARY_REG_ENV); + const char* library_init = getenv(GGML_BACKEND_LIBRARY_INIT_ENV); + + INFO("%s: loading %s (%s|%s)", __func__, library_name, library_reg, library_init); + + if (!library_name) { + ERROR("Cannot open library: env var '%s' not defined\n", GGML_BACKEND_LIBRARY_PATH_ENV); - backend_library_handle = dlopen(GGML_BACKEND_LIBRARY_PATH, RTLD_LAZY); + return APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY; + } + + backend_library_handle = dlopen(library_name, RTLD_LAZY); if (!backend_library_handle) { ERROR("Cannot open library: %s\n", dlerror()); @@ -62,7 +65,13 @@ extern "C" { return APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY; } - void *ggml_backend_reg_fct = dlsym(backend_library_handle, GGML_BACKEND_REG_FCT_NAME); + if (!library_reg) { + ERROR("Cannot register library: env var '%s' not defined\n", GGML_BACKEND_LIBRARY_REG_ENV); + + return APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY; + } + + void *ggml_backend_reg_fct = dlsym(backend_library_handle, library_reg); dlsym_error = dlerror(); if (dlsym_error) { ERROR("Cannot load symbol: %s\n", dlsym_error); @@ -70,7 +79,13 @@ extern "C" { return APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS; } - void *ggml_backend_init_fct = dlsym(backend_library_handle, GGML_BACKEND_INIT_FCT_NAME); + if (!library_init) { + ERROR("Cannot initialize library: env var '%s' not defined\n", library_init); + + return APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY; + } + + void *ggml_backend_init_fct = dlsym(backend_library_handle, library_init); dlsym_error = dlerror(); if (dlsym_error) { ERROR("Cannot load symbol: %s\n", dlsym_error); diff --git a/ggml/src/ggml-remotingbackend/shared/api_remoting.h b/ggml/src/ggml-remotingbackend/shared/api_remoting.h index 1df5498c29c03..6e594a8ae4ab8 100644 --- a/ggml/src/ggml-remotingbackend/shared/api_remoting.h +++ b/ggml/src/ggml-remotingbackend/shared/api_remoting.h @@ -1,4 +1,3 @@ - #define VIRGL_APIR_COMMAND_TYPE_LoadLibrary 255 #define VIRGL_APIR_COMMAND_TYPE_Forward 256 diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index efd0803a929d5..4146908813c6d 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -1,10 +1,5 @@ #pragma once -#define APIR_LIBRARY_PATH "/Users/kevinpouget/remoting/llama_cpp/build.remoting-backend-prod/bin/libggml-remotingbackend.dylib" -#define APIR_INITIALIZE_FCT_NAME "apir_backend_initialize" -#define 
APIR_DEINIT_FCT_NAME "apir_backend_deinit" -#define APIR_DISPATCH_FCT_NAME "apir_backend_dispatcher" - #define APIR_BACKEND_INITIALIZE_SUCCESSS 0 #define APIR_BACKEND_INITIALIZE_CANNOT_OPEN_BACKEND_LIBRARY 1 #define APIR_BACKEND_INITIALIZE_CANNOT_OPEN_GGML_LIBRARY 2 @@ -24,18 +19,9 @@ typedef struct { apir_buffer_type_host_handle_t buft_host_handle; } apir_buffer_context_t; -typedef uint32_t (*apir_backend_initialize_t)(void); -typedef void (*apir_backend_deinit_t)(void); - struct vn_dispatch_context; struct virgl_apir_context; -typedef uint32_t (*apir_backend_dispatch_t)(uint32_t cmd_type, struct virgl_apir_context *ctx, - char *dec_cur, const char *dec_end, - char *enc_cur, const char *enc_end, - char **enc_cur_after - ); - typedef enum ApirBackendCommandType { /* device */ APIR_COMMAND_TYPE_DEVICE_GET_COUNT = 0, @@ -72,7 +58,7 @@ typedef enum ApirBackendCommandType { struct virgl_apir_callbacks { void *(*get_shmem_ptr)(struct vn_dispatch_context *ctx, uint32_t res_id); -} ; +}; struct virgl_apir_context { struct vn_dispatch_context *virgl_ctx; diff --git a/ggml/src/ggml-remotingfrontend/CMakeLists.txt b/ggml/src/ggml-remotingfrontend/CMakeLists.txt index 15b338f730176..f3f3dea652cf9 100644 --- a/ggml/src/ggml-remotingfrontend/CMakeLists.txt +++ b/ggml/src/ggml-remotingfrontend/CMakeLists.txt @@ -22,17 +22,11 @@ ggml_add_backend_library(ggml-remotingfrontend venus_cs_ggml-rpc-front.cpp ) +# dnf install -y libdrm-devel target_link_libraries(ggml-remotingfrontend PUBLIC drm) target_include_directories(ggml-remotingfrontend PUBLIC /usr/include/libdrm/) +target_include_directories(ggml-remotingfrontend PUBLIC ./include) -set(REMOTING_PROJECT /Users/kevinpouget/remoting) -set(MESA_PROJECT_HOME ${REMOTING_PROJECT}/mesa) -set(MESA_PROJECT_SRC ${MESA_PROJECT_HOME}/src) - -target_include_directories(ggml-remotingfrontend PUBLIC ${MESA_PROJECT_SRC}/virtio/virtio-gpu/) -target_include_directories(ggml-remotingfrontend PUBLIC ${MESA_PROJECT_HOME}/include) target_include_directories(ggml-remotingfrontend PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) target_compile_options(ggml-remotingfrontend PRIVATE -std=c++20) - -# dnf install -y libdrm-devel diff --git a/ggml/src/ggml-remotingfrontend/include/drm-uapi/drm.h b/ggml/src/ggml-remotingfrontend/include/drm-uapi/drm.h new file mode 100644 index 0000000000000..4e4f7c2c39e4f --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/include/drm-uapi/drm.h @@ -0,0 +1,1408 @@ +/* + * Header for the Direct Rendering Manager + * + * Author: Rickard E. (Rik) Faith + * + * Acknowledgments: + * Dec 1999, Richard Henderson , move to generic cmpxchg. + */ + +/* + * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas. + * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California. + * All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _DRM_H_ +#define _DRM_H_ + +#if defined(__linux__) + +#include +#include +typedef unsigned int drm_handle_t; + +#else /* One of the BSDs */ + +#include +#include +#include +typedef int8_t __s8; +typedef uint8_t __u8; +typedef int16_t __s16; +typedef uint16_t __u16; +typedef int32_t __s32; +typedef uint32_t __u32; +typedef int64_t __s64; +typedef uint64_t __u64; +typedef size_t __kernel_size_t; +typedef unsigned long drm_handle_t; + +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +#define DRM_NAME "drm" /**< Name in kernel, /dev, and /proc */ +#define DRM_MIN_ORDER 5 /**< At least 2^5 bytes = 32 bytes */ +#define DRM_MAX_ORDER 22 /**< Up to 2^22 bytes = 4MB */ +#define DRM_RAM_PERCENT 10 /**< How much system ram can we lock? */ + +#define _DRM_LOCK_HELD 0x80000000U /**< Hardware lock is held */ +#define _DRM_LOCK_CONT 0x40000000U /**< Hardware lock is contended */ +#define _DRM_LOCK_IS_HELD(lock) ((lock) & _DRM_LOCK_HELD) +#define _DRM_LOCK_IS_CONT(lock) ((lock) & _DRM_LOCK_CONT) +#define _DRM_LOCKING_CONTEXT(lock) ((lock) & ~(_DRM_LOCK_HELD|_DRM_LOCK_CONT)) + +typedef unsigned int drm_context_t; +typedef unsigned int drm_drawable_t; +typedef unsigned int drm_magic_t; + +/* + * Cliprect. + * + * \warning: If you change this structure, make sure you change + * XF86DRIClipRectRec in the server as well + * + * \note KW: Actually it's illegal to change either for + * backwards-compatibility reasons. + */ +struct drm_clip_rect { + unsigned short x1; + unsigned short y1; + unsigned short x2; + unsigned short y2; +}; + +/* + * Drawable information. + */ +struct drm_drawable_info { + unsigned int num_rects; + struct drm_clip_rect *rects; +}; + +/* + * Texture region, + */ +struct drm_tex_region { + unsigned char next; + unsigned char prev; + unsigned char in_use; + unsigned char padding; + unsigned int age; +}; + +/* + * Hardware lock. + * + * The lock structure is a simple cache-line aligned integer. To avoid + * processor bus contention on a multiprocessor system, there should not be any + * other data stored in the same cache line. + */ +struct drm_hw_lock { + __volatile__ unsigned int lock; /**< lock variable */ + char padding[60]; /**< Pad to cache line */ +}; + +/* + * DRM_IOCTL_VERSION ioctl argument type. + * + * \sa drmGetVersion(). + */ +struct drm_version { + int version_major; /**< Major version */ + int version_minor; /**< Minor version */ + int version_patchlevel; /**< Patch level */ + __kernel_size_t name_len; /**< Length of name buffer */ + char *name; /**< Name of driver */ + __kernel_size_t date_len; /**< Length of date buffer */ + char *date; /**< User-space buffer to hold date */ + __kernel_size_t desc_len; /**< Length of desc buffer */ + char *desc; /**< User-space buffer to hold desc */ +}; + +/* + * DRM_IOCTL_GET_UNIQUE ioctl argument type. + * + * \sa drmGetBusid() and drmSetBusId(). 
+ */ +struct drm_unique { + __kernel_size_t unique_len; /**< Length of unique */ + char *unique; /**< Unique name for driver instantiation */ +}; + +struct drm_list { + int count; /**< Length of user-space structures */ + struct drm_version *version; +}; + +struct drm_block { + int unused; +}; + +/* + * DRM_IOCTL_CONTROL ioctl argument type. + * + * \sa drmCtlInstHandler() and drmCtlUninstHandler(). + */ +struct drm_control { + enum { + DRM_ADD_COMMAND, + DRM_RM_COMMAND, + DRM_INST_HANDLER, + DRM_UNINST_HANDLER + } func; + int irq; +}; + +/* + * Type of memory to map. + */ +enum drm_map_type { + _DRM_FRAME_BUFFER = 0, /**< WC (no caching), no core dump */ + _DRM_REGISTERS = 1, /**< no caching, no core dump */ + _DRM_SHM = 2, /**< shared, cached */ + _DRM_AGP = 3, /**< AGP/GART */ + _DRM_SCATTER_GATHER = 4, /**< Scatter/gather memory for PCI DMA */ + _DRM_CONSISTENT = 5 /**< Consistent memory for PCI DMA */ +}; + +/* + * Memory mapping flags. + */ +enum drm_map_flags { + _DRM_RESTRICTED = 0x01, /**< Cannot be mapped to user-virtual */ + _DRM_READ_ONLY = 0x02, + _DRM_LOCKED = 0x04, /**< shared, cached, locked */ + _DRM_KERNEL = 0x08, /**< kernel requires access */ + _DRM_WRITE_COMBINING = 0x10, /**< use write-combining if available */ + _DRM_CONTAINS_LOCK = 0x20, /**< SHM page that contains lock */ + _DRM_REMOVABLE = 0x40, /**< Removable mapping */ + _DRM_DRIVER = 0x80 /**< Managed by driver */ +}; + +struct drm_ctx_priv_map { + unsigned int ctx_id; /**< Context requesting private mapping */ + void *handle; /**< Handle of map */ +}; + +/* + * DRM_IOCTL_GET_MAP, DRM_IOCTL_ADD_MAP and DRM_IOCTL_RM_MAP ioctls + * argument type. + * + * \sa drmAddMap(). + */ +struct drm_map { + unsigned long offset; /**< Requested physical address (0 for SAREA)*/ + unsigned long size; /**< Requested physical size (bytes) */ + enum drm_map_type type; /**< Type of memory to map */ + enum drm_map_flags flags; /**< Flags */ + void *handle; /**< User-space: "Handle" to pass to mmap() */ + /**< Kernel-space: kernel-virtual address */ + int mtrr; /**< MTRR slot used */ + /* Private data */ +}; + +/* + * DRM_IOCTL_GET_CLIENT ioctl argument type. + */ +struct drm_client { + int idx; /**< Which client desired? */ + int auth; /**< Is client authenticated? */ + unsigned long pid; /**< Process ID */ + unsigned long uid; /**< User ID */ + unsigned long magic; /**< Magic */ + unsigned long iocs; /**< Ioctl count */ +}; + +enum drm_stat_type { + _DRM_STAT_LOCK, + _DRM_STAT_OPENS, + _DRM_STAT_CLOSES, + _DRM_STAT_IOCTLS, + _DRM_STAT_LOCKS, + _DRM_STAT_UNLOCKS, + _DRM_STAT_VALUE, /**< Generic value */ + _DRM_STAT_BYTE, /**< Generic byte counter (1024bytes/K) */ + _DRM_STAT_COUNT, /**< Generic non-byte counter (1000/k) */ + + _DRM_STAT_IRQ, /**< IRQ */ + _DRM_STAT_PRIMARY, /**< Primary DMA bytes */ + _DRM_STAT_SECONDARY, /**< Secondary DMA bytes */ + _DRM_STAT_DMA, /**< DMA */ + _DRM_STAT_SPECIAL, /**< Special DMA (e.g., priority or polled) */ + _DRM_STAT_MISSED /**< Missed DMA opportunity */ + /* Add to the *END* of the list */ +}; + +/* + * DRM_IOCTL_GET_STATS ioctl argument type. + */ +struct drm_stats { + unsigned long count; + struct { + unsigned long value; + enum drm_stat_type type; + } data[15]; +}; + +/* + * Hardware locking flags. 
+ */ +enum drm_lock_flags { + _DRM_LOCK_READY = 0x01, /**< Wait until hardware is ready for DMA */ + _DRM_LOCK_QUIESCENT = 0x02, /**< Wait until hardware quiescent */ + _DRM_LOCK_FLUSH = 0x04, /**< Flush this context's DMA queue first */ + _DRM_LOCK_FLUSH_ALL = 0x08, /**< Flush all DMA queues first */ + /* These *HALT* flags aren't supported yet + -- they will be used to support the + full-screen DGA-like mode. */ + _DRM_HALT_ALL_QUEUES = 0x10, /**< Halt all current and future queues */ + _DRM_HALT_CUR_QUEUES = 0x20 /**< Halt all current queues */ +}; + +/* + * DRM_IOCTL_LOCK, DRM_IOCTL_UNLOCK and DRM_IOCTL_FINISH ioctl argument type. + * + * \sa drmGetLock() and drmUnlock(). + */ +struct drm_lock { + int context; + enum drm_lock_flags flags; +}; + +/* + * DMA flags + * + * \warning + * These values \e must match xf86drm.h. + * + * \sa drm_dma. + */ +enum drm_dma_flags { + /* Flags for DMA buffer dispatch */ + _DRM_DMA_BLOCK = 0x01, /**< + * Block until buffer dispatched. + * + * \note The buffer may not yet have + * been processed by the hardware -- + * getting a hardware lock with the + * hardware quiescent will ensure + * that the buffer has been + * processed. + */ + _DRM_DMA_WHILE_LOCKED = 0x02, /**< Dispatch while lock held */ + _DRM_DMA_PRIORITY = 0x04, /**< High priority dispatch */ + + /* Flags for DMA buffer request */ + _DRM_DMA_WAIT = 0x10, /**< Wait for free buffers */ + _DRM_DMA_SMALLER_OK = 0x20, /**< Smaller-than-requested buffers OK */ + _DRM_DMA_LARGER_OK = 0x40 /**< Larger-than-requested buffers OK */ +}; + +/* + * DRM_IOCTL_ADD_BUFS and DRM_IOCTL_MARK_BUFS ioctl argument type. + * + * \sa drmAddBufs(). + */ +struct drm_buf_desc { + int count; /**< Number of buffers of this size */ + int size; /**< Size in bytes */ + int low_mark; /**< Low water mark */ + int high_mark; /**< High water mark */ + enum { + _DRM_PAGE_ALIGN = 0x01, /**< Align on page boundaries for DMA */ + _DRM_AGP_BUFFER = 0x02, /**< Buffer is in AGP space */ + _DRM_SG_BUFFER = 0x04, /**< Scatter/gather memory buffer */ + _DRM_FB_BUFFER = 0x08, /**< Buffer is in frame buffer */ + _DRM_PCI_BUFFER_RO = 0x10 /**< Map PCI DMA buffer read-only */ + } flags; + unsigned long agp_start; /**< + * Start address of where the AGP buffers are + * in the AGP aperture + */ +}; + +/* + * DRM_IOCTL_INFO_BUFS ioctl argument type. + */ +struct drm_buf_info { + int count; /**< Entries in list */ + struct drm_buf_desc *list; +}; + +/* + * DRM_IOCTL_FREE_BUFS ioctl argument type. + */ +struct drm_buf_free { + int count; + int *list; +}; + +/* + * Buffer information + * + * \sa drm_buf_map. + */ +struct drm_buf_pub { + int idx; /**< Index into the master buffer list */ + int total; /**< Buffer size */ + int used; /**< Amount of buffer in use (for DMA) */ + void *address; /**< Address of buffer */ +}; + +/* + * DRM_IOCTL_MAP_BUFS ioctl argument type. + */ +struct drm_buf_map { + int count; /**< Length of the buffer list */ +#ifdef __cplusplus + void *virt; +#else + void *virtual; /**< Mmap'd area in user-virtual */ +#endif + struct drm_buf_pub *list; /**< Buffer information */ +}; + +/* + * DRM_IOCTL_DMA ioctl argument type. + * + * Indices here refer to the offset into the buffer list in drm_buf_get. + * + * \sa drmDMA(). 
+ */ +struct drm_dma { + int context; /**< Context handle */ + int send_count; /**< Number of buffers to send */ + int *send_indices; /**< List of handles to buffers */ + int *send_sizes; /**< Lengths of data to send */ + enum drm_dma_flags flags; /**< Flags */ + int request_count; /**< Number of buffers requested */ + int request_size; /**< Desired size for buffers */ + int *request_indices; /**< Buffer information */ + int *request_sizes; + int granted_count; /**< Number of buffers granted */ +}; + +enum drm_ctx_flags { + _DRM_CONTEXT_PRESERVED = 0x01, + _DRM_CONTEXT_2DONLY = 0x02 +}; + +/* + * DRM_IOCTL_ADD_CTX ioctl argument type. + * + * \sa drmCreateContext() and drmDestroyContext(). + */ +struct drm_ctx { + drm_context_t handle; + enum drm_ctx_flags flags; +}; + +/* + * DRM_IOCTL_RES_CTX ioctl argument type. + */ +struct drm_ctx_res { + int count; + struct drm_ctx *contexts; +}; + +/* + * DRM_IOCTL_ADD_DRAW and DRM_IOCTL_RM_DRAW ioctl argument type. + */ +struct drm_draw { + drm_drawable_t handle; +}; + +/* + * DRM_IOCTL_UPDATE_DRAW ioctl argument type. + */ +typedef enum { + DRM_DRAWABLE_CLIPRECTS +} drm_drawable_info_type_t; + +struct drm_update_draw { + drm_drawable_t handle; + unsigned int type; + unsigned int num; + unsigned long long data; +}; + +/* + * DRM_IOCTL_GET_MAGIC and DRM_IOCTL_AUTH_MAGIC ioctl argument type. + */ +struct drm_auth { + drm_magic_t magic; +}; + +/* + * DRM_IOCTL_IRQ_BUSID ioctl argument type. + * + * \sa drmGetInterruptFromBusID(). + */ +struct drm_irq_busid { + int irq; /**< IRQ number */ + int busnum; /**< bus number */ + int devnum; /**< device number */ + int funcnum; /**< function number */ +}; + +enum drm_vblank_seq_type { + _DRM_VBLANK_ABSOLUTE = 0x0, /**< Wait for specific vblank sequence number */ + _DRM_VBLANK_RELATIVE = 0x1, /**< Wait for given number of vblanks */ + /* bits 1-6 are reserved for high crtcs */ + _DRM_VBLANK_HIGH_CRTC_MASK = 0x0000003e, + _DRM_VBLANK_EVENT = 0x4000000, /**< Send event instead of blocking */ + _DRM_VBLANK_FLIP = 0x8000000, /**< Scheduled buffer swap should flip */ + _DRM_VBLANK_NEXTONMISS = 0x10000000, /**< If missed, wait for next vblank */ + _DRM_VBLANK_SECONDARY = 0x20000000, /**< Secondary display controller */ + _DRM_VBLANK_SIGNAL = 0x40000000 /**< Send signal instead of blocking, unsupported */ +}; +#define _DRM_VBLANK_HIGH_CRTC_SHIFT 1 + +#define _DRM_VBLANK_TYPES_MASK (_DRM_VBLANK_ABSOLUTE | _DRM_VBLANK_RELATIVE) +#define _DRM_VBLANK_FLAGS_MASK (_DRM_VBLANK_EVENT | _DRM_VBLANK_SIGNAL | \ + _DRM_VBLANK_SECONDARY | _DRM_VBLANK_NEXTONMISS) + +struct drm_wait_vblank_request { + enum drm_vblank_seq_type type; + unsigned int sequence; + unsigned long signal; +}; + +struct drm_wait_vblank_reply { + enum drm_vblank_seq_type type; + unsigned int sequence; + long tval_sec; + long tval_usec; +}; + +/* + * DRM_IOCTL_WAIT_VBLANK ioctl argument type. + * + * \sa drmWaitVBlank(). + */ +union drm_wait_vblank { + struct drm_wait_vblank_request request; + struct drm_wait_vblank_reply reply; +}; + +#define _DRM_PRE_MODESET 1 +#define _DRM_POST_MODESET 2 + +/* + * DRM_IOCTL_MODESET_CTL ioctl argument type + * + * \sa drmModesetCtl(). + */ +struct drm_modeset_ctl { + __u32 crtc; + __u32 cmd; +}; + +/* + * DRM_IOCTL_AGP_ENABLE ioctl argument type. + * + * \sa drmAgpEnable(). + */ +struct drm_agp_mode { + unsigned long mode; /**< AGP mode */ +}; + +/* + * DRM_IOCTL_AGP_ALLOC and DRM_IOCTL_AGP_FREE ioctls argument type. + * + * \sa drmAgpAlloc() and drmAgpFree(). 
+ */ +struct drm_agp_buffer { + unsigned long size; /**< In bytes -- will round to page boundary */ + unsigned long handle; /**< Used for binding / unbinding */ + unsigned long type; /**< Type of memory to allocate */ + unsigned long physical; /**< Physical used by i810 */ +}; + +/* + * DRM_IOCTL_AGP_BIND and DRM_IOCTL_AGP_UNBIND ioctls argument type. + * + * \sa drmAgpBind() and drmAgpUnbind(). + */ +struct drm_agp_binding { + unsigned long handle; /**< From drm_agp_buffer */ + unsigned long offset; /**< In bytes -- will round to page boundary */ +}; + +/* + * DRM_IOCTL_AGP_INFO ioctl argument type. + * + * \sa drmAgpVersionMajor(), drmAgpVersionMinor(), drmAgpGetMode(), + * drmAgpBase(), drmAgpSize(), drmAgpMemoryUsed(), drmAgpMemoryAvail(), + * drmAgpVendorId() and drmAgpDeviceId(). + */ +struct drm_agp_info { + int agp_version_major; + int agp_version_minor; + unsigned long mode; + unsigned long aperture_base; /* physical address */ + unsigned long aperture_size; /* bytes */ + unsigned long memory_allowed; /* bytes */ + unsigned long memory_used; + + /* PCI information */ + unsigned short id_vendor; + unsigned short id_device; +}; + +/* + * DRM_IOCTL_SG_ALLOC ioctl argument type. + */ +struct drm_scatter_gather { + unsigned long size; /**< In bytes -- will round to page boundary */ + unsigned long handle; /**< Used for mapping / unmapping */ +}; + +/* + * DRM_IOCTL_SET_VERSION ioctl argument type. + */ +struct drm_set_version { + int drm_di_major; + int drm_di_minor; + int drm_dd_major; + int drm_dd_minor; +}; + +/* DRM_IOCTL_GEM_CLOSE ioctl argument type */ +struct drm_gem_close { + /** Handle of the object to be closed. */ + __u32 handle; + __u32 pad; +}; + +/* DRM_IOCTL_GEM_FLINK ioctl argument type */ +struct drm_gem_flink { + /** Handle for the object being named */ + __u32 handle; + + /** Returned global name */ + __u32 name; +}; + +/* DRM_IOCTL_GEM_OPEN ioctl argument type */ +struct drm_gem_open { + /** Name of object being opened */ + __u32 name; + + /** Returned handle for the object */ + __u32 handle; + + /** Returned size of the object */ + __u64 size; +}; + +/** + * DRM_CAP_DUMB_BUFFER + * + * If set to 1, the driver supports creating dumb buffers via the + * &DRM_IOCTL_MODE_CREATE_DUMB ioctl. + */ +#define DRM_CAP_DUMB_BUFFER 0x1 +/** + * DRM_CAP_VBLANK_HIGH_CRTC + * + * If set to 1, the kernel supports specifying a :ref:`CRTC index` + * in the high bits of &drm_wait_vblank_request.type. + * + * Starting kernel version 2.6.39, this capability is always set to 1. + */ +#define DRM_CAP_VBLANK_HIGH_CRTC 0x2 +/** + * DRM_CAP_DUMB_PREFERRED_DEPTH + * + * The preferred bit depth for dumb buffers. + * + * The bit depth is the number of bits used to indicate the color of a single + * pixel excluding any padding. This is different from the number of bits per + * pixel. For instance, XRGB8888 has a bit depth of 24 but has 32 bits per + * pixel. + * + * Note that this preference only applies to dumb buffers, it's irrelevant for + * other types of buffers. + */ +#define DRM_CAP_DUMB_PREFERRED_DEPTH 0x3 +/** + * DRM_CAP_DUMB_PREFER_SHADOW + * + * If set to 1, the driver prefers userspace to render to a shadow buffer + * instead of directly rendering to a dumb buffer. For best speed, userspace + * should do streaming ordered memory copies into the dumb buffer and never + * read from it. + * + * Note that this preference only applies to dumb buffers, it's irrelevant for + * other types of buffers. 
+ */ +#define DRM_CAP_DUMB_PREFER_SHADOW 0x4 +/** + * DRM_CAP_PRIME + * + * Bitfield of supported PRIME sharing capabilities. See &DRM_PRIME_CAP_IMPORT + * and &DRM_PRIME_CAP_EXPORT. + * + * Starting from kernel version 6.6, both &DRM_PRIME_CAP_IMPORT and + * &DRM_PRIME_CAP_EXPORT are always advertised. + * + * PRIME buffers are exposed as dma-buf file descriptors. + * See :ref:`prime_buffer_sharing`. + */ +#define DRM_CAP_PRIME 0x5 +/** + * DRM_PRIME_CAP_IMPORT + * + * If this bit is set in &DRM_CAP_PRIME, the driver supports importing PRIME + * buffers via the &DRM_IOCTL_PRIME_FD_TO_HANDLE ioctl. + * + * Starting from kernel version 6.6, this bit is always set in &DRM_CAP_PRIME. + */ +#define DRM_PRIME_CAP_IMPORT 0x1 +/** + * DRM_PRIME_CAP_EXPORT + * + * If this bit is set in &DRM_CAP_PRIME, the driver supports exporting PRIME + * buffers via the &DRM_IOCTL_PRIME_HANDLE_TO_FD ioctl. + * + * Starting from kernel version 6.6, this bit is always set in &DRM_CAP_PRIME. + */ +#define DRM_PRIME_CAP_EXPORT 0x2 +/** + * DRM_CAP_TIMESTAMP_MONOTONIC + * + * If set to 0, the kernel will report timestamps with ``CLOCK_REALTIME`` in + * struct drm_event_vblank. If set to 1, the kernel will report timestamps with + * ``CLOCK_MONOTONIC``. See ``clock_gettime(2)`` for the definition of these + * clocks. + * + * Starting from kernel version 2.6.39, the default value for this capability + * is 1. Starting kernel version 4.15, this capability is always set to 1. + */ +#define DRM_CAP_TIMESTAMP_MONOTONIC 0x6 +/** + * DRM_CAP_ASYNC_PAGE_FLIP + * + * If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC for legacy + * page-flips. + */ +#define DRM_CAP_ASYNC_PAGE_FLIP 0x7 +/** + * DRM_CAP_CURSOR_WIDTH + * + * The ``CURSOR_WIDTH`` and ``CURSOR_HEIGHT`` capabilities return a valid + * width x height combination for the hardware cursor. The intention is that a + * hardware agnostic userspace can query a cursor plane size to use. + * + * Note that the cross-driver contract is to merely return a valid size; + * drivers are free to attach another meaning on top, eg. i915 returns the + * maximum plane size. + */ +#define DRM_CAP_CURSOR_WIDTH 0x8 +/** + * DRM_CAP_CURSOR_HEIGHT + * + * See &DRM_CAP_CURSOR_WIDTH. + */ +#define DRM_CAP_CURSOR_HEIGHT 0x9 +/** + * DRM_CAP_ADDFB2_MODIFIERS + * + * If set to 1, the driver supports supplying modifiers in the + * &DRM_IOCTL_MODE_ADDFB2 ioctl. + */ +#define DRM_CAP_ADDFB2_MODIFIERS 0x10 +/** + * DRM_CAP_PAGE_FLIP_TARGET + * + * If set to 1, the driver supports the &DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE and + * &DRM_MODE_PAGE_FLIP_TARGET_RELATIVE flags in + * &drm_mode_crtc_page_flip_target.flags for the &DRM_IOCTL_MODE_PAGE_FLIP + * ioctl. + */ +#define DRM_CAP_PAGE_FLIP_TARGET 0x11 +/** + * DRM_CAP_CRTC_IN_VBLANK_EVENT + * + * If set to 1, the kernel supports reporting the CRTC ID in + * &drm_event_vblank.crtc_id for the &DRM_EVENT_VBLANK and + * &DRM_EVENT_FLIP_COMPLETE events. + * + * Starting kernel version 4.12, this capability is always set to 1. + */ +#define DRM_CAP_CRTC_IN_VBLANK_EVENT 0x12 +/** + * DRM_CAP_SYNCOBJ + * + * If set to 1, the driver supports sync objects. See :ref:`drm_sync_objects`. + */ +#define DRM_CAP_SYNCOBJ 0x13 +/** + * DRM_CAP_SYNCOBJ_TIMELINE + * + * If set to 1, the driver supports timeline operations on sync objects. See + * :ref:`drm_sync_objects`. + */ +#define DRM_CAP_SYNCOBJ_TIMELINE 0x14 +/** + * DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP + * + * If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC for atomic + * commits. 
+ */ +#define DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP 0x15 + +/* DRM_IOCTL_GET_CAP ioctl argument type */ +struct drm_get_cap { + __u64 capability; + __u64 value; +}; + +/** + * DRM_CLIENT_CAP_STEREO_3D + * + * If set to 1, the DRM core will expose the stereo 3D capabilities of the + * monitor by advertising the supported 3D layouts in the flags of struct + * drm_mode_modeinfo. See ``DRM_MODE_FLAG_3D_*``. + * + * This capability is always supported for all drivers starting from kernel + * version 3.13. + */ +#define DRM_CLIENT_CAP_STEREO_3D 1 + +/** + * DRM_CLIENT_CAP_UNIVERSAL_PLANES + * + * If set to 1, the DRM core will expose all planes (overlay, primary, and + * cursor) to userspace. + * + * This capability has been introduced in kernel version 3.15. Starting from + * kernel version 3.17, this capability is always supported for all drivers. + */ +#define DRM_CLIENT_CAP_UNIVERSAL_PLANES 2 + +/** + * DRM_CLIENT_CAP_ATOMIC + * + * If set to 1, the DRM core will expose atomic properties to userspace. This + * implicitly enables &DRM_CLIENT_CAP_UNIVERSAL_PLANES and + * &DRM_CLIENT_CAP_ASPECT_RATIO. + * + * If the driver doesn't support atomic mode-setting, enabling this capability + * will fail with -EOPNOTSUPP. + * + * This capability has been introduced in kernel version 4.0. Starting from + * kernel version 4.2, this capability is always supported for atomic-capable + * drivers. + */ +#define DRM_CLIENT_CAP_ATOMIC 3 + +/** + * DRM_CLIENT_CAP_ASPECT_RATIO + * + * If set to 1, the DRM core will provide aspect ratio information in modes. + * See ``DRM_MODE_FLAG_PIC_AR_*``. + * + * This capability is always supported for all drivers starting from kernel + * version 4.18. + */ +#define DRM_CLIENT_CAP_ASPECT_RATIO 4 + +/** + * DRM_CLIENT_CAP_WRITEBACK_CONNECTORS + * + * If set to 1, the DRM core will expose special connectors to be used for + * writing back to memory the scene setup in the commit. The client must enable + * &DRM_CLIENT_CAP_ATOMIC first. + * + * This capability is always supported for atomic-capable drivers starting from + * kernel version 4.19. + */ +#define DRM_CLIENT_CAP_WRITEBACK_CONNECTORS 5 + +/** + * DRM_CLIENT_CAP_CURSOR_PLANE_HOTSPOT + * + * Drivers for para-virtualized hardware (e.g. vmwgfx, qxl, virtio and + * virtualbox) have additional restrictions for cursor planes (thus + * making cursor planes on those drivers not truly universal,) e.g. + * they need cursor planes to act like one would expect from a mouse + * cursor and have correctly set hotspot properties. + * If this client cap is not set the DRM core will hide cursor plane on + * those virtualized drivers because not setting it implies that the + * client is not capable of dealing with those extra restictions. + * Clients which do set cursor hotspot and treat the cursor plane + * like a mouse cursor should set this property. + * The client must enable &DRM_CLIENT_CAP_ATOMIC first. + * + * Setting this property on drivers which do not special case + * cursor planes (i.e. non-virtualized drivers) will return + * EOPNOTSUPP, which can be used by userspace to gauge + * requirements of the hardware/drivers they're running on. + * + * This capability is always supported for atomic-capable virtualized + * drivers starting from kernel version 6.6. 
+ */ +#define DRM_CLIENT_CAP_CURSOR_PLANE_HOTSPOT 6 + +/* DRM_IOCTL_SET_CLIENT_CAP ioctl argument type */ +struct drm_set_client_cap { + __u64 capability; + __u64 value; +}; + +#define DRM_RDWR O_RDWR +#define DRM_CLOEXEC O_CLOEXEC +struct drm_prime_handle { + __u32 handle; + + /** Flags.. only applicable for handle->fd */ + __u32 flags; + + /** Returned dmabuf file descriptor */ + __s32 fd; +}; + +struct drm_syncobj_create { + __u32 handle; +#define DRM_SYNCOBJ_CREATE_SIGNALED (1 << 0) + __u32 flags; +}; + +struct drm_syncobj_destroy { + __u32 handle; + __u32 pad; +}; + +#define DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_IMPORT_SYNC_FILE (1 << 0) +#define DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE (1 << 0) +struct drm_syncobj_handle { + __u32 handle; + __u32 flags; + + __s32 fd; + __u32 pad; +}; + +struct drm_syncobj_transfer { + __u32 src_handle; + __u32 dst_handle; + __u64 src_point; + __u64 dst_point; + __u32 flags; + __u32 pad; +}; + +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL (1 << 0) +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT (1 << 1) +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE (1 << 2) /* wait for time point to become available */ +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE (1 << 3) /* set fence deadline to deadline_nsec */ +struct drm_syncobj_wait { + __u64 handles; + /* absolute timeout */ + __s64 timeout_nsec; + __u32 count_handles; + __u32 flags; + __u32 first_signaled; /* only valid when not waiting all */ + __u32 pad; + /** + * @deadline_nsec - fence deadline hint + * + * Deadline hint, in absolute CLOCK_MONOTONIC, to set on backing + * fence(s) if the DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE flag is + * set. + */ + __u64 deadline_nsec; +}; + +struct drm_syncobj_timeline_wait { + __u64 handles; + /* wait on specific timeline point for every handles*/ + __u64 points; + /* absolute timeout */ + __s64 timeout_nsec; + __u32 count_handles; + __u32 flags; + __u32 first_signaled; /* only valid when not waiting all */ + __u32 pad; + /** + * @deadline_nsec - fence deadline hint + * + * Deadline hint, in absolute CLOCK_MONOTONIC, to set on backing + * fence(s) if the DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE flag is + * set. + */ + __u64 deadline_nsec; +}; + +/** + * struct drm_syncobj_eventfd + * @handle: syncobj handle. + * @flags: Zero to wait for the point to be signalled, or + * &DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE to wait for a fence to be + * available for the point. + * @point: syncobj timeline point (set to zero for binary syncobjs). + * @fd: Existing eventfd to sent events to. + * @pad: Must be zero. + * + * Register an eventfd to be signalled by a syncobj. The eventfd counter will + * be incremented by one. + */ +struct drm_syncobj_eventfd { + __u32 handle; + __u32 flags; + __u64 point; + __s32 fd; + __u32 pad; +}; + + +struct drm_syncobj_array { + __u64 handles; + __u32 count_handles; + __u32 pad; +}; + +#define DRM_SYNCOBJ_QUERY_FLAGS_LAST_SUBMITTED (1 << 0) /* last available point on timeline syncobj */ +struct drm_syncobj_timeline_array { + __u64 handles; + __u64 points; + __u32 count_handles; + __u32 flags; +}; + + +/* Query current scanout sequence number */ +struct drm_crtc_get_sequence { + __u32 crtc_id; /* requested crtc_id */ + __u32 active; /* return: crtc output is active */ + __u64 sequence; /* return: most recent vblank sequence */ + __s64 sequence_ns; /* return: most recent time of first pixel out */ +}; + +/* Queue event to be delivered at specified sequence. 
Time stamp marks + * when the first pixel of the refresh cycle leaves the display engine + * for the display + */ +#define DRM_CRTC_SEQUENCE_RELATIVE 0x00000001 /* sequence is relative to current */ +#define DRM_CRTC_SEQUENCE_NEXT_ON_MISS 0x00000002 /* Use next sequence if we've missed */ + +struct drm_crtc_queue_sequence { + __u32 crtc_id; + __u32 flags; + __u64 sequence; /* on input, target sequence. on output, actual sequence */ + __u64 user_data; /* user data passed to event */ +}; + +#if defined(__cplusplus) +} +#endif + +#include "drm_mode.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +#define DRM_IOCTL_BASE 'd' +#define DRM_IO(nr) _IO(DRM_IOCTL_BASE,nr) +#define DRM_IOR(nr,type) _IOR(DRM_IOCTL_BASE,nr,type) +#define DRM_IOW(nr,type) _IOW(DRM_IOCTL_BASE,nr,type) +#define DRM_IOWR(nr,type) _IOWR(DRM_IOCTL_BASE,nr,type) + +#define DRM_IOCTL_VERSION DRM_IOWR(0x00, struct drm_version) +#define DRM_IOCTL_GET_UNIQUE DRM_IOWR(0x01, struct drm_unique) +#define DRM_IOCTL_GET_MAGIC DRM_IOR( 0x02, struct drm_auth) +#define DRM_IOCTL_IRQ_BUSID DRM_IOWR(0x03, struct drm_irq_busid) +#define DRM_IOCTL_GET_MAP DRM_IOWR(0x04, struct drm_map) +#define DRM_IOCTL_GET_CLIENT DRM_IOWR(0x05, struct drm_client) +#define DRM_IOCTL_GET_STATS DRM_IOR( 0x06, struct drm_stats) +#define DRM_IOCTL_SET_VERSION DRM_IOWR(0x07, struct drm_set_version) +#define DRM_IOCTL_MODESET_CTL DRM_IOW(0x08, struct drm_modeset_ctl) +/** + * DRM_IOCTL_GEM_CLOSE - Close a GEM handle. + * + * GEM handles are not reference-counted by the kernel. User-space is + * responsible for managing their lifetime. For example, if user-space imports + * the same memory object twice on the same DRM file description, the same GEM + * handle is returned by both imports, and user-space needs to ensure + * &DRM_IOCTL_GEM_CLOSE is performed once only. The same situation can happen + * when a memory object is allocated, then exported and imported again on the + * same DRM file description. The &DRM_IOCTL_MODE_GETFB2 IOCTL is an exception + * and always returns fresh new GEM handles even if an existing GEM handle + * already refers to the same memory object before the IOCTL is performed. 
+ */ +#define DRM_IOCTL_GEM_CLOSE DRM_IOW (0x09, struct drm_gem_close) +#define DRM_IOCTL_GEM_FLINK DRM_IOWR(0x0a, struct drm_gem_flink) +#define DRM_IOCTL_GEM_OPEN DRM_IOWR(0x0b, struct drm_gem_open) +#define DRM_IOCTL_GET_CAP DRM_IOWR(0x0c, struct drm_get_cap) +#define DRM_IOCTL_SET_CLIENT_CAP DRM_IOW( 0x0d, struct drm_set_client_cap) + +#define DRM_IOCTL_SET_UNIQUE DRM_IOW( 0x10, struct drm_unique) +#define DRM_IOCTL_AUTH_MAGIC DRM_IOW( 0x11, struct drm_auth) +#define DRM_IOCTL_BLOCK DRM_IOWR(0x12, struct drm_block) +#define DRM_IOCTL_UNBLOCK DRM_IOWR(0x13, struct drm_block) +#define DRM_IOCTL_CONTROL DRM_IOW( 0x14, struct drm_control) +#define DRM_IOCTL_ADD_MAP DRM_IOWR(0x15, struct drm_map) +#define DRM_IOCTL_ADD_BUFS DRM_IOWR(0x16, struct drm_buf_desc) +#define DRM_IOCTL_MARK_BUFS DRM_IOW( 0x17, struct drm_buf_desc) +#define DRM_IOCTL_INFO_BUFS DRM_IOWR(0x18, struct drm_buf_info) +#define DRM_IOCTL_MAP_BUFS DRM_IOWR(0x19, struct drm_buf_map) +#define DRM_IOCTL_FREE_BUFS DRM_IOW( 0x1a, struct drm_buf_free) + +#define DRM_IOCTL_RM_MAP DRM_IOW( 0x1b, struct drm_map) + +#define DRM_IOCTL_SET_SAREA_CTX DRM_IOW( 0x1c, struct drm_ctx_priv_map) +#define DRM_IOCTL_GET_SAREA_CTX DRM_IOWR(0x1d, struct drm_ctx_priv_map) + +#define DRM_IOCTL_SET_MASTER DRM_IO(0x1e) +#define DRM_IOCTL_DROP_MASTER DRM_IO(0x1f) + +#define DRM_IOCTL_ADD_CTX DRM_IOWR(0x20, struct drm_ctx) +#define DRM_IOCTL_RM_CTX DRM_IOWR(0x21, struct drm_ctx) +#define DRM_IOCTL_MOD_CTX DRM_IOW( 0x22, struct drm_ctx) +#define DRM_IOCTL_GET_CTX DRM_IOWR(0x23, struct drm_ctx) +#define DRM_IOCTL_SWITCH_CTX DRM_IOW( 0x24, struct drm_ctx) +#define DRM_IOCTL_NEW_CTX DRM_IOW( 0x25, struct drm_ctx) +#define DRM_IOCTL_RES_CTX DRM_IOWR(0x26, struct drm_ctx_res) +#define DRM_IOCTL_ADD_DRAW DRM_IOWR(0x27, struct drm_draw) +#define DRM_IOCTL_RM_DRAW DRM_IOWR(0x28, struct drm_draw) +#define DRM_IOCTL_DMA DRM_IOWR(0x29, struct drm_dma) +#define DRM_IOCTL_LOCK DRM_IOW( 0x2a, struct drm_lock) +#define DRM_IOCTL_UNLOCK DRM_IOW( 0x2b, struct drm_lock) +#define DRM_IOCTL_FINISH DRM_IOW( 0x2c, struct drm_lock) + +/** + * DRM_IOCTL_PRIME_HANDLE_TO_FD - Convert a GEM handle to a DMA-BUF FD. + * + * User-space sets &drm_prime_handle.handle with the GEM handle to export and + * &drm_prime_handle.flags, and gets back a DMA-BUF file descriptor in + * &drm_prime_handle.fd. + * + * The export can fail for any driver-specific reason, e.g. because export is + * not supported for this specific GEM handle (but might be for others). + * + * Support for exporting DMA-BUFs is advertised via &DRM_PRIME_CAP_EXPORT. + */ +#define DRM_IOCTL_PRIME_HANDLE_TO_FD DRM_IOWR(0x2d, struct drm_prime_handle) +/** + * DRM_IOCTL_PRIME_FD_TO_HANDLE - Convert a DMA-BUF FD to a GEM handle. + * + * User-space sets &drm_prime_handle.fd with a DMA-BUF file descriptor to + * import, and gets back a GEM handle in &drm_prime_handle.handle. + * &drm_prime_handle.flags is unused. + * + * If an existing GEM handle refers to the memory object backing the DMA-BUF, + * that GEM handle is returned. Therefore user-space which needs to handle + * arbitrary DMA-BUFs must have a user-space lookup data structure to manually + * reference-count duplicated GEM handles. For more information see + * &DRM_IOCTL_GEM_CLOSE. + * + * The import can fail for any driver-specific reason, e.g. because import is + * only supported for DMA-BUFs allocated on this DRM device. + * + * Support for importing DMA-BUFs is advertised via &DRM_PRIME_CAP_IMPORT. 
+ */ +#define DRM_IOCTL_PRIME_FD_TO_HANDLE DRM_IOWR(0x2e, struct drm_prime_handle) + +#define DRM_IOCTL_AGP_ACQUIRE DRM_IO( 0x30) +#define DRM_IOCTL_AGP_RELEASE DRM_IO( 0x31) +#define DRM_IOCTL_AGP_ENABLE DRM_IOW( 0x32, struct drm_agp_mode) +#define DRM_IOCTL_AGP_INFO DRM_IOR( 0x33, struct drm_agp_info) +#define DRM_IOCTL_AGP_ALLOC DRM_IOWR(0x34, struct drm_agp_buffer) +#define DRM_IOCTL_AGP_FREE DRM_IOW( 0x35, struct drm_agp_buffer) +#define DRM_IOCTL_AGP_BIND DRM_IOW( 0x36, struct drm_agp_binding) +#define DRM_IOCTL_AGP_UNBIND DRM_IOW( 0x37, struct drm_agp_binding) + +#define DRM_IOCTL_SG_ALLOC DRM_IOWR(0x38, struct drm_scatter_gather) +#define DRM_IOCTL_SG_FREE DRM_IOW( 0x39, struct drm_scatter_gather) + +#define DRM_IOCTL_WAIT_VBLANK DRM_IOWR(0x3a, union drm_wait_vblank) + +#define DRM_IOCTL_CRTC_GET_SEQUENCE DRM_IOWR(0x3b, struct drm_crtc_get_sequence) +#define DRM_IOCTL_CRTC_QUEUE_SEQUENCE DRM_IOWR(0x3c, struct drm_crtc_queue_sequence) + +#define DRM_IOCTL_UPDATE_DRAW DRM_IOW(0x3f, struct drm_update_draw) + +#define DRM_IOCTL_MODE_GETRESOURCES DRM_IOWR(0xA0, struct drm_mode_card_res) +#define DRM_IOCTL_MODE_GETCRTC DRM_IOWR(0xA1, struct drm_mode_crtc) +#define DRM_IOCTL_MODE_SETCRTC DRM_IOWR(0xA2, struct drm_mode_crtc) +#define DRM_IOCTL_MODE_CURSOR DRM_IOWR(0xA3, struct drm_mode_cursor) +#define DRM_IOCTL_MODE_GETGAMMA DRM_IOWR(0xA4, struct drm_mode_crtc_lut) +#define DRM_IOCTL_MODE_SETGAMMA DRM_IOWR(0xA5, struct drm_mode_crtc_lut) +#define DRM_IOCTL_MODE_GETENCODER DRM_IOWR(0xA6, struct drm_mode_get_encoder) +#define DRM_IOCTL_MODE_GETCONNECTOR DRM_IOWR(0xA7, struct drm_mode_get_connector) +#define DRM_IOCTL_MODE_ATTACHMODE DRM_IOWR(0xA8, struct drm_mode_mode_cmd) /* deprecated (never worked) */ +#define DRM_IOCTL_MODE_DETACHMODE DRM_IOWR(0xA9, struct drm_mode_mode_cmd) /* deprecated (never worked) */ + +#define DRM_IOCTL_MODE_GETPROPERTY DRM_IOWR(0xAA, struct drm_mode_get_property) +#define DRM_IOCTL_MODE_SETPROPERTY DRM_IOWR(0xAB, struct drm_mode_connector_set_property) +#define DRM_IOCTL_MODE_GETPROPBLOB DRM_IOWR(0xAC, struct drm_mode_get_blob) +#define DRM_IOCTL_MODE_GETFB DRM_IOWR(0xAD, struct drm_mode_fb_cmd) +#define DRM_IOCTL_MODE_ADDFB DRM_IOWR(0xAE, struct drm_mode_fb_cmd) +/** + * DRM_IOCTL_MODE_RMFB - Remove a framebuffer. + * + * This removes a framebuffer previously added via ADDFB/ADDFB2. The IOCTL + * argument is a framebuffer object ID. + * + * Warning: removing a framebuffer currently in-use on an enabled plane will + * disable that plane. The CRTC the plane is linked to may also be disabled + * (depending on driver capabilities). + */ +#define DRM_IOCTL_MODE_RMFB DRM_IOWR(0xAF, unsigned int) +#define DRM_IOCTL_MODE_PAGE_FLIP DRM_IOWR(0xB0, struct drm_mode_crtc_page_flip) +#define DRM_IOCTL_MODE_DIRTYFB DRM_IOWR(0xB1, struct drm_mode_fb_dirty_cmd) + +/** + * DRM_IOCTL_MODE_CREATE_DUMB - Create a new dumb buffer object. + * + * KMS dumb buffers provide a very primitive way to allocate a buffer object + * suitable for scanout and map it for software rendering. KMS dumb buffers are + * not suitable for hardware-accelerated rendering nor video decoding. KMS dumb + * buffers are not suitable to be displayed on any other device than the KMS + * device where they were allocated from. Also see + * :ref:`kms_dumb_buffer_objects`. + * + * The IOCTL argument is a struct drm_mode_create_dumb. + * + * User-space is expected to create a KMS dumb buffer via this IOCTL, then add + * it as a KMS framebuffer via &DRM_IOCTL_MODE_ADDFB and map it via + * &DRM_IOCTL_MODE_MAP_DUMB. 
+ * + * &DRM_CAP_DUMB_BUFFER indicates whether this IOCTL is supported. + * &DRM_CAP_DUMB_PREFERRED_DEPTH and &DRM_CAP_DUMB_PREFER_SHADOW indicate + * driver preferences for dumb buffers. + */ +#define DRM_IOCTL_MODE_CREATE_DUMB DRM_IOWR(0xB2, struct drm_mode_create_dumb) +#define DRM_IOCTL_MODE_MAP_DUMB DRM_IOWR(0xB3, struct drm_mode_map_dumb) +#define DRM_IOCTL_MODE_DESTROY_DUMB DRM_IOWR(0xB4, struct drm_mode_destroy_dumb) +#define DRM_IOCTL_MODE_GETPLANERESOURCES DRM_IOWR(0xB5, struct drm_mode_get_plane_res) +#define DRM_IOCTL_MODE_GETPLANE DRM_IOWR(0xB6, struct drm_mode_get_plane) +#define DRM_IOCTL_MODE_SETPLANE DRM_IOWR(0xB7, struct drm_mode_set_plane) +#define DRM_IOCTL_MODE_ADDFB2 DRM_IOWR(0xB8, struct drm_mode_fb_cmd2) +#define DRM_IOCTL_MODE_OBJ_GETPROPERTIES DRM_IOWR(0xB9, struct drm_mode_obj_get_properties) +#define DRM_IOCTL_MODE_OBJ_SETPROPERTY DRM_IOWR(0xBA, struct drm_mode_obj_set_property) +#define DRM_IOCTL_MODE_CURSOR2 DRM_IOWR(0xBB, struct drm_mode_cursor2) +#define DRM_IOCTL_MODE_ATOMIC DRM_IOWR(0xBC, struct drm_mode_atomic) +#define DRM_IOCTL_MODE_CREATEPROPBLOB DRM_IOWR(0xBD, struct drm_mode_create_blob) +#define DRM_IOCTL_MODE_DESTROYPROPBLOB DRM_IOWR(0xBE, struct drm_mode_destroy_blob) + +#define DRM_IOCTL_SYNCOBJ_CREATE DRM_IOWR(0xBF, struct drm_syncobj_create) +#define DRM_IOCTL_SYNCOBJ_DESTROY DRM_IOWR(0xC0, struct drm_syncobj_destroy) +#define DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD DRM_IOWR(0xC1, struct drm_syncobj_handle) +#define DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE DRM_IOWR(0xC2, struct drm_syncobj_handle) +#define DRM_IOCTL_SYNCOBJ_WAIT DRM_IOWR(0xC3, struct drm_syncobj_wait) +#define DRM_IOCTL_SYNCOBJ_RESET DRM_IOWR(0xC4, struct drm_syncobj_array) +#define DRM_IOCTL_SYNCOBJ_SIGNAL DRM_IOWR(0xC5, struct drm_syncobj_array) + +#define DRM_IOCTL_MODE_CREATE_LEASE DRM_IOWR(0xC6, struct drm_mode_create_lease) +#define DRM_IOCTL_MODE_LIST_LESSEES DRM_IOWR(0xC7, struct drm_mode_list_lessees) +#define DRM_IOCTL_MODE_GET_LEASE DRM_IOWR(0xC8, struct drm_mode_get_lease) +#define DRM_IOCTL_MODE_REVOKE_LEASE DRM_IOWR(0xC9, struct drm_mode_revoke_lease) + +#define DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT DRM_IOWR(0xCA, struct drm_syncobj_timeline_wait) +#define DRM_IOCTL_SYNCOBJ_QUERY DRM_IOWR(0xCB, struct drm_syncobj_timeline_array) +#define DRM_IOCTL_SYNCOBJ_TRANSFER DRM_IOWR(0xCC, struct drm_syncobj_transfer) +#define DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL DRM_IOWR(0xCD, struct drm_syncobj_timeline_array) + +/** + * DRM_IOCTL_MODE_GETFB2 - Get framebuffer metadata. + * + * This queries metadata about a framebuffer. User-space fills + * &drm_mode_fb_cmd2.fb_id as the input, and the kernels fills the rest of the + * struct as the output. + * + * If the client is DRM master or has &CAP_SYS_ADMIN, &drm_mode_fb_cmd2.handles + * will be filled with GEM buffer handles. Fresh new GEM handles are always + * returned, even if another GEM handle referring to the same memory object + * already exists on the DRM file description. The caller is responsible for + * removing the new handles, e.g. via the &DRM_IOCTL_GEM_CLOSE IOCTL. The same + * new handle will be returned for multiple planes in case they use the same + * memory object. Planes are valid until one has a zero handle -- this can be + * used to compute the number of planes. + * + * Otherwise, &drm_mode_fb_cmd2.handles will be zeroed and planes are valid + * until one has a zero &drm_mode_fb_cmd2.pitches. 
+ * + * If the framebuffer has a format modifier, &DRM_MODE_FB_MODIFIERS will be set + * in &drm_mode_fb_cmd2.flags and &drm_mode_fb_cmd2.modifier will contain the + * modifier. Otherwise, user-space must ignore &drm_mode_fb_cmd2.modifier. + * + * To obtain DMA-BUF FDs for each plane without leaking GEM handles, user-space + * can export each handle via &DRM_IOCTL_PRIME_HANDLE_TO_FD, then immediately + * close each unique handle via &DRM_IOCTL_GEM_CLOSE, making sure to not + * double-close handles which are specified multiple times in the array. + */ +#define DRM_IOCTL_MODE_GETFB2 DRM_IOWR(0xCE, struct drm_mode_fb_cmd2) + +#define DRM_IOCTL_SYNCOBJ_EVENTFD DRM_IOWR(0xCF, struct drm_syncobj_eventfd) + +/** + * DRM_IOCTL_MODE_CLOSEFB - Close a framebuffer. + * + * This closes a framebuffer previously added via ADDFB/ADDFB2. The IOCTL + * argument is a framebuffer object ID. + * + * This IOCTL is similar to &DRM_IOCTL_MODE_RMFB, except it doesn't disable + * planes and CRTCs. As long as the framebuffer is used by a plane, it's kept + * alive. When the plane no longer uses the framebuffer (because the + * framebuffer is replaced with another one, or the plane is disabled), the + * framebuffer is cleaned up. + * + * This is useful to implement flicker-free transitions between two processes. + * + * Depending on the threat model, user-space may want to ensure that the + * framebuffer doesn't expose any sensitive user information: closed + * framebuffers attached to a plane can be read back by the next DRM master. + */ +#define DRM_IOCTL_MODE_CLOSEFB DRM_IOWR(0xD0, struct drm_mode_closefb) + +/* + * Device specific ioctls should only be in their respective headers + * The device specific ioctl range is from 0x40 to 0x9f. + * Generic IOCTLS restart at 0xA0. + * + * \sa drmCommandNone(), drmCommandRead(), drmCommandWrite(), and + * drmCommandReadWrite(). + */ +#define DRM_COMMAND_BASE 0x40 +#define DRM_COMMAND_END 0xA0 + +/** + * struct drm_event - Header for DRM events + * @type: event type. + * @length: total number of payload bytes (including header). + * + * This struct is a header for events written back to user-space on the DRM FD. + * A read on the DRM FD will always only return complete events: e.g. if the + * read buffer is 100 bytes large and there are two 64 byte events pending, + * only one will be returned. + * + * Event types 0 - 0x7fffffff are generic DRM events, 0x80000000 and + * up are chipset specific. Generic DRM events include &DRM_EVENT_VBLANK, + * &DRM_EVENT_FLIP_COMPLETE and &DRM_EVENT_CRTC_SEQUENCE. + */ +struct drm_event { + __u32 type; + __u32 length; +}; + +/** + * DRM_EVENT_VBLANK - vertical blanking event + * + * This event is sent in response to &DRM_IOCTL_WAIT_VBLANK with the + * &_DRM_VBLANK_EVENT flag set. + * + * The event payload is a struct drm_event_vblank. + */ +#define DRM_EVENT_VBLANK 0x01 +/** + * DRM_EVENT_FLIP_COMPLETE - page-flip completion event + * + * This event is sent in response to an atomic commit or legacy page-flip with + * the &DRM_MODE_PAGE_FLIP_EVENT flag set. + * + * The event payload is a struct drm_event_vblank. + */ +#define DRM_EVENT_FLIP_COMPLETE 0x02 +/** + * DRM_EVENT_CRTC_SEQUENCE - CRTC sequence event + * + * This event is sent in response to &DRM_IOCTL_CRTC_QUEUE_SEQUENCE. + * + * The event payload is a struct drm_event_crtc_sequence. 
+ */ +#define DRM_EVENT_CRTC_SEQUENCE 0x03 + +struct drm_event_vblank { + struct drm_event base; + __u64 user_data; + __u32 tv_sec; + __u32 tv_usec; + __u32 sequence; + __u32 crtc_id; /* 0 on older kernels that do not support this */ +}; + +/* Event delivered at sequence. Time stamp marks when the first pixel + * of the refresh cycle leaves the display engine for the display + */ +struct drm_event_crtc_sequence { + struct drm_event base; + __u64 user_data; + __s64 time_ns; + __u64 sequence; +}; + +/* typedef area */ +typedef struct drm_clip_rect drm_clip_rect_t; +typedef struct drm_drawable_info drm_drawable_info_t; +typedef struct drm_tex_region drm_tex_region_t; +typedef struct drm_hw_lock drm_hw_lock_t; +typedef struct drm_version drm_version_t; +typedef struct drm_unique drm_unique_t; +typedef struct drm_list drm_list_t; +typedef struct drm_block drm_block_t; +typedef struct drm_control drm_control_t; +typedef enum drm_map_type drm_map_type_t; +typedef enum drm_map_flags drm_map_flags_t; +typedef struct drm_ctx_priv_map drm_ctx_priv_map_t; +typedef struct drm_map drm_map_t; +typedef struct drm_client drm_client_t; +typedef enum drm_stat_type drm_stat_type_t; +typedef struct drm_stats drm_stats_t; +typedef enum drm_lock_flags drm_lock_flags_t; +typedef struct drm_lock drm_lock_t; +typedef enum drm_dma_flags drm_dma_flags_t; +typedef struct drm_buf_desc drm_buf_desc_t; +typedef struct drm_buf_info drm_buf_info_t; +typedef struct drm_buf_free drm_buf_free_t; +typedef struct drm_buf_pub drm_buf_pub_t; +typedef struct drm_buf_map drm_buf_map_t; +typedef struct drm_dma drm_dma_t; +typedef union drm_wait_vblank drm_wait_vblank_t; +typedef struct drm_agp_mode drm_agp_mode_t; +typedef enum drm_ctx_flags drm_ctx_flags_t; +typedef struct drm_ctx drm_ctx_t; +typedef struct drm_ctx_res drm_ctx_res_t; +typedef struct drm_draw drm_draw_t; +typedef struct drm_update_draw drm_update_draw_t; +typedef struct drm_auth drm_auth_t; +typedef struct drm_irq_busid drm_irq_busid_t; +typedef enum drm_vblank_seq_type drm_vblank_seq_type_t; + +typedef struct drm_agp_buffer drm_agp_buffer_t; +typedef struct drm_agp_binding drm_agp_binding_t; +typedef struct drm_agp_info drm_agp_info_t; +typedef struct drm_scatter_gather drm_scatter_gather_t; +typedef struct drm_set_version drm_set_version_t; + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/ggml/src/ggml-remotingfrontend/include/drm-uapi/virtgpu_drm.h b/ggml/src/ggml-remotingfrontend/include/drm-uapi/virtgpu_drm.h new file mode 100644 index 0000000000000..9debb320c34be --- /dev/null +++ b/ggml/src/ggml-remotingfrontend/include/drm-uapi/virtgpu_drm.h @@ -0,0 +1,276 @@ +/* + * Copyright 2013 Red Hat + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef VIRTGPU_DRM_H +#define VIRTGPU_DRM_H + +#include "drm.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +/* Please note that modifications to all structs defined here are + * subject to backwards-compatibility constraints. + * + * Do not use pointers, use __u64 instead for 32 bit / 64 bit user/kernel + * compatibility Keep fields aligned to their size + */ + +#define DRM_VIRTGPU_MAP 0x01 +#define DRM_VIRTGPU_EXECBUFFER 0x02 +#define DRM_VIRTGPU_GETPARAM 0x03 +#define DRM_VIRTGPU_RESOURCE_CREATE 0x04 +#define DRM_VIRTGPU_RESOURCE_INFO 0x05 +#define DRM_VIRTGPU_TRANSFER_FROM_HOST 0x06 +#define DRM_VIRTGPU_TRANSFER_TO_HOST 0x07 +#define DRM_VIRTGPU_WAIT 0x08 +#define DRM_VIRTGPU_GET_CAPS 0x09 +#define DRM_VIRTGPU_RESOURCE_CREATE_BLOB 0x0a +#define DRM_VIRTGPU_CONTEXT_INIT 0x0b + +#define VIRTGPU_EXECBUF_FENCE_FD_IN 0x01 +#define VIRTGPU_EXECBUF_FENCE_FD_OUT 0x02 +#define VIRTGPU_EXECBUF_RING_IDX 0x04 +#define VIRTGPU_EXECBUF_FLAGS (\ + VIRTGPU_EXECBUF_FENCE_FD_IN |\ + VIRTGPU_EXECBUF_FENCE_FD_OUT |\ + VIRTGPU_EXECBUF_RING_IDX |\ + 0) + +struct drm_virtgpu_map { + __u64 offset; /* use for mmap system call */ + __u32 handle; + __u32 pad; +}; + +#define VIRTGPU_EXECBUF_SYNCOBJ_RESET 0x01 +#define VIRTGPU_EXECBUF_SYNCOBJ_FLAGS ( \ + VIRTGPU_EXECBUF_SYNCOBJ_RESET | \ + 0) +struct drm_virtgpu_execbuffer_syncobj { + __u32 handle; + __u32 flags; + __u64 point; +}; + +/* fence_fd is modified on success if VIRTGPU_EXECBUF_FENCE_FD_OUT flag is set. */ +struct drm_virtgpu_execbuffer { + __u32 flags; + __u32 size; + __u64 command; /* void* */ + __u64 bo_handles; + __u32 num_bo_handles; + __s32 fence_fd; /* in/out fence fd (see VIRTGPU_EXECBUF_FENCE_FD_IN/OUT) */ + __u32 ring_idx; /* command ring index (see VIRTGPU_EXECBUF_RING_IDX) */ + __u32 syncobj_stride; /* size of @drm_virtgpu_execbuffer_syncobj */ + __u32 num_in_syncobjs; + __u32 num_out_syncobjs; + __u64 in_syncobjs; + __u64 out_syncobjs; +}; + +#define VIRTGPU_PARAM_3D_FEATURES 1 /* do we have 3D features in the hw */ +#define VIRTGPU_PARAM_CAPSET_QUERY_FIX 2 /* do we have the capset fix */ +#define VIRTGPU_PARAM_RESOURCE_BLOB 3 /* DRM_VIRTGPU_RESOURCE_CREATE_BLOB */ +#define VIRTGPU_PARAM_HOST_VISIBLE 4 /* Host blob resources are mappable */ +#define VIRTGPU_PARAM_CROSS_DEVICE 5 /* Cross virtio-device resource sharing */ +#define VIRTGPU_PARAM_CONTEXT_INIT 6 /* DRM_VIRTGPU_CONTEXT_INIT */ +#define VIRTGPU_PARAM_SUPPORTED_CAPSET_IDs 7 /* Bitmask of supported capability set ids */ +#define VIRTGPU_PARAM_EXPLICIT_DEBUG_NAME 8 /* Ability to set debug name from userspace */ + +struct drm_virtgpu_getparam { + __u64 param; + __u64 value; +}; + +/* NO_BO flags? NO resource flag? */ +/* resource flag for y_0_top */ +struct drm_virtgpu_resource_create { + __u32 target; + __u32 format; + __u32 bind; + __u32 width; + __u32 height; + __u32 depth; + __u32 array_size; + __u32 last_level; + __u32 nr_samples; + __u32 flags; + __u32 bo_handle; /* if this is set - recreate a new resource attached to this bo ? 
*/ + __u32 res_handle; /* returned by kernel */ + __u32 size; /* validate transfer in the host */ + __u32 stride; /* validate transfer in the host */ +}; + +struct drm_virtgpu_resource_info { + __u32 bo_handle; + __u32 res_handle; + __u32 size; + __u32 blob_mem; +}; + +struct drm_virtgpu_3d_box { + __u32 x; + __u32 y; + __u32 z; + __u32 w; + __u32 h; + __u32 d; +}; + +struct drm_virtgpu_3d_transfer_to_host { + __u32 bo_handle; + struct drm_virtgpu_3d_box box; + __u32 level; + __u32 offset; + __u32 stride; + __u32 layer_stride; +}; + +struct drm_virtgpu_3d_transfer_from_host { + __u32 bo_handle; + struct drm_virtgpu_3d_box box; + __u32 level; + __u32 offset; + __u32 stride; + __u32 layer_stride; +}; + +#define VIRTGPU_WAIT_NOWAIT 1 /* like it */ +struct drm_virtgpu_3d_wait { + __u32 handle; /* 0 is an invalid handle */ + __u32 flags; +}; + +#define VIRTGPU_DRM_CAPSET_VIRGL 1 +#define VIRTGPU_DRM_CAPSET_VIRGL2 2 +#define VIRTGPU_DRM_CAPSET_GFXSTREAM_VULKAN 3 +#define VIRTGPU_DRM_CAPSET_VENUS 4 +#define VIRTGPU_DRM_CAPSET_CROSS_DOMAIN 5 +#define VIRTGPU_DRM_CAPSET_DRM 6 +struct drm_virtgpu_get_caps { + __u32 cap_set_id; + __u32 cap_set_ver; + __u64 addr; + __u32 size; + __u32 pad; +}; + +struct drm_virtgpu_resource_create_blob { +#define VIRTGPU_BLOB_MEM_GUEST 0x0001 +#define VIRTGPU_BLOB_MEM_HOST3D 0x0002 +#define VIRTGPU_BLOB_MEM_HOST3D_GUEST 0x0003 + +#define VIRTGPU_BLOB_FLAG_USE_MAPPABLE 0x0001 +#define VIRTGPU_BLOB_FLAG_USE_SHAREABLE 0x0002 +#define VIRTGPU_BLOB_FLAG_USE_CROSS_DEVICE 0x0004 + /* zero is invalid blob_mem */ + __u32 blob_mem; + __u32 blob_flags; + __u32 bo_handle; + __u32 res_handle; + __u64 size; + + /* + * for 3D contexts with VIRTGPU_BLOB_MEM_HOST3D_GUEST and + * VIRTGPU_BLOB_MEM_HOST3D otherwise, must be zero. + */ + __u32 pad; + __u32 cmd_size; + __u64 cmd; + __u64 blob_id; +}; + +#define VIRTGPU_CONTEXT_PARAM_CAPSET_ID 0x0001 +#define VIRTGPU_CONTEXT_PARAM_NUM_RINGS 0x0002 +#define VIRTGPU_CONTEXT_PARAM_POLL_RINGS_MASK 0x0003 +#define VIRTGPU_CONTEXT_PARAM_DEBUG_NAME 0x0004 +struct drm_virtgpu_context_set_param { + __u64 param; + __u64 value; +}; + +struct drm_virtgpu_context_init { + __u32 num_params; + __u32 pad; + + /* pointer to drm_virtgpu_context_set_param array */ + __u64 ctx_set_params; +}; + +/* + * Event code that's given when VIRTGPU_CONTEXT_PARAM_POLL_RINGS_MASK is in + * effect. The event size is sizeof(drm_event), since there is no additional + * payload. 
+ */
+#define VIRTGPU_EVENT_FENCE_SIGNALED 0x90000000
+
+#define DRM_IOCTL_VIRTGPU_MAP \
+ DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_MAP, struct drm_virtgpu_map)
+
+#define DRM_IOCTL_VIRTGPU_EXECBUFFER \
+ DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_EXECBUFFER,\
+ struct drm_virtgpu_execbuffer)
+
+#define DRM_IOCTL_VIRTGPU_GETPARAM \
+ DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_GETPARAM,\
+ struct drm_virtgpu_getparam)
+
+#define DRM_IOCTL_VIRTGPU_RESOURCE_CREATE \
+ DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_RESOURCE_CREATE, \
+ struct drm_virtgpu_resource_create)
+
+#define DRM_IOCTL_VIRTGPU_RESOURCE_INFO \
+ DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_RESOURCE_INFO, \
+ struct drm_virtgpu_resource_info)
+
+#define DRM_IOCTL_VIRTGPU_TRANSFER_FROM_HOST \
+ DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_TRANSFER_FROM_HOST, \
+ struct drm_virtgpu_3d_transfer_from_host)
+
+#define DRM_IOCTL_VIRTGPU_TRANSFER_TO_HOST \
+ DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_TRANSFER_TO_HOST, \
+ struct drm_virtgpu_3d_transfer_to_host)
+
+#define DRM_IOCTL_VIRTGPU_WAIT \
+ DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_WAIT, \
+ struct drm_virtgpu_3d_wait)
+
+#define DRM_IOCTL_VIRTGPU_GET_CAPS \
+ DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_GET_CAPS, \
+ struct drm_virtgpu_get_caps)
+
+#define DRM_IOCTL_VIRTGPU_RESOURCE_CREATE_BLOB \
+ DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_RESOURCE_CREATE_BLOB, \
+ struct drm_virtgpu_resource_create_blob)
+
+#define DRM_IOCTL_VIRTGPU_CONTEXT_INIT \
+ DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_CONTEXT_INIT, \
+ struct drm_virtgpu_context_init)
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/ggml/src/ggml-remotingfrontend/include/venus_hw.h b/ggml/src/ggml-remotingfrontend/include/venus_hw.h
new file mode 100644
index 0000000000000..3ef774b8259d3
--- /dev/null
+++ b/ggml/src/ggml-remotingfrontend/include/venus_hw.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2020 Chromium
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef VENUS_HW_H
+#define VENUS_HW_H
+
+#include <stdint.h>
+
+struct virgl_renderer_capset_venus {
+ uint32_t wire_format_version;
+ uint32_t vk_xml_version;
+ uint32_t vk_ext_command_serialization_spec_version;
+ uint32_t vk_mesa_venus_protocol_spec_version;
+
+ /* This flag indicates render server config, and will be needed until drm
+ * virtio-gpu blob mem gets fixed to attach_resource before resource_map.
+ */ + uint32_t supports_blob_id_0; + + /* Extension number N, where N is defined by the Vulkan spec, corresponds + * to bit [N / 32] & (1 << N % 32). The below mask1 covers the first 1023 + * Vulkan extensions (numbered from 1 to 1023). + * + * Bit (mask1[0] & 0x1) is used for backward compatibility purpose. When + * that bit is set, the extension mask(s) are valid. Otherwise, all the + * extensions are assumed to be supported by the renderer side protocol. + */ + uint32_t vk_extension_mask1[32]; + + /* The single-threaded renderer cannot afford potential blocking calls. It + * also leads to GPU lost if the wait depends on a following command. This + * capset allows such blocking calls to passthrough from the clients, and + * shifts the responsibilities to the client drivers. + */ + uint32_t allow_vk_wait_syncs; + + /* This flag indicates that the renderer supports multiple fencing + * timelines. The client driver is expected to associate each VkQueue with + * one of these timelines at queue creation by binding it with an unused + * ring_idx. Queues created without a ring_idx binding are associated to a + * shared legacy timeline. The special ring_idx==0 is reserved for CPU + * fences that are signaled by the renderer immediately upon consumption of + * the associated renderer submission. + */ + uint32_t supports_multiple_timelines; + + /* This flag indicates to the guest that hypervisor does not support memory + * pages injections and blob allocations must be done by guest from the + * dedicated heap (Host visible memory). + */ + uint32_t use_guest_vram; +}; + +#endif /* VENUS_HW_H */ diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h index a7ed708851d8f..26510b20bc479 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-impl.h @@ -1,8 +1,8 @@ #include "ggml-backend-impl.h" #include "ggml-remoting.h" #include "virtgpu.h" -#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/apir_backend.h" -#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h" +#include "../ggml-remotingbackend/shared/apir_backend.h" +#include "../ggml-remotingbackend/shared/venus_cs_ggml.h" #define CACHED // printf("INFO: ### found response in the cache %s\n", __func__)o diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h index bbe94f14300ef..cc159e071e218 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward.h @@ -4,7 +4,7 @@ #include "virtgpu-utils.h" -#include "/Users/kevinpouget/remoting/llama_cpp/src/ggml/src/ggml-remotingbackend/shared/apir_backend.h" +#include "../ggml-remotingbackend/shared/apir_backend.h" /* device */ int apir_device_get_count(struct virtgpu *gpu); From 38b13110e7c7319f296aa03b66cedceefc62e4b9 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 11 Jun 2025 14:30:28 +0200 Subject: [PATCH 110/117] update the custom scripts --- build-xcframework.sh | 526 ------------------------------------------- podman_compile.sh | 5 +- prepare.remoting.sh | 2 + run.remoting.sh | 39 ++-- 4 files changed, 22 insertions(+), 550 deletions(-) delete mode 100755 build-xcframework.sh diff --git a/build-xcframework.sh b/build-xcframework.sh deleted file mode 100755 index 1b9091d288cc8..0000000000000 --- a/build-xcframework.sh +++ /dev/null @@ -1,526 +0,0 @@ -#!/bin/bash -# 
-# Options -IOS_MIN_OS_VERSION=16.4 -MACOS_MIN_OS_VERSION=13.3 -VISIONOS_MIN_OS_VERSION=1.0 -TVOS_MIN_OS_VERSION=16.4 - -BUILD_SHARED_LIBS=OFF -LLAMA_BUILD_EXAMPLES=OFF -LLAMA_BUILD_TESTS=OFF -LLAMA_BUILD_SERVER=OFF -GGML_METAL=ON -GGML_METAL_EMBED_LIBRARY=ON -GGML_BLAS_DEFAULT=ON -GGML_METAL_USE_BF16=ON -GGML_OPENMP=OFF - -COMMON_C_FLAGS="-Wno-macro-redefined -Wno-shorten-64-to-32 -Wno-unused-command-line-argument -g" -COMMON_CXX_FLAGS="-Wno-macro-redefined -Wno-shorten-64-to-32 -Wno-unused-command-line-argument -g" - -# Common options for all builds -COMMON_CMAKE_ARGS=( - -DCMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_REQUIRED=NO - -DCMAKE_XCODE_ATTRIBUTE_CODE_SIGN_IDENTITY="" - -DCMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_ALLOWED=NO - -DCMAKE_XCODE_ATTRIBUTE_DEBUG_INFORMATION_FORMAT="dwarf-with-dsym" - -DCMAKE_XCODE_ATTRIBUTE_GCC_GENERATE_DEBUGGING_SYMBOLS=YES - -DCMAKE_XCODE_ATTRIBUTE_COPY_PHASE_STRIP=NO - -DCMAKE_XCODE_ATTRIBUTE_STRIP_INSTALLED_PRODUCT=NO - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml - -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} - -DLLAMA_BUILD_EXAMPLES=${LLAMA_BUILD_EXAMPLES} - -DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS} - -DLLAMA_BUILD_SERVER=${LLAMA_BUILD_SERVER} - -DGGML_METAL_EMBED_LIBRARY=${GGML_METAL_EMBED_LIBRARY} - -DGGML_BLAS_DEFAULT=${GGML_BLAS_DEFAULT} - -DGGML_METAL=${GGML_METAL} - -DGGML_METAL_USE_BF16=${GGML_METAL_USE_BF16} - -DGGML_NATIVE=OFF - -DGGML_OPENMP=${GGML_OPENMP} -) - -check_required_tool() { - local tool=$1 - local install_message=$2 - - if ! command -v $tool &> /dev/null; then - echo "Error: $tool is required but not found." - echo "$install_message" - exit 1 - fi -} -echo "Checking for required tools..." -check_required_tool "cmake" "Please install CMake 3.28.0 or later (brew install cmake)" -check_required_tool "xcodebuild" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)" -check_required_tool "libtool" "Please install libtool which should be available with Xcode Command Line Tools (CLT). 
Make sure Xcode CLT is installed (xcode-select --install)" -check_required_tool "dsymutil" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)" - -set -e - -## Clean up previous builds -rm -rf build-apple -rm -rf build-ios-sim -rm -rf build-ios-device -rm -rf build-macos -rm -rf build-visionos -rm -rf build-visionos-sim -rm -rf build-tvos-sim -rm -rf build-tvos-device - -# Setup the xcframework build directory structure -setup_framework_structure() { - local build_dir=$1 - local min_os_version=$2 - local platform=$3 # "ios", "macos", "visionos", or "tvos" - local framework_name="llama" - - echo "Creating ${platform}-style framework structure for ${build_dir}" - - if [[ "$platform" == "macos" ]]; then - # macOS versioned structure uses versioned directories - mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Headers - mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Modules - mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Resources - - # Create symbolic links - ln -sf A ${build_dir}/framework/${framework_name}.framework/Versions/Current - ln -sf Versions/Current/Headers ${build_dir}/framework/${framework_name}.framework/Headers - ln -sf Versions/Current/Modules ${build_dir}/framework/${framework_name}.framework/Modules - ln -sf Versions/Current/Resources ${build_dir}/framework/${framework_name}.framework/Resources - ln -sf Versions/Current/${framework_name} ${build_dir}/framework/${framework_name}.framework/${framework_name} - - # Set header and module paths - local header_path=${build_dir}/framework/${framework_name}.framework/Versions/A/Headers/ - local module_path=${build_dir}/framework/${framework_name}.framework/Versions/A/Modules/ - else - # iOS/VisionOS/tvOS use a flat structure - mkdir -p ${build_dir}/framework/${framework_name}.framework/Headers - mkdir -p ${build_dir}/framework/${framework_name}.framework/Modules - - # Remove any existing structure to ensure clean build - rm -rf ${build_dir}/framework/${framework_name}.framework/Versions - - # Set header and module paths - local header_path=${build_dir}/framework/${framework_name}.framework/Headers/ - local module_path=${build_dir}/framework/${framework_name}.framework/Modules/ - fi - - # Copy all required headers (common for all platforms) - cp include/llama.h ${header_path} - cp ggml/include/ggml.h ${header_path} - cp ggml/include/ggml-alloc.h ${header_path} - cp ggml/include/ggml-backend.h ${header_path} - cp ggml/include/ggml-metal.h ${header_path} - cp ggml/include/ggml-cpu.h ${header_path} - cp ggml/include/ggml-blas.h ${header_path} - cp ggml/include/gguf.h ${header_path} - - # Create module map (common for all platforms) - cat > ${module_path}module.modulemap << EOF -framework module llama { - header "llama.h" - header "ggml.h" - header "ggml-alloc.h" - header "ggml-backend.h" - header "ggml-metal.h" - header "ggml-cpu.h" - header "ggml-blas.h" - header "gguf.h" - - link "c++" - link framework "Accelerate" - link framework "Metal" - link framework "Foundation" - - export * -} -EOF - - # Platform-specific settings for Info.plist - local platform_name="" - local sdk_name="" - local supported_platform="" - - case "$platform" in - "ios") - platform_name="iphoneos" - sdk_name="iphoneos${min_os_version}" - supported_platform="iPhoneOS" - local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist" - local device_family=' UIDeviceFamily - - 1 - 2 - ' - ;; - "macos") - platform_name="macosx" - 
sdk_name="macosx${min_os_version}" - supported_platform="MacOSX" - local plist_path="${build_dir}/framework/${framework_name}.framework/Versions/A/Resources/Info.plist" - local device_family="" - ;; - "visionos") - platform_name="xros" - sdk_name="xros${min_os_version}" - supported_platform="XRPlatform" - local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist" - local device_family="" - ;; - "tvos") - platform_name="appletvos" - sdk_name="appletvos${min_os_version}" - supported_platform="AppleTVOS" - local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist" - local device_family=' UIDeviceFamily - - 3 - ' - ;; - esac - - # Create Info.plist - cat > ${plist_path} << EOF - - - - - CFBundleDevelopmentRegion - en - CFBundleExecutable - llama - CFBundleIdentifier - org.ggml.llama - CFBundleInfoDictionaryVersion - 6.0 - CFBundleName - llama - CFBundlePackageType - FMWK - CFBundleShortVersionString - 1.0 - CFBundleVersion - 1 - MinimumOSVersion - ${min_os_version} - CFBundleSupportedPlatforms - - ${supported_platform} - ${device_family} - DTPlatformName - ${platform_name} - DTSDKName - ${sdk_name} - - -EOF -} - -# Create dynamic libraries from static libraries. -combine_static_libraries() { - local build_dir="$1" - local release_dir="$2" - local platform="$3" # "ios", "macos", "visionos", or "tvos" - local is_simulator="$4" - local base_dir="$(pwd)" - local framework_name="llama" - - # Determine output path based on platform - local output_lib="" - if [[ "$platform" == "macos" ]]; then - # macOS uses versioned structure - output_lib="${build_dir}/framework/${framework_name}.framework/Versions/A/${framework_name}" - else - # iOS, visionOS, and tvOS use a directory flat structure - output_lib="${build_dir}/framework/${framework_name}.framework/${framework_name}" - fi - - local libs=( - "${base_dir}/${build_dir}/src/${release_dir}/libllama.a" - "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml.a" - "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-base.a" - "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-cpu.a" - "${base_dir}/${build_dir}/ggml/src/ggml-metal/${release_dir}/libggml-metal.a" - "${base_dir}/${build_dir}/ggml/src/ggml-blas/${release_dir}/libggml-blas.a" - ) - - # Create temporary directory for processing - local temp_dir="${base_dir}/${build_dir}/temp" - mkdir -p "${temp_dir}" - - # Since we have multiple architectures libtool will find object files that do not - # match the target architecture. We suppress these warnings. - libtool -static -o "${temp_dir}/combined.a" "${libs[@]}" 2> /dev/null - - # Determine SDK, architectures, and install_name based on platform and simulator flag. 
- local sdk="" - local archs="" - local min_version_flag="" - local install_name="" - - case "$platform" in - "ios") - if [[ "$is_simulator" == "true" ]]; then - sdk="iphonesimulator" - archs="arm64 x86_64" - min_version_flag="-mios-simulator-version-min=${IOS_MIN_OS_VERSION}" - else - sdk="iphoneos" - archs="arm64" - min_version_flag="-mios-version-min=${IOS_MIN_OS_VERSION}" - fi - install_name="@rpath/llama.framework/llama" - ;; - "macos") - sdk="macosx" - archs="arm64 x86_64" - min_version_flag="-mmacosx-version-min=${MACOS_MIN_OS_VERSION}" - install_name="@rpath/llama.framework/Versions/Current/llama" - ;; - "visionos") - if [[ "$is_simulator" == "true" ]]; then - sdk="xrsimulator" - archs="arm64 x86_64" - min_version_flag="-mtargetos=xros${VISIONOS_MIN_OS_VERSION}-simulator" - else - sdk="xros" - archs="arm64" - min_version_flag="-mtargetos=xros${VISIONOS_MIN_OS_VERSION}" - fi - # Use flat structure for visionOS, same as iOS - install_name="@rpath/llama.framework/llama" - ;; - "tvos") - if [[ "$is_simulator" == "true" ]]; then - sdk="appletvsimulator" - archs="arm64 x86_64" - min_version_flag="-mtvos-simulator-version-min=${TVOS_MIN_OS_VERSION}" - else - sdk="appletvos" - archs="arm64" - min_version_flag="-mtvos-version-min=${TVOS_MIN_OS_VERSION}" - fi - install_name="@rpath/llama.framework/llama" - ;; - esac - - # Build architecture flags - local arch_flags="" - for arch in $archs; do - arch_flags+=" -arch $arch" - done - - # Create dynamic library - echo "Creating dynamic library for ${platform}." - xcrun -sdk $sdk clang++ -dynamiclib \ - -isysroot $(xcrun --sdk $sdk --show-sdk-path) \ - $arch_flags \ - $min_version_flag \ - -Wl,-force_load,"${temp_dir}/combined.a" \ - -framework Foundation -framework Metal -framework Accelerate \ - -install_name "$install_name" \ - -o "${base_dir}/${output_lib}" - - # Platform-specific post-processing for device builds - if [[ "$is_simulator" == "false" ]]; then - if command -v vtool &>/dev/null; then - case "$platform" in - "ios") - echo "Marking binary as a framework binary for iOS..." - vtool -set-build-version ios ${IOS_MIN_OS_VERSION} ${IOS_MIN_OS_VERSION} -replace \ - -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}" - ;; - "visionos") - echo "Marking binary as a framework binary for visionOS..." - vtool -set-build-version xros ${VISIONOS_MIN_OS_VERSION} ${VISIONOS_MIN_OS_VERSION} -replace \ - -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}" - ;; - "tvos") - echo "Marking binary as a framework binary for tvOS..." - vtool -set-build-version tvos ${TVOS_MIN_OS_VERSION} ${TVOS_MIN_OS_VERSION} -replace \ - -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}" - ;; - esac - else - echo "Warning: vtool not found. Binary may not pass App Store validation." - fi - fi - - echo "Creating properly formatted dSYM..." 
- # Create a separate directory for dSYMs for all platforms - mkdir -p "${base_dir}/${build_dir}/dSYMs" - - # iOS and visionOS style dSYM (flat structure) - if [[ "$platform" == "ios" || "$platform" == "visionos" || "$platform" == "tvos" ]]; then - # Generate dSYM in the dSYMs directory - xcrun dsymutil "${base_dir}/${output_lib}" -o "${base_dir}/${build_dir}/dSYMs/llama.dSYM" - - # Create a copy of the binary that will be stripped - cp "${base_dir}/${output_lib}" "${temp_dir}/binary_to_strip" - - # Strip debug symbols from the copy - xcrun strip -S "${temp_dir}/binary_to_strip" -o "${temp_dir}/stripped_lib" - - # Replace the original with the stripped version - mv "${temp_dir}/stripped_lib" "${base_dir}/${output_lib}" - else - # macOS style dSYM - # First strip debug info to a separate file - xcrun strip -S "${base_dir}/${output_lib}" -o "${temp_dir}/stripped_lib" - - # Generate dSYM in the dSYMs directory - xcrun dsymutil "${base_dir}/${output_lib}" -o "${base_dir}/${build_dir}/dSYMs/llama.dSYM" - - # Replace original binary with stripped version - mv "${temp_dir}/stripped_lib" "${base_dir}/${output_lib}" - fi - - # Remove any automatically generated dSYM files in the framework structure as they will - # otherwise case Invalid Bundle Structure validation errors. - if [ -d "${base_dir}/${output_lib}.dSYM" ]; then - echo "Removing generated dSYM file in framework structure: ${base_dir}/${output_lib}.dSYM" - rm -rf "${base_dir}/${output_lib}.dSYM" - fi - - # Clean up - rm -rf "${temp_dir}" -} - -echo "Building for iOS simulator..." -cmake -B build-ios-sim -G Xcode \ - "${COMMON_CMAKE_ARGS[@]}" \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=${IOS_MIN_OS_VERSION} \ - -DIOS=ON \ - -DCMAKE_SYSTEM_NAME=iOS \ - -DCMAKE_OSX_SYSROOT=iphonesimulator \ - -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \ - -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphonesimulator \ - -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ - -S . -cmake --build build-ios-sim --config Release -- -quiet - -echo "Building for iOS devices..." -cmake -B build-ios-device -G Xcode \ - "${COMMON_CMAKE_ARGS[@]}" \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=${IOS_MIN_OS_VERSION} \ - -DCMAKE_OSX_SYSROOT=iphoneos \ - -DCMAKE_OSX_ARCHITECTURES="arm64" \ - -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphoneos \ - -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ - -S . -cmake --build build-ios-device --config Release -- -quiet - -echo "Building for macOS..." -cmake -B build-macos -G Xcode \ - "${COMMON_CMAKE_ARGS[@]}" \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=${MACOS_MIN_OS_VERSION} \ - -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \ - -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ - -S . -cmake --build build-macos --config Release -- -quiet - -echo "Building for visionOS..." -cmake -B build-visionos -G Xcode \ - "${COMMON_CMAKE_ARGS[@]}" \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=${VISIONOS_MIN_OS_VERSION} \ - -DCMAKE_OSX_ARCHITECTURES="arm64" \ - -DCMAKE_SYSTEM_NAME=visionOS \ - -DCMAKE_OSX_SYSROOT=xros \ - -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \ - -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ - -S . -cmake --build build-visionos --config Release -- -quiet - -echo "Building for visionOS simulator..." 
-cmake -B build-visionos-sim -G Xcode \ - "${COMMON_CMAKE_ARGS[@]}" \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=${VISIONOS_MIN_OS_VERSION} \ - -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \ - -DCMAKE_SYSTEM_NAME=visionOS \ - -DCMAKE_OSX_SYSROOT=xrsimulator \ - -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \ - -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ - -S . -cmake --build build-visionos-sim --config Release -- -quiet - -# Add tvOS builds (might need the same u_int definitions as watchOS and visionOS) -echo "Building for tvOS simulator..." -cmake -B build-tvos-sim -G Xcode \ - "${COMMON_CMAKE_ARGS[@]}" \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=${TVOS_MIN_OS_VERSION} \ - -DCMAKE_SYSTEM_NAME=tvOS \ - -DCMAKE_OSX_SYSROOT=appletvsimulator \ - -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \ - -DGGML_METAL=ON \ - -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvsimulator \ - -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ - -S . -cmake --build build-tvos-sim --config Release -- -quiet - -echo "Building for tvOS devices..." -cmake -B build-tvos-device -G Xcode \ - "${COMMON_CMAKE_ARGS[@]}" \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=${TVOS_MIN_OS_VERSION} \ - -DCMAKE_SYSTEM_NAME=tvOS \ - -DCMAKE_OSX_SYSROOT=appletvos \ - -DCMAKE_OSX_ARCHITECTURES="arm64" \ - -DGGML_METAL=ON \ - -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvos \ - -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ - -DLLAMA_CURL=OFF \ - -S . -cmake --build build-tvos-device --config Release -- -quiet - -# Setup frameworks and copy binaries and headers -echo "Setting up framework structures..." -setup_framework_structure "build-ios-sim" ${IOS_MIN_OS_VERSION} "ios" -setup_framework_structure "build-ios-device" ${IOS_MIN_OS_VERSION} "ios" -setup_framework_structure "build-macos" ${MACOS_MIN_OS_VERSION} "macos" -setup_framework_structure "build-visionos" ${VISIONOS_MIN_OS_VERSION} "visionos" -setup_framework_structure "build-visionos-sim" ${VISIONOS_MIN_OS_VERSION} "visionos" -setup_framework_structure "build-tvos-sim" ${TVOS_MIN_OS_VERSION} "tvos" -setup_framework_structure "build-tvos-device" ${TVOS_MIN_OS_VERSION} "tvos" - -# Create dynamic libraries from static libraries -echo "Creating dynamic libraries from static libraries..." -combine_static_libraries "build-ios-sim" "Release-iphonesimulator" "ios" "true" -combine_static_libraries "build-ios-device" "Release-iphoneos" "ios" "false" -combine_static_libraries "build-macos" "Release" "macos" "false" -combine_static_libraries "build-visionos" "Release-xros" "visionos" "false" -combine_static_libraries "build-visionos-sim" "Release-xrsimulator" "visionos" "true" -combine_static_libraries "build-tvos-sim" "Release-appletvsimulator" "tvos" "true" -combine_static_libraries "build-tvos-device" "Release-appletvos" "tvos" "false" - -# Create XCFramework with correct debug symbols paths -echo "Creating XCFramework..." 
-xcodebuild -create-xcframework \ - -framework $(pwd)/build-ios-sim/framework/llama.framework \ - -debug-symbols $(pwd)/build-ios-sim/dSYMs/llama.dSYM \ - -framework $(pwd)/build-ios-device/framework/llama.framework \ - -debug-symbols $(pwd)/build-ios-device/dSYMs/llama.dSYM \ - -framework $(pwd)/build-macos/framework/llama.framework \ - -debug-symbols $(pwd)/build-macos/dSYMS/llama.dSYM \ - -framework $(pwd)/build-visionos/framework/llama.framework \ - -debug-symbols $(pwd)/build-visionos/dSYMs/llama.dSYM \ - -framework $(pwd)/build-visionos-sim/framework/llama.framework \ - -debug-symbols $(pwd)/build-visionos-sim/dSYMs/llama.dSYM \ - -framework $(pwd)/build-tvos-device/framework/llama.framework \ - -debug-symbols $(pwd)/build-tvos-device/dSYMs/llama.dSYM \ - -framework $(pwd)/build-tvos-sim/framework/llama.framework \ - -debug-symbols $(pwd)/build-tvos-sim/dSYMs/llama.dSYM \ - -output $(pwd)/build-apple/llama.xcframework diff --git a/podman_compile.sh b/podman_compile.sh index ec243f75ee89f..de9e5c88d57a7 100755 --- a/podman_compile.sh +++ b/podman_compile.sh @@ -10,7 +10,7 @@ opts="" opts="$opts --device /dev/dri " echo "Running with the GPU passthrough" -image=localhost/pytorch:remoting +IMAGE=quay.io/ramalama/remoting:latest what=${1:-} if [[ -z "$what" ]]; then @@ -30,9 +30,10 @@ podman run \ --security-opt label=disable \ --env HOME="$HOME" \ --env PERF_MODE="${PERF_MODE:-}" \ +--env BENCH_MODE="${BENCH_MODE:-}" \ -v "$HOME":"$HOME":Z \ -w "$PWD" \ -it --rm \ $opts \ -$image \ +$IMAGE \ $cmd diff --git a/prepare.remoting.sh b/prepare.remoting.sh index aebb75c031422..5ab73470477b1 100755 --- a/prepare.remoting.sh +++ b/prepare.remoting.sh @@ -2,5 +2,7 @@ cmake -S . -B ../build.remoting-frontend \ -DGGML_REMOTINGFRONTEND=ON \ -DGGML_CPU_ARM_ARCH=native \ -DGGML_NATIVE=OFF \ + -DGGML_OPENMP=OFF \ + -DLLAMA_CURL=OFF \ -DCMAKE_BUILD_TYPE=Debug \ "$@" diff --git a/run.remoting.sh b/run.remoting.sh index 9a8ce4d34c74a..017f3fe58c9ff 100755 --- a/run.remoting.sh +++ b/run.remoting.sh @@ -8,39 +8,34 @@ else prefix="" fi -if [[ "${PERF_MODE:-}" ]]; then - FLAVOR="-prod" -else - FLAVOR="" -fi - MODEL=${MODEL:-llama3.2} -if [[ "$FLAVOR" == "-prod" ]]; then - cat < Date: Tue, 17 Jun 2025 17:40:56 +0200 Subject: [PATCH 111/117] ggml: src: ggml-remotingfrontend/virtgpu-shm: import the cpp atomic --- ggml/src/ggml-remotingfrontend/virtgpu-shm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-shm.h b/ggml/src/ggml-remotingfrontend/virtgpu-shm.h index e5770b1916886..52217f5b7e857 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-shm.h +++ b/ggml/src/ggml-remotingfrontend/virtgpu-shm.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include "virtgpu.h" From 65b92b9ad6c642eed6fc6e9e3cb8059cc435d018 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 19 Jun 2025 15:14:57 +0200 Subject: [PATCH 112/117] remoting: reintroduce the support for support_op(tensor) --- .../backend-dispatched-device.cpp | 2 +- .../shared/apir_backend.h | 3 + .../shared/venus_cs_ggml.h | 69 +++++++++++++++++++ .../ggml-backend-device.cpp | 7 -- .../src/ggml-remotingfrontend/ggml-remoting.h | 4 ++ .../venus_cs_ggml-rpc-front.cpp | 9 ++- .../virtgpu-forward-device.cpp | 4 +- 7 files changed, 86 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp index 5bf0788ccf864..473e9d2db7089 100644 --- 
a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp @@ -73,7 +73,7 @@ uint32_t backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) { UNUSED(ctx); - const ggml_tensor *op = vn_decode_ggml_tensor(dec); + const ggml_tensor *op = vn_decode_ggml_tensor_inplace(dec); bool supports_op = dev->iface.supports_op(dev, op); diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 4146908813c6d..8125a30e386e4 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -9,6 +9,9 @@ #define APIR_BACKEND_FORWARD_INDEX_INVALID 6 +// 1 is fast, 0 avoid micro-benchmark crashes +#define APIR_DEVICE_SUPPORTS_OP_ALWAYS_TRUE 0 + typedef uintptr_t apir_buffer_type_host_handle_t; typedef uintptr_t apir_buffer_host_handle_t; diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h index 71e15f847e851..71c9b3f3ed820 100644 --- a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h +++ b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h @@ -165,3 +165,72 @@ vn_decode_ggml_cgraph(struct vn_cs_decoder *dec, size_t cgraph_size) { return deserialize_graph(n_nodes, n_tensors, tensors, nodes); } + +static inline void +vn_encode_ggml_buffer_handle(struct vn_cs_encoder *enc, const apir_buffer_host_handle_t *handle) { + vn_cs_encoder_write(enc, sizeof(*handle), &handle, sizeof(*handle)); +} + +static inline void +vn_encode_ggml_tensor_inline(struct vn_cs_encoder *enc, const ggml_tensor *tensor) { + size_t tensor_size = sizeof(*tensor); + + if (tensor->extra) { + FATAL("Cannot pass tensors with extra"); + } + + if (tensor->src[0] && tensor->buffer) { + static int first = 1; + if (first) { + // not sure if the buffer needs to be updated inside the src tensors or not + WARNING("Cannot pass tensors with src and buffer"); + first = 0; + } + } + + vn_cs_encoder_write(enc, tensor_size, tensor, tensor_size); + + // tensor->data is a pointer inside the device buffer. No need to touch it + // tensor->buffer is a pointer to a buffer. Encoding the buffer handle in sequence. + // (could also make a copy of the tensor, and update locally.) + + if (tensor->buffer) { + apir_buffer_host_handle_t buffer_handle = ggml_buffer_to_apir_handle(tensor->buffer); + vn_encode_ggml_buffer_handle(enc, &buffer_handle); + } + + if (tensor->view_src) { + vn_cs_encoder_write(enc, tensor_size, tensor->view_src, tensor_size); + } + + for (int i = 0; tensor->src[i]; i++) { + const ggml_tensor *tensor_src = tensor->src[i]; + vn_cs_encoder_write(enc, tensor_size, tensor_src, tensor_size); + } +} + +static inline const ggml_tensor * +vn_decode_ggml_tensor_inplace(struct vn_cs_decoder *dec) { + + // it safe to remove the `const` qualifier here, we *do* want to + // modify the shared memory data to fix the `src` pointers. + ggml_tensor *tensor = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); + + // tensor->data is a pointer inside the device buffer. No need to touch it + // tensor->buffer is a pointer to a buffer. Decode the buffer handle encoded in sequence. 
+ if (tensor->buffer) { + tensor->buffer = vn_decode_ggml_buffer(dec); + } + + if (tensor->view_src) { + ggml_tensor *tensor_view_src = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); + tensor->view_src = tensor_view_src; + } + + for (int i = 0; tensor->src[i]; i++) { + ggml_tensor *tensor_src = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor)); + tensor->src[i] = tensor_src; // overwrite op->src[i] pointer with the actual location of the src tensor + } + + return tensor; +} diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index dfe1e992c9dac..1fa661e3b60d6 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -38,16 +38,9 @@ ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, s static bool ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { -#if 1 - UNUSED(dev); - UNUSED(op); - - return true; // same as ggml-rpc -#else struct virtgpu *gpu = DEV_TO_GPU(dev); return apir_device_supports_op(gpu, op); -#endif } static bool diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h index 18b880c740564..cd58ed674475d 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-remoting.h +++ b/ggml/src/ggml-remotingfrontend/ggml-remoting.h @@ -126,3 +126,7 @@ struct remoting_context_struct { }; typedef std::shared_ptr remoting_context; typedef std::weak_ptr remoting_context_ref; + +static inline apir_buffer_host_handle_t ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) { + return BUFFER_TO_HOST_HANDLE(buffer); +} diff --git a/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp b/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp index 67b8c37748aa8..53c42730fad06 100644 --- a/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp +++ b/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp @@ -40,8 +40,13 @@ serialize_tensor(const ggml_tensor * tensor) { result.view_src = reinterpret_cast(tensor->view_src); result.view_offs = tensor->view_offs; result.data = reinterpret_cast(tensor->data); - // tensor->data is serialized as an offset to the buffer base address - result.data -= reinterpret_cast(BUFFER_TO_GGML_CONTEXT(tensor->buffer)->base); + if (tensor->data) { + if (!tensor->buffer) { + FATAL("tensor has data but not buffer :/"); + } + // tensor->data is serialized as an offset to the buffer base address + result.data -= reinterpret_cast(BUFFER_TO_GGML_CONTEXT(tensor->buffer)->base); + } snprintf(result.name, GGML_MAX_NAME, "%s", tensor->name); return result; } diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp index 06ad6d445de4c..ca036366a6752 100644 --- a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp +++ b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp @@ -135,7 +135,7 @@ apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total) { bool apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op) { -#if 1 +#if APIR_DEVICE_SUPPORTS_OP_ALWAYS_TRUE /* ggml-rpc cheats it like this */ /* with the current implementation of serialize_tensor, the src/view aren't properly passed */ UNUSED(gpu); @@ -147,7 +147,7 @@ apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op) { struct vn_cs_decoder *decoder; 
REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP); - vn_encode_ggml_tensor(encoder, op); + vn_encode_ggml_tensor_inline(encoder, op); REMOTE_CALL(gpu, encoder, decoder); From 34e68b5df1e073494796efc3542a56d72b186520 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 19 Jun 2025 15:15:39 +0200 Subject: [PATCH 113/117] remotingbackend: add an optional call to support_op to avoid crashing the backend if the tensor is not supported --- .../backend-dispatched-backend.cpp | 15 +++++++++++++++ .../ggml-remotingbackend/shared/apir_backend.h | 3 +++ 2 files changed, 18 insertions(+) diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp index 6e600843a48db..f15f39c7f92d8 100644 --- a/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp +++ b/ggml/src/ggml-remotingbackend/backend-dispatched-backend.cpp @@ -32,6 +32,21 @@ backend_graph_compute(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, stru ggml_cgraph *cgraph = vn_decode_ggml_cgraph(&secondary_dec, cgraph_size); ggml_status status; +#if APIR_BACKEND_CHECK_SUPPORTS_OP == 1 + for (int idx = 0; idx < cgraph->n_nodes; idx++) { + ggml_tensor *op = ggml_graph_node(cgraph, idx); + if (dev->iface.supports_op(dev, op)) { + continue; + } + ERROR("Graph node %d (%s) not supported by the backend :/", idx, ggml_op_desc(op)); + + status = GGML_STATUS_ABORTED; + vn_encode_ggml_status(enc, &status); + + stop_timer(&graph_compute_timer); + return 0; + } +#endif status = bck->iface.graph_compute(bck, cgraph); vn_encode_ggml_status(enc, &status); diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 8125a30e386e4..6d44108ef7a61 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -12,6 +12,9 @@ // 1 is fast, 0 avoid micro-benchmark crashes #define APIR_DEVICE_SUPPORTS_OP_ALWAYS_TRUE 0 +// 0 is fast, 1 avoids the backend to crash if an unsupported tensor is received +#define APIR_BACKEND_CHECK_SUPPORTS_OP 0 + typedef uintptr_t apir_buffer_type_host_handle_t; typedef uintptr_t apir_buffer_host_handle_t; From 1d4bbef12db5be71b6ac82203a582b99a539b7dd Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 19 Jun 2025 15:16:19 +0200 Subject: [PATCH 114/117] remotingfrontend: reduce and cleanup the logging --- .../ggml-remotingfrontend/ggml-backend-buffer-type.cpp | 10 +++------- ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp | 3 +-- .../ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp | 4 ---- 3 files changed, 4 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp index 70fc829c24fa4..eb4e3b2940721 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp @@ -5,7 +5,7 @@ static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - IMPLEMENTED; + IMPLEMENTED_ONCE; struct virtgpu *gpu = BUFT_TO_GPU(buft); struct ggml_backend_remoting_buffer_context *context = (struct ggml_backend_remoting_buffer_context *) malloc(sizeof(*context)); @@ -29,9 +29,6 @@ ggml_backend_remoting_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, context->is_host_buffer = false; ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, 
ggml_backend_remoting_buffer_interface, (void *) context, size); - INFO("##"); - INFO("## %s(%llx) --> %p <---------------", __func__, size, buffer); - INFO("##\n"); return buffer; } @@ -47,8 +44,7 @@ ggml_backend_remoting_buffer_type_get_name(ggml_backend_buffer_type_t buft) { static size_t ggml_backend_remoting_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - IMPLEMENTED; - + IMPLEMENTED_ONCE; struct virtgpu *gpu = BUFT_TO_GPU(buft); return apir_buffer_type_get_alignment(gpu, buft); @@ -56,7 +52,7 @@ ggml_backend_remoting_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) static size_t ggml_backend_remoting_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { - IMPLEMENTED; + IMPLEMENTED_ONCE; struct virtgpu *gpu = BUFT_TO_GPU(buft); return apir_buffer_type_get_max_size(gpu, buft); diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index 1fa661e3b60d6..b17b43cd8d55f 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -20,8 +20,7 @@ ggml_backend_remoting_device_get_description(ggml_backend_dev_t dev) { static enum ggml_backend_dev_type ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) { - IMPLEMENTED; - + IMPLEMENTED_ONCE; struct virtgpu *gpu = DEV_TO_GPU(dev); return (enum ggml_backend_dev_type) apir_device_get_type(gpu); diff --git a/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp b/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp index 53c42730fad06..7ce0dbb7fbc67 100644 --- a/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp +++ b/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp @@ -18,10 +18,6 @@ serialize_tensor(const ggml_tensor * tensor) { ggml_backend_buffer_t buffer = tensor->buffer; result.buffer = BUFFER_TO_HOST_HANDLE(buffer); - if (result.buffer < 0x600000000000 || result.buffer > 0x700000000000) { - INFO("pass buffer handle %p", result.buffer); - BREAKPOINT; - } } else { result.buffer = 0; } From 67d00e7b6f60220341ec841456571f2fe65424de Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 19 Jun 2025 15:18:11 +0200 Subject: [PATCH 115/117] remotingfrontend: cache some values --- .../ggml-backend-buffer-type.cpp | 15 +++++++++++++-- .../ggml-remotingfrontend/ggml-backend-device.cpp | 9 ++++++++- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp index eb4e3b2940721..b655b8018f80d 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-buffer-type.cpp @@ -47,7 +47,13 @@ ggml_backend_remoting_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) IMPLEMENTED_ONCE; struct virtgpu *gpu = BUFT_TO_GPU(buft); - return apir_buffer_type_get_alignment(gpu, buft); + static size_t align = 0; + + if (align == 0) { + align = apir_buffer_type_get_alignment(gpu, buft); + } + + return align; } static size_t @@ -55,7 +61,12 @@ ggml_backend_remoting_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) IMPLEMENTED_ONCE; struct virtgpu *gpu = BUFT_TO_GPU(buft); - return apir_buffer_type_get_max_size(gpu, buft); + static size_t max_size = 0; + if (max_size == 0) { + max_size = apir_buffer_type_get_max_size(gpu, buft); + } + + return max_size; } static bool diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp 
b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp index b17b43cd8d55f..6f498d0edc2e4 100644 --- a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp +++ b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp @@ -23,7 +23,14 @@ ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) { IMPLEMENTED_ONCE; struct virtgpu *gpu = DEV_TO_GPU(dev); - return (enum ggml_backend_dev_type) apir_device_get_type(gpu); + static enum ggml_backend_dev_type type; + static bool has_type = false; + if (!has_type) { + has_type = true; + type = (enum ggml_backend_dev_type) apir_device_get_type(gpu); + } + + return type; } static void From a6186a1c86fd72cad460648ed6c924e1694717c5 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 19 Jun 2025 17:33:03 +0200 Subject: [PATCH 116/117] Update the custom scripts --- build.backend.sh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/build.backend.sh b/build.backend.sh index dc0b6007e3123..2904c4a15c73f 100755 --- a/build.backend.sh +++ b/build.backend.sh @@ -20,8 +20,14 @@ if [[ "$FLAVOR" == "-prod" ]]; then EOF fi -WHAT="llama-run llama-bench" -cmake --build ../build.remoting-backend$FLAVOR --parallel 8 --target $WHAT "$@" +TARGETS="llama-run" +if [[ "${BENCH_MODE:-}" == "bench" ]]; then + TARGETS="$TARGETS llama-bench" +elif [[ "${BENCH_MODE:-}" == "perf" ]]; then + TARGETS="$TARGETS test-backend-ops" +fi + +cmake --build ../build.remoting-backend$FLAVOR --parallel 8 --target $TARGETS "$@" if [[ $? == 0 ]]; then touch READY_backend From 61a6bdd4ae64e4402c4d2ca4a035636c3b4ab928 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 19 Jun 2025 17:33:27 +0200 Subject: [PATCH 117/117] remotingbackend: set APIR_DEVICE_SUPPORTS_OP_ALWAYS_TRUE = 1 --- ggml/src/ggml-remotingbackend/shared/apir_backend.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h index 6d44108ef7a61..80e5961ff04b5 100644 --- a/ggml/src/ggml-remotingbackend/shared/apir_backend.h +++ b/ggml/src/ggml-remotingbackend/shared/apir_backend.h @@ -10,7 +10,7 @@ #define APIR_BACKEND_FORWARD_INDEX_INVALID 6 // 1 is fast, 0 avoid micro-benchmark crashes -#define APIR_DEVICE_SUPPORTS_OP_ALWAYS_TRUE 0 +#define APIR_DEVICE_SUPPORTS_OP_ALWAYS_TRUE 1 // 0 is fast, 1 avoids the backend to crash if an unsupported tensor is received #define APIR_BACKEND_CHECK_SUPPORTS_OP 0
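
Note on the last three patches: together they make operator-support negotiation between the guest-side frontend and the host-side backend configurable. APIR_DEVICE_SUPPORTS_OP_ALWAYS_TRUE short-circuits the per-tensor supports_op forwarding in apir_device_supports_op() (fast, but an unsupported op then only surfaces at graph-compute time), while APIR_BACKEND_CHECK_SUPPORTS_OP makes backend_graph_compute() re-validate every node and return GGML_STATUS_ABORTED instead of letting the backend crash. The self-contained C++ sketch below only illustrates how the two flags interact; fake_op, fake_graph and host_supports_op are illustrative stand-ins, not symbols from the patches, and the sketch picks the defensive flag values (0/1) so both checks are exercised, whereas patch 117 selects the fast combination (1/0).

    // sketch.cpp -- illustrative only; types and helpers are stand-ins.
    #include <cstdio>
    #include <vector>

    // Flag names match ggml/src/ggml-remotingbackend/shared/apir_backend.h;
    // values here are the defensive combination, not the patch-117 defaults.
    #define APIR_DEVICE_SUPPORTS_OP_ALWAYS_TRUE 0  // frontend: forward the supports_op query
    #define APIR_BACKEND_CHECK_SUPPORTS_OP      1  // backend: re-check each node before compute

    struct fake_op    { const char *name; bool host_supported; };
    struct fake_graph { std::vector<fake_op> nodes; };

    // Stand-in for the host-side dev->iface.supports_op() call.
    bool host_supports_op(const fake_op &op) { return op.host_supported; }

    // Frontend view: what ggml_backend_remoting_device_supports_op() reports to llama.cpp.
    bool frontend_supports_op(const fake_op &op) {
    #if APIR_DEVICE_SUPPORTS_OP_ALWAYS_TRUE
        (void) op;
        return true;                  // same shortcut as ggml-rpc: claim everything
    #else
        return host_supports_op(op);  // stands in for the virtgpu round-trip
    #endif
    }

    // Backend view: graph_compute either trusts the frontend or re-validates.
    bool backend_graph_compute(const fake_graph &g) {
    #if APIR_BACKEND_CHECK_SUPPORTS_OP == 1
        for (const fake_op &op : g.nodes) {
            if (!host_supports_op(op)) {
                std::printf("graph node %s not supported, aborting compute\n", op.name);
                return false;         // GGML_STATUS_ABORTED in the real patch
            }
        }
    #endif
        std::printf("computing %zu nodes\n", g.nodes.size());
        return true;
    }

    int main() {
        fake_graph g{{{"MUL_MAT", true}, {"EXOTIC_OP", false}}};
        for (const fake_op &op : g.nodes)
            std::printf("frontend says %s supported: %d\n", op.name, frontend_supports_op(op));
        backend_graph_compute(g);
    }

Compiled and run as-is, the sketch reports EXOTIC_OP as unsupported on the frontend and aborts the graph on the backend; with the fast combination (1/0) both guards compile out, which is the trade-off the "is fast" / "avoid crashes" comments in apir_backend.h describe.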