remoting: reintroduce support for supports_op(tensor)

kpouget · kpouget · commit 65b92b9ad6c6 · 2025-06-19T15:27:57.000+02:00
diff --git a/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp b/ggml/src/ggml-remotingbackend/backend-dispatched-device.cpp
@@ -73,7 +73,7 @@ uint32_t
 backend_device_supports_op(struct vn_cs_encoder *enc, struct vn_cs_decoder *dec, struct virgl_apir_context *ctx) {
   UNUSED(ctx);
 
-  const ggml_tensor *op = vn_decode_ggml_tensor(dec);
+  const ggml_tensor *op = vn_decode_ggml_tensor_inplace(dec);
 
   bool supports_op = dev->iface.supports_op(dev, op);
 
diff --git a/ggml/src/ggml-remotingbackend/shared/apir_backend.h b/ggml/src/ggml-remotingbackend/shared/apir_backend.h
@@ -9,6 +9,9 @@
 
 #define APIR_BACKEND_FORWARD_INDEX_INVALID 6
 
+// 1: always report ops as supported, no remote call (fast); 0: ask the backend (avoids micro-benchmark crashes)
+#define APIR_DEVICE_SUPPORTS_OP_ALWAYS_TRUE 0
+
 typedef uintptr_t apir_buffer_type_host_handle_t;
 typedef uintptr_t apir_buffer_host_handle_t;
 
diff --git a/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h b/ggml/src/ggml-remotingbackend/shared/venus_cs_ggml.h
@@ -165,3 +165,72 @@ vn_decode_ggml_cgraph(struct vn_cs_decoder *dec, size_t cgraph_size) {
 
   return deserialize_graph(n_nodes, n_tensors, tensors, nodes);
 }
+
+static inline void
+vn_encode_ggml_buffer_handle(struct vn_cs_encoder *enc, const apir_buffer_host_handle_t *handle) {
+  vn_cs_encoder_write(enc, sizeof(*handle), handle, sizeof(*handle)); // send *handle, not the address of the parameter
+}
+
+// Serialize `tensor` by value: raw struct, then (if set) its buffer handle, a raw copy
+// of view_src, and a raw copy of each src[] entry up to the first NULL (or GGML_MAX_SRC).
+static inline void
+vn_encode_ggml_tensor_inline(struct vn_cs_encoder *enc, const ggml_tensor *tensor) {
+  size_t tensor_size = sizeof(*tensor);
+
+  if (tensor->extra) {
+    FATAL("Cannot pass tensors with extra");
+  }
+
+  if (tensor->src[0] && tensor->buffer) {
+    static int first = 1;
+    if (first) {
+      // not sure if the buffer needs to be updated inside the src tensors or not
+      WARNING("Cannot pass tensors with src and buffer");
+      first = 0;
+    }
+  }
+
+  vn_cs_encoder_write(enc, tensor_size, tensor, tensor_size);
+
+  // tensor->data is a pointer inside the device buffer. No need to touch it.
+  // tensor->buffer is a pointer to a buffer: its handle is encoded right after the tensor.
+  if (tensor->buffer) {
+    apir_buffer_host_handle_t buffer_handle = ggml_buffer_to_apir_handle(tensor->buffer);
+    vn_encode_ggml_buffer_handle(enc, &buffer_handle);
+  }
+
+  if (tensor->view_src) {
+    vn_cs_encoder_write(enc, tensor_size, tensor->view_src, tensor_size);
+  }
+
+  // bounded by GGML_MAX_SRC: a fully populated src[] has no NULL terminator
+  for (int i = 0; i < GGML_MAX_SRC && tensor->src[i]; i++) {
+    vn_cs_encoder_write(enc, tensor_size, tensor->src[i], tensor_size);
+  }
+}
+
+// Decode in place a tensor written by vn_encode_ggml_tensor_inline(): the buffer, view_src
+// and src[] pointers are overwritten with the objects decoded from the stream after it.
+static inline const ggml_tensor *
+vn_decode_ggml_tensor_inplace(struct vn_cs_decoder *dec) {
+  // it is safe to remove the `const` qualifier here, we *do* want to
+  // modify the shared memory data to fix the `src` pointers.
+  ggml_tensor *tensor = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor));
+
+  // tensor->data is a pointer inside the device buffer. No need to touch it
+  // tensor->buffer is a pointer to a buffer. Decode the buffer handle encoded in sequence.
+  if (tensor->buffer) {
+    tensor->buffer = vn_decode_ggml_buffer(dec);
+  }
+
+  if (tensor->view_src) {
+    tensor->view_src = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor));
+  }
+
+  // point each src[i] at its decoded copy; bounded because a full src[] has no NULL terminator
+  for (int i = 0; i < GGML_MAX_SRC && tensor->src[i]; i++) {
+    tensor->src[i] = (ggml_tensor *)(uintptr_t) vn_cs_decoder_use_inplace(dec, sizeof(ggml_tensor));
+  }
+
+  return tensor;
+}
diff --git a/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp b/ggml/src/ggml-remotingfrontend/ggml-backend-device.cpp
@@ -38,16 +38,9 @@ ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, s
 
 static bool
 ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
-#if 1
-  UNUSED(dev);
-  UNUSED(op);
-
-  return true; // same as ggml-rpc
-#else
   struct virtgpu *gpu = DEV_TO_GPU(dev);
 
   return apir_device_supports_op(gpu, op);
-#endif
 }
 
 static bool
diff --git a/ggml/src/ggml-remotingfrontend/ggml-remoting.h b/ggml/src/ggml-remotingfrontend/ggml-remoting.h
@@ -126,3 +126,7 @@ struct remoting_context_struct {
 };
 typedef std::shared_ptr<remoting_context_struct> remoting_context;
 typedef std::weak_ptr<remoting_context_struct> remoting_context_ref;
+
+static inline apir_buffer_host_handle_t ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) {
+  return BUFFER_TO_HOST_HANDLE(buffer); // called by vn_encode_ggml_tensor_inline() in shared/venus_cs_ggml.h
+}
diff --git a/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp b/ggml/src/ggml-remotingfrontend/venus_cs_ggml-rpc-front.cpp
@@ -40,8 +40,13 @@ serialize_tensor(const ggml_tensor * tensor) {
   result.view_src = reinterpret_cast<uint64_t>(tensor->view_src);
   result.view_offs = tensor->view_offs;
   result.data = reinterpret_cast<uint64_t>(tensor->data);
-  // tensor->data is serialized as an offset to the buffer base address
-  result.data -= reinterpret_cast<uint64_t>(BUFFER_TO_GGML_CONTEXT(tensor->buffer)->base);
+  if (tensor->data) {
+    if (!tensor->buffer) {
+      FATAL("tensor has data but not buffer :/");
+    }
+    // tensor->data is serialized as an offset to the buffer base address
+    result.data -= reinterpret_cast<uint64_t>(BUFFER_TO_GGML_CONTEXT(tensor->buffer)->base);
+  }
   snprintf(result.name, GGML_MAX_NAME, "%s", tensor->name);
   return result;
 }
diff --git a/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp b/ggml/src/ggml-remotingfrontend/virtgpu-forward-device.cpp
@@ -135,7 +135,7 @@ apir_device_get_memory(struct virtgpu *gpu, size_t *free, size_t *total) {
 
 bool
 apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op) {
-#if 1
+#if APIR_DEVICE_SUPPORTS_OP_ALWAYS_TRUE
   /* ggml-rpc cheats it like this */
   /* with the current implementation of serialize_tensor, the src/view aren't properly passed */
   UNUSED(gpu);
@@ -147,7 +147,7 @@ apir_device_supports_op(struct virtgpu *gpu, const ggml_tensor *op) {
   struct vn_cs_decoder *decoder;
   REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP);
 
-  vn_encode_ggml_tensor(encoder, op);
+  vn_encode_ggml_tensor_inline(encoder, op);
 
   REMOTE_CALL(gpu, encoder, decoder);