From 3c8f676d0df1cc3891a3356c9765dee39de5c15f Mon Sep 17 00:00:00 2001
From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com>
Date: Fri, 18 Jul 2025 11:28:05 -0700
Subject: [PATCH] [ET-VK] 5/n Split dispatches between multiple command
 buffers. Add support to defer command buffers in encode_execute function and
 submit all deferred commands in execute function.

Pull Request resolved: https://github.com/pytorch/executorch/pull/12527

The diff adds changes to store command buffers for deferred submission.

Storing these buffers is necessary for the `execute()` function. Since `encode_execute()` is typically called once but `execute()` can be called multiple times, the `submit_deferred_cmds_and_wait` function is added so that all recorded command buffers can be resubmitted on every call to `execute()`.

ghstack-source-id: 297107413
@exported-using-ghexport

Differential Revision: [D78360038](https://our.internmc.facebook.com/intern/diff/D78360038/)
---
 backends/vulkan/runtime/api/Context.cpp       |  2 +-
 backends/vulkan/runtime/api/Context.h         |  8 ++-
 .../vulkan/runtime/graph/ComputeGraph.cpp     | 56 +++++++++++++++++--
 backends/vulkan/runtime/graph/ComputeGraph.h  | 22 ++++++++
 4 files changed, 81 insertions(+), 7 deletions(-)

diff --git a/backends/vulkan/runtime/api/Context.cpp b/backends/vulkan/runtime/api/Context.cpp
index 64d940d44fb..44804b1c86e 100644
--- a/backends/vulkan/runtime/api/Context.cpp
+++ b/backends/vulkan/runtime/api/Context.cpp
@@ -217,7 +217,7 @@ void Context::submit_cmd_to_gpu(VkFence fence_handle, const bool final_use) {
 }
 
 void Context::flush() {
-  VK_CHECK(vkQueueWaitIdle(queue()));
+  VK_CHECK(vkQueueWaitIdle(queue().handle));
 
   command_pool_.flush();
   descriptor_pool_.flush();
diff --git a/backends/vulkan/runtime/api/Context.h b/backends/vulkan/runtime/api/Context.h
index 9d8e7c92255..3efa8d0276d 100644
--- a/backends/vulkan/runtime/api/Context.h
+++ b/backends/vulkan/runtime/api/Context.h
@@ -90,8 +90,8 @@ class Context final {
     return device_;
   }
 
-  inline VkQueue queue() {
-    return queue_.handle;
+  inline vkapi::Adapter::Queue& queue() {
+    return queue_;
   }
 
   // Device Caches
@@ -230,6 +230,10 @@ class Context final {
       VkFence fence_handle = VK_NULL_HANDLE,
       const bool final_use = false);
 
+  vkapi::CommandBuffer& extract_cmd() {
+    return cmd_;
+  }
+
   void flush();
 
 #ifdef VULKAN_DEBUG
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
index f4740666bea..ee5621d9c12 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.cpp
+++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -158,6 +158,7 @@ ComputeGraph::~ComputeGraph() {
 
   prepack_nodes_.clear();
   execute_nodes_.clear();
+  clear_deferred_cmds();
 
   context_->flush();
 }
@@ -775,6 +776,53 @@ void ComputeGraph::submit_current_cmd_and_wait(const bool final_use) {
   context_->fences().return_fence(fence);
 }
 
+void ComputeGraph::submit_cmd(
+    vkapi::CommandBuffer& cmd_buf,
+    VkSemaphore wait_semaphore,
+    VkSemaphore signal_semaphore,
+    VkFence fence) {
+  if (cmd_buf) {
+    cmd_buf.end();
+    context_->adapter_ptr()->submit_cmd(
+        context_->queue(),
+        cmd_buf.get_submit_handle(false),
+        fence,
+        wait_semaphore,
+        signal_semaphore);
+  }
+}
+
+void ComputeGraph::submit_deferred_cmds_and_wait() {
+  VkSemaphore prev_semaphore = VK_NULL_HANDLE;
+  vkapi::VulkanFence fence = context_->fences().get_fence();
+
+  for (uint32_t i = 0; i < deferred_cmd_list_.size(); i++) {
+    auto& cmd = deferred_cmd_list_[i];
+    VkSemaphore wait_semaphore = prev_semaphore;
+    VkSemaphore signal_semaphore = cmd.get_signal_semaphore();
+    prev_semaphore = signal_semaphore;
+
+    submit_cmd(
+        cmd,
+        wait_semaphore,
+        signal_semaphore,
+        i == (deferred_cmd_list_.size() - 1) ? fence.get_submit_handle()
+                                             : VK_NULL_HANDLE);
+  }
+  fence.wait();
+  context_->fences().return_fence(fence);
+}
+
+void ComputeGraph::clear_deferred_cmds() {
+  for (auto& cmd : deferred_cmd_list_) {
+    if (cmd) {
+      cmd.end();
+      cmd.invalidate();
+    }
+  }
+  deferred_cmd_list_.clear();
+}
+
 void ComputeGraph::prepack() {
   int i = 0;
   bool submitted = false;
@@ -813,6 +861,7 @@ void ComputeGraph::prepack() {
 }
 
 void ComputeGraph::encode_execute() {
+  clear_deferred_cmds();
   context_->flush();
   context_->set_cmd(/*reusable = */ true);
 
@@ -821,13 +870,12 @@ void ComputeGraph::encode_execute() {
   for (std::unique_ptr<ExecuteNode>& node : execute_nodes_) {
     node->encode(this);
   }
+
+  deferred_cmd_list_.emplace_back(std::move(context_->extract_cmd()));
 }
 
 void ComputeGraph::execute() {
-  vkapi::VulkanFence fence = context_->fences().get_fence();
-  context_->submit_cmd_to_gpu(fence.get_submit_handle());
-  fence.wait();
-  context_->fences().return_fence(fence);
+  submit_deferred_cmds_and_wait();
   execute_count_++;
 }
 
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h
index 1961f5046e2..4b1089b0de8 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.h
+++ b/backends/vulkan/runtime/graph/ComputeGraph.h
@@ -193,6 +193,9 @@ class ComputeGraph final {
   // Utility constexpr to express byte quantities
   constexpr static size_t MB = 1024 * 1024;
 
+  // List of command buffers deferred for submission
+  std::vector<vkapi::CommandBuffer> deferred_cmd_list_;
+
  protected:
   size_t values_in_use_ = 0;
   size_t execute_count_ = 0;
@@ -851,6 +854,25 @@ class ComputeGraph final {
    */
   void submit_current_cmd_and_wait(const bool final_use = false);
 
+  /*
+   * Submit one command buffer to the GPU.
+   */
+  void submit_cmd(
+      vkapi::CommandBuffer& cmd_buf,
+      VkSemaphore wait_semaphore,
+      VkSemaphore signal_semaphore,
+      VkFence fence);
+
+  /*
+   * Submits all command buffers in deferred_cmd_list_ to the GPU, then waits.
+   */
+  void submit_deferred_cmds_and_wait();
+
+  /*
+   * Ends and invalidates all deferred commands.
+   */
+  void clear_deferred_cmds();
+
  public:
   //
   // Graph Prepacking