From 3c8f676d0df1cc3891a3356c9765dee39de5c15f Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Fri, 18 Jul 2025 11:28:05 -0700 Subject: [PATCH] [ET-VK] 5/n Split dispatches between multiple command buffers. Add support to defer command buffers in encode_execute function and submit all deferred commands in execute function. Pull Request resolved: https://github.com/pytorch/executorch/pull/12527 The diff adds changes to store command buffers for deferred submission. Storing these buffers is necessary for the `execute()` function. Since the `encode_execute()` function is typically called once but `execute()` can be called multiple times, the `submit_deferred_cmds_and_wait` function is added so that all recorded command buffers can be resubmitted each time `execute()` is called. ghstack-source-id: 297107413 @exported-using-ghexport Differential Revision: [D78360038](https://our.internmc.facebook.com/intern/diff/D78360038/) --- backends/vulkan/runtime/api/Context.cpp | 2 +- backends/vulkan/runtime/api/Context.h | 8 ++- .../vulkan/runtime/graph/ComputeGraph.cpp | 56 +++++++++++++++++-- backends/vulkan/runtime/graph/ComputeGraph.h | 22 ++++++++ 4 files changed, 81 insertions(+), 7 deletions(-) diff --git a/backends/vulkan/runtime/api/Context.cpp b/backends/vulkan/runtime/api/Context.cpp index 64d940d44fb..44804b1c86e 100644 --- a/backends/vulkan/runtime/api/Context.cpp +++ b/backends/vulkan/runtime/api/Context.cpp @@ -217,7 +217,7 @@ void Context::submit_cmd_to_gpu(VkFence fence_handle, const bool final_use) { } void Context::flush() { - VK_CHECK(vkQueueWaitIdle(queue())); + VK_CHECK(vkQueueWaitIdle(queue().handle)); command_pool_.flush(); descriptor_pool_.flush(); diff --git a/backends/vulkan/runtime/api/Context.h b/backends/vulkan/runtime/api/Context.h index 9d8e7c92255..3efa8d0276d 100644 --- a/backends/vulkan/runtime/api/Context.h +++ b/backends/vulkan/runtime/api/Context.h @@ -90,8 +90,8 @@ class Context final { return device_; } - inline VkQueue 
queue() { - return queue_.handle; + inline vkapi::Adapter::Queue& queue() { + return queue_; } // Device Caches @@ -230,6 +230,10 @@ class Context final { VkFence fence_handle = VK_NULL_HANDLE, const bool final_use = false); + vkapi::CommandBuffer& extract_cmd() { + return cmd_; + } + void flush(); #ifdef VULKAN_DEBUG diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index f4740666bea..ee5621d9c12 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -158,6 +158,7 @@ ComputeGraph::~ComputeGraph() { prepack_nodes_.clear(); execute_nodes_.clear(); + clear_deferred_cmds(); context_->flush(); } @@ -775,6 +776,53 @@ void ComputeGraph::submit_current_cmd_and_wait(const bool final_use) { context_->fences().return_fence(fence); } +void ComputeGraph::submit_cmd( + vkapi::CommandBuffer& cmd_buf, + VkSemaphore wait_semaphore, + VkSemaphore signal_semaphore, + VkFence fence) { + if (cmd_buf) { + cmd_buf.end(); + context_->adapter_ptr()->submit_cmd( + context_->queue(), + cmd_buf.get_submit_handle(false), + fence, + wait_semaphore, + signal_semaphore); + } +} + +void ComputeGraph::submit_deferred_cmds_and_wait() { + VkSemaphore prev_semaphore = VK_NULL_HANDLE; + vkapi::VulkanFence fence = context_->fences().get_fence(); + + for (uint32_t i = 0; i < deferred_cmd_list_.size(); i++) { + auto& cmd = deferred_cmd_list_[i]; + VkSemaphore wait_semaphore = prev_semaphore; + VkSemaphore signal_semaphore = cmd.get_signal_semaphore(); + prev_semaphore = signal_semaphore; + + submit_cmd( + cmd, + wait_semaphore, + signal_semaphore, + i == (deferred_cmd_list_.size() - 1) ? 
fence.get_submit_handle() + : VK_NULL_HANDLE); + } + fence.wait(); + context_->fences().return_fence(fence); +} + +void ComputeGraph::clear_deferred_cmds() { + for (auto& cmd : deferred_cmd_list_) { + if (cmd) { + cmd.end(); + cmd.invalidate(); + } + } + deferred_cmd_list_.clear(); +} + void ComputeGraph::prepack() { int i = 0; bool submitted = false; @@ -813,6 +861,7 @@ void ComputeGraph::prepack() { } void ComputeGraph::encode_execute() { + clear_deferred_cmds(); context_->flush(); context_->set_cmd(/*reusable = */ true); @@ -821,13 +870,12 @@ void ComputeGraph::encode_execute() { for (std::unique_ptr& node : execute_nodes_) { node->encode(this); } + + deferred_cmd_list_.emplace_back(std::move(context_->extract_cmd())); } void ComputeGraph::execute() { - vkapi::VulkanFence fence = context_->fences().get_fence(); - context_->submit_cmd_to_gpu(fence.get_submit_handle()); - fence.wait(); - context_->fences().return_fence(fence); + submit_deferred_cmds_and_wait(); execute_count_++; } diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 1961f5046e2..4b1089b0de8 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -193,6 +193,9 @@ class ComputeGraph final { // Utility constexpr to express byte quantities constexpr static size_t MB = 1024 * 1024; + // List of command buffers deferred for submission + std::vector deferred_cmd_list_; + protected: size_t values_in_use_ = 0; size_t execute_count_ = 0; @@ -851,6 +854,25 @@ class ComputeGraph final { */ void submit_current_cmd_and_wait(const bool final_use = false); + /* + * Submit one command buffer to the GPU. + */ + void submit_cmd( + vkapi::CommandBuffer& cmd_buf, + VkSemaphore wait_semaphore, + VkSemaphore signal_semaphore, + VkFence fence); + + /* + * Submits all the commands gathered in deferred_cmd_list_ to the GPU. 
+ */ + void submit_deferred_cmds_and_wait(); + + /* + * Ends and invalidates all deferred commands. + */ + void clear_deferred_cmds(); + public: // // Graph Prepacking