From 463b6327660fa05687b3d5b22f555c5f557c1977 Mon Sep 17 00:00:00 2001
From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com>
Date: Mon, 4 Aug 2025 12:41:51 -0700
Subject: [PATCH] [ET-VK] 7/n Split dispatches between multiple command
 buffers.

Split execute dispatch into multiple commands based on dispatch count.

Pull Request resolved: https://github.com/pytorch/executorch/pull/12530

This diff splits the execute dispatch into multiple command buffers based on
the dispatch count, which allows for concurrent CPU and GPU execution.

The modifications involve adding a counter `encoded_node_count` to track the
number of encoded nodes and submitting the current command buffer to the GPU
as thresholds are reached: the first submission happens after 64 encoded
nodes, and subsequent submissions follow every 128 nodes thereafter.

ghstack-source-id: 300616853
@exported-using-ghexport

Differential Revision: [D78360039](https://our.internmc.facebook.com/intern/diff/D78360039/)
---
 .../vulkan/runtime/graph/ComputeGraph.cpp    | 29 ++++++++++++++++++-
 backends/vulkan/runtime/graph/GraphConfig.h  | 11 +++++++
 2 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
index 14328027362..7775165bc68 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.cpp
+++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -151,6 +151,10 @@ ComputeGraph::ComputeGraph(GraphConfig config)
     config_.prepack_threshold_nbytes = 10 * MB;
     config_.prepack_initial_threshold_nbytes = 10 * MB;
   }
+  if (config_.execute_threshold_node_count == 0) {
+    config_.execute_threshold_node_count = 128;
+    config_.execute_initial_threshold_node_count = 64;
+  }
 }
 
 ComputeGraph::~ComputeGraph() {
@@ -852,15 +856,38 @@ void ComputeGraph::execute() {
     context_->set_cmd(/*reusable = */ true);
 
     context_->cmd_reset_querypool();
+    uint32_t encoded_node_count = 0;
     for (std::unique_ptr<ExecuteNode>& node : execute_nodes_) {
       node->encode(this);
+      encoded_node_count++;
+
+      // The threshold is reached when the node count reaches
+      // execute_initial_threshold_node_count, or when the count past that
+      // point is a multiple of execute_threshold_node_count.
+      const bool reached_threshold =
+          encoded_node_count >= config_.execute_initial_threshold_node_count &&
+          ((encoded_node_count - config_.execute_initial_threshold_node_count) %
+               config_.execute_threshold_node_count ==
+           0);
+
+      // Submit the current command buffer when the threshold is reached
+      if (reached_threshold) {
+        context_->submit_cmd_to_gpu(VK_NULL_HANDLE, false);
+        deferred_cmd_list_.emplace_back(std::move(context_->extract_cmd()));
+        context_->set_cmd(true);
+      }
     }
+
+    vkapi::VulkanFence fence = context_->fences().get_fence();
+    context_->submit_cmd_to_gpu(fence.get_submit_handle(), false);
+    fence.wait();
+    context_->fences().return_fence(fence);
     deferred_cmd_list_.emplace_back(std::move(context_->extract_cmd()));
+  } else {
+    submit_deferred_cmds_and_wait();
   }
 
-  submit_deferred_cmds_and_wait();
   execute_count_++;
 }
 
diff --git a/backends/vulkan/runtime/graph/GraphConfig.h b/backends/vulkan/runtime/graph/GraphConfig.h
index 33c7ae73e62..08505aa3345 100644
--- a/backends/vulkan/runtime/graph/GraphConfig.h
+++ b/backends/vulkan/runtime/graph/GraphConfig.h
@@ -50,6 +50,17 @@ struct GraphConfig final {
   // by taking more advantage of parallelism between the CPU and GPU.
   size_t prepack_initial_threshold_nbytes = 0;
 
+  // During execute, once this node count is reached, submit the current
+  // command buffer for execution. This allows the work to be distributed over
+  // multiple command buffer submissions, which can improve execution
+  // performance.
+  size_t execute_threshold_node_count = 0;
+  // Execute node count used for the first command buffer submission during
+  // execute. This can be set lower than execute_threshold_node_count to
+  // submit a command buffer for execution earlier, which can improve
+  // performance by taking more advantage of parallelism between CPU and GPU.
+  size_t execute_initial_threshold_node_count = 0;
+
   vkapi::Adapter* external_adapter;
 
   // Generate a default graph config with pre-configured settings
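
For illustration only (not part of the patch): a minimal standalone C++ sketch of the submission cadence the new threshold logic produces once the constructor has applied the default values. The `reached_threshold` condition mirrors the expression added to `ComputeGraph::execute()`; `total_nodes` is an arbitrary example value.

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // Defaults applied by the ComputeGraph constructor when the config
  // fields are left at 0.
  const uint32_t initial_threshold = 64;  // execute_initial_threshold_node_count
  const uint32_t threshold = 128;         // execute_threshold_node_count
  const uint32_t total_nodes = 400;       // arbitrary example graph size

  uint32_t encoded_node_count = 0;
  for (uint32_t i = 0; i < total_nodes; ++i) {
    encoded_node_count++;
    // Same condition as reached_threshold in ComputeGraph::execute().
    const bool reached_threshold =
        encoded_node_count >= initial_threshold &&
        ((encoded_node_count - initial_threshold) % threshold == 0);
    if (reached_threshold) {
      std::printf("submit command buffer after node %u\n", encoded_node_count);
    }
  }
  // Whatever remains after the last threshold goes into a final fenced
  // submission (nodes 321-400 in this example).
  std::printf("final fenced submit after node %u\n", encoded_node_count);
  return 0;
}
```

Running this prints submissions after nodes 64, 192, and 320, plus the final fenced submit after node 400: the GPU starts executing the first 64 nodes while the CPU is still encoding the rest, which is the parallelism the diff description refers to.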
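
A hedged sketch of how a caller could tune the cadence, assuming the usual pattern of filling out `GraphConfig` before constructing the graph (the header path and `vkcompute` namespace are assumptions based on the repo layout; the field values are illustrative, not recommendations):

```cpp
#include <executorch/backends/vulkan/runtime/graph/ComputeGraph.h>

vkcompute::GraphConfig make_eager_submit_config() {
  vkcompute::GraphConfig config;
  // Submit the first command buffer after 16 encoded nodes, then every 32
  // nodes thereafter. Leaving both fields at 0 selects the defaults (64 and
  // 128) applied in the ComputeGraph constructor.
  config.execute_initial_threshold_node_count = 16;
  config.execute_threshold_node_count = 32;
  return config;
}
```

Setting the initial threshold lower than the steady-state threshold reflects the same design choice as the defaults: an early first submission gets the GPU busy sooner, after which larger batches amortize per-submission overhead.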