From 463b6327660fa05687b3d5b22f555c5f557c1977 Mon Sep 17 00:00:00 2001
From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com>
Date: Mon, 4 Aug 2025 12:41:51 -0700
Subject: [PATCH] [ET-VK] 7/n Split dispatches between multiple command
 buffers.

Split execute dispatch into multiple commands based on dispatch count.

Pull Request resolved: https://github.com/pytorch/executorch/pull/12530

This diff splits the execute dispatch into multiple command buffers based on
the dispatch count, which allows for concurrent CPU and GPU execution.

The modifications involve adding a counter `encoded_node_count` to track the
number of encoded nodes and submitting the current command buffer to the GPU
as thresholds are reached: the first submission happens after 64 encoded
nodes, and subsequent submissions follow every 128 nodes thereafter.

ghstack-source-id: 300616853
@exported-using-ghexport

Differential Revision: [D78360039](https://our.internmc.facebook.com/intern/diff/D78360039/)
---
 .../vulkan/runtime/graph/ComputeGraph.cpp    | 29 ++++++++++++++++++-
 backends/vulkan/runtime/graph/GraphConfig.h  | 11 +++++++
 2 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
index 14328027362..7775165bc68 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.cpp
+++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -151,6 +151,10 @@ ComputeGraph::ComputeGraph(GraphConfig config)
     config_.prepack_threshold_nbytes = 10 * MB;
     config_.prepack_initial_threshold_nbytes = 10 * MB;
   }
+  if (config_.execute_threshold_node_count == 0) {
+    config_.execute_threshold_node_count = 128;
+    config_.execute_initial_threshold_node_count = 64;
+  }
 }
 
 ComputeGraph::~ComputeGraph() {
@@ -852,15 +856,38 @@ void ComputeGraph::execute() {
     context_->set_cmd(/*reusable = */ true);
 
     context_->cmd_reset_querypool();
+    uint32_t encoded_node_count = 0;
     for (std::unique_ptr<ExecuteNode>& node : execute_nodes_) {
       node->encode(this);
+      encoded_node_count++;
+
+      // The threshold is reached when the node count reaches
+      // execute_initial_threshold_node_count, or when the count past that
+      // point is a multiple of execute_threshold_node_count.
+      const bool reached_threshold =
+          encoded_node_count >= config_.execute_initial_threshold_node_count &&
+          ((encoded_node_count - config_.execute_initial_threshold_node_count) %
+               config_.execute_threshold_node_count ==
+           0);
+
+      // Submit the current command buffer when the threshold is reached
+      if (reached_threshold) {
+        context_->submit_cmd_to_gpu(VK_NULL_HANDLE, false);
+        deferred_cmd_list_.emplace_back(std::move(context_->extract_cmd()));
+        context_->set_cmd(true);
+      }
     }
+
+    vkapi::VulkanFence fence = context_->fences().get_fence();
+    context_->submit_cmd_to_gpu(fence.get_submit_handle(), false);
+    fence.wait();
+    context_->fences().return_fence(fence);
     deferred_cmd_list_.emplace_back(std::move(context_->extract_cmd()));
+  } else {
+    submit_deferred_cmds_and_wait();
   }
 
-  submit_deferred_cmds_and_wait();
   execute_count_++;
 }
 
diff --git a/backends/vulkan/runtime/graph/GraphConfig.h b/backends/vulkan/runtime/graph/GraphConfig.h
index 33c7ae73e62..08505aa3345 100644
--- a/backends/vulkan/runtime/graph/GraphConfig.h
+++ b/backends/vulkan/runtime/graph/GraphConfig.h
@@ -50,6 +50,17 @@ struct GraphConfig final {
   // by taking more advantage of parallelism between the CPU and GPU.
   size_t prepack_initial_threshold_nbytes = 0;
 
+  // During execute, once this node count is reached, submit the current
+  // command buffer for execution. This allows the work to be distributed over
+  // multiple command buffer submissions, which can improve execution
+  // performance.
+  size_t execute_threshold_node_count = 0;
+  // Execute node count used for the first command buffer submission during
+  // execute. This can be set lower than execute_threshold_node_count to
+  // submit a command buffer for execution earlier, which can improve
+  // performance by taking more advantage of parallelism between CPU and GPU.
+  size_t execute_initial_threshold_node_count = 0;
+
   vkapi::Adapter* external_adapter;
 
   // Generate a default graph config with pre-configured settings
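
For illustration only (not part of the patch): a minimal standalone C++ sketch of the submission cadence the new threshold logic produces once the constructor has applied the default values. The `reached_threshold` condition mirrors the expression added to `ComputeGraph::execute()`; `total_nodes` is an arbitrary example value.

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // Defaults applied by the ComputeGraph constructor when the config
  // fields are left at 0.
  const uint32_t initial_threshold = 64;  // execute_initial_threshold_node_count
  const uint32_t threshold = 128;         // execute_threshold_node_count
  const uint32_t total_nodes = 400;       // arbitrary example graph size

  uint32_t encoded_node_count = 0;
  for (uint32_t i = 0; i < total_nodes; ++i) {
    encoded_node_count++;
    // Same condition as reached_threshold in ComputeGraph::execute().
    const bool reached_threshold =
        encoded_node_count >= initial_threshold &&
        ((encoded_node_count - initial_threshold) % threshold == 0);
    if (reached_threshold) {
      std::printf("submit command buffer after node %u\n", encoded_node_count);
    }
  }
  // Whatever remains after the last threshold goes into a final fenced
  // submission (nodes 321-400 in this example).
  std::printf("final fenced submit after node %u\n", encoded_node_count);
  return 0;
}
```

Running this prints submissions after nodes 64, 192, and 320, plus the final fenced submit after node 400: the GPU starts executing the first 64 nodes while the CPU is still encoding the rest, which is the parallelism the diff description refers to.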
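
A hedged sketch of how a caller could tune the cadence, assuming the usual pattern of filling out `GraphConfig` before constructing the graph (the header path and `vkcompute` namespace are assumptions based on the repo layout; the field values are illustrative, not recommendations):

```cpp
#include <executorch/backends/vulkan/runtime/graph/ComputeGraph.h>

vkcompute::GraphConfig make_eager_submit_config() {
  vkcompute::GraphConfig config;
  // Submit the first command buffer after 16 encoded nodes, then every 32
  // nodes thereafter. Leaving both fields at 0 selects the defaults (64 and
  // 128) applied in the ComputeGraph constructor.
  config.execute_initial_threshold_node_count = 16;
  config.execute_threshold_node_count = 32;
  return config;
}
```

Setting the initial threshold lower than the steady-state threshold reflects the same design choice as the defaults: an early first submission gets the GPU busy sooner, after which larger batches amortize per-submission overhead.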