From 9a638b0bd287f00911bd746e23e1f9c8badabb6f Mon Sep 17 00:00:00 2001
From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com>
Date: Mon, 4 Aug 2025 13:47:34 -0700
Subject: [PATCH] [ET-VK] 8/n Split dispatches between multiple command
 buffers.

This diff introduces a new configuration option, `execute_max_cmds`, to
limit the maximum number of command buffers created when splitting
execution between multiple command buffers. This allows for more
efficient command buffer management, particularly when the graph
contains a large number of nodes.

Differential Revision: [D79575908](https://our.internmc.facebook.com/intern/diff/D79575908/)

[ghstack-poisoned]
---
 .../vulkan/runtime/graph/ComputeGraph.cpp   | 46 ++++++++++++++++---
 backends/vulkan/runtime/graph/GraphConfig.h |  4 ++
 2 files changed, 44 insertions(+), 6 deletions(-)

diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
index 7775165bc68..21fe363fb93 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.cpp
+++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -858,6 +858,41 @@ void ComputeGraph::execute() {
   context_->cmd_reset_querypool();

   uint32_t encoded_node_count = 0;
+  const size_t total_node_count = execute_nodes_.size();
+  size_t init_threshold = config_.execute_initial_threshold_node_count;
+  size_t count_threshold = config_.execute_threshold_node_count;
+
+  // If a max command buffer count is set, adjust the thresholds so that
+  // execution stays within the limit whenever the current thresholds would
+  // produce more than execute_max_cmds command buffers.
+  if (config_.execute_max_cmds > 0) {
+    // Worst case scenario: one command buffer for the nodes before the init
+    // threshold and config_.execute_max_cmds - 1 command buffers for the
+    // rest of the dispatches.
+
+    // If the command buffers created after offsetting init_threshold would
+    // exceed the max command buffer count, adjust init and count thresholds.
+    const bool slicing_exceeds_max_cmds =
+        (total_node_count - init_threshold) >
+        count_threshold * (config_.execute_max_cmds - 1);
+    if (total_node_count > init_threshold && slicing_exceeds_max_cmds) {
+      // Calculate the total number of command buffers that would be created
+      // if the nodes were sliced using the current thresholds.
+      const double total_cmds = 1. +
+          ceil(double(total_node_count - init_threshold) / count_threshold);
+
+      // Calculate the scale factor to apply to the thresholds so that the
+      // total number of command buffers does not exceed the max.
+      const double cmd_scale = total_cmds / config_.execute_max_cmds;
+
+      // Apply the scale factor to the thresholds.
+      init_threshold = static_cast<size_t>(ceil(init_threshold * cmd_scale));
+      count_threshold = static_cast<size_t>(ceil(
+          double(total_node_count - init_threshold) /
+          (config_.execute_max_cmds - 1)));
+    }
+  }
+
   for (std::unique_ptr<ExecuteNode>& node : execute_nodes_) {
     node->encode(this);
     encoded_node_count++;
@@ -865,14 +900,13 @@
     // Threshold is reached when the node count reaches
     // execute_initial_threshold_node_count or if it is a multiple of
     // execute_threshold_node_count.
-    const bool reached_threshold =
-        encoded_node_count >= config_.execute_initial_threshold_node_count &&
-        ((encoded_node_count - config_.execute_initial_threshold_node_count) %
-             config_.execute_threshold_node_count ==
-         0);
+    const bool reached_threshold = encoded_node_count >= init_threshold &&
+        ((encoded_node_count - init_threshold) % count_threshold == 0);

     // Create a new command buffer when threshold is reached
-    if (reached_threshold) {
+    // But avoid it if this is the last node, since the last cmd buf is
+    // submitted after the loop
+    if (reached_threshold && encoded_node_count != total_node_count) {
       context_->submit_cmd_to_gpu(VK_NULL_HANDLE, false);
       deferred_cmd_list_.emplace_back(std::move(context_->extract_cmd()));
       context_->set_cmd(true);
diff --git a/backends/vulkan/runtime/graph/GraphConfig.h b/backends/vulkan/runtime/graph/GraphConfig.h
index 08505aa3345..aa5cd8f8c4e 100644
--- a/backends/vulkan/runtime/graph/GraphConfig.h
+++ b/backends/vulkan/runtime/graph/GraphConfig.h
@@ -61,6 +61,10 @@ struct GraphConfig final {
   // by taking more advantage of parallelism between the CPU and GPU.
   size_t execute_initial_threshold_node_count = 0;

+  // If this number is greater than 0, create at most this many command
+  // buffers during execute.
+  size_t execute_max_cmds = 0;
+
   vkapi::Adapter* external_adapter;

   // Generate a default graph config with pre-configured settings
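
Reviewer note, not part of the patch: below is a minimal standalone C++
sketch that reproduces the threshold-rescaling arithmetic from the
ComputeGraph.cpp hunk, using hypothetical values (total_node_count = 1000,
init_threshold = 20, count_threshold = 50, execute_max_cmds = 8) to show
how the cap on command buffers is enforced.

#include <cmath>
#include <cstddef>
#include <cstdio>

int main() {
  // Hypothetical workload: 1000 nodes, first submit after 20 nodes, then a
  // submit every 50 nodes, capped at 8 command buffers.
  const std::size_t total_node_count = 1000;
  std::size_t init_threshold = 20;   // execute_initial_threshold_node_count
  std::size_t count_threshold = 50;  // execute_threshold_node_count
  const std::size_t execute_max_cmds = 8;

  // Unadjusted slicing would create 1 + ceil((1000 - 20) / 50) = 21 command
  // buffers, which exceeds the cap of 8.
  const bool slicing_exceeds_max_cmds =
      (total_node_count - init_threshold) >
      count_threshold * (execute_max_cmds - 1);

  if (total_node_count > init_threshold && slicing_exceeds_max_cmds) {
    const double total_cmds = 1. +
        std::ceil(double(total_node_count - init_threshold) / count_threshold);
    // cmd_scale = 21 / 8 = 2.625
    const double cmd_scale = total_cmds / execute_max_cmds;

    // First buffer grows to ceil(20 * 2.625) = 53 nodes; the remaining 947
    // nodes are spread over the other 7 buffers: ceil(947 / 7) = 136 each.
    init_threshold =
        static_cast<std::size_t>(std::ceil(init_threshold * cmd_scale));
    count_threshold = static_cast<std::size_t>(std::ceil(
        double(total_node_count - init_threshold) / (execute_max_cmds - 1)));
  }

  // Prints init_threshold=53 count_threshold=136. In-loop submits then fire
  // at nodes 53, 189, 325, 461, 597, 733, and 869; with the final submit
  // after the loop, that is exactly 8 command buffers.
  std::printf(
      "init_threshold=%zu count_threshold=%zu\n",
      init_threshold,
      count_threshold);
  return 0;
}

Note the design choice visible in both the patch and the sketch: the scaled
count_threshold is recomputed from the node count remaining after the new
init_threshold, rather than multiplied by cmd_scale, so the remaining
dispatches are spread evenly across the execute_max_cmds - 1 buffers that
follow the first one.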