Skip to content
Merged
29 changes: 28 additions & 1 deletion backends/vulkan/runtime/graph/ComputeGraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,10 @@ ComputeGraph::ComputeGraph(GraphConfig config)
config_.prepack_threshold_nbytes = 10 * MB;
config_.prepack_initial_threshold_nbytes = 10 * MB;
}
if (config_.execute_threshold_node_count == 0) {
config_.execute_threshold_node_count = 128;
config_.execute_initial_threshold_node_count = 64;
}
}

ComputeGraph::~ComputeGraph() {
Expand Down Expand Up @@ -852,15 +856,38 @@ void ComputeGraph::execute() {
context_->set_cmd(/*reusable = */ true);

context_->cmd_reset_querypool();
uint32_t encoded_node_count = 0;

for (std::unique_ptr<ExecuteNode>& node : execute_nodes_) {
node->encode(this);
encoded_node_count++;

// The threshold is reached when the encoded node count equals
// execute_initial_threshold_node_count, and again each time a further
// execute_threshold_node_count nodes have been encoded after that.
const bool reached_threshold =
encoded_node_count >= config_.execute_initial_threshold_node_count &&
((encoded_node_count - config_.execute_initial_threshold_node_count) %
config_.execute_threshold_node_count ==
0);

// Submit the current command buffer and start a new one when the threshold is reached
if (reached_threshold) {
context_->submit_cmd_to_gpu(VK_NULL_HANDLE, false);
deferred_cmd_list_.emplace_back(std::move(context_->extract_cmd()));
context_->set_cmd(true);
}
}

vkapi::VulkanFence fence = context_->fences().get_fence();
context_->submit_cmd_to_gpu(fence.get_submit_handle(), false);
fence.wait();
context_->fences().return_fence(fence);
deferred_cmd_list_.emplace_back(std::move(context_->extract_cmd()));
} else {
submit_deferred_cmds_and_wait();
}

submit_deferred_cmds_and_wait();
execute_count_++;
}

Expand Down
11 changes: 11 additions & 0 deletions backends/vulkan/runtime/graph/GraphConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,17 @@ struct GraphConfig final {
// by taking more advantage of parallelism between the CPU and GPU.
size_t prepack_initial_threshold_nbytes = 0;

// During execute, once this node count is reached, submit the current
// command buffer for execution. This allows the work to be distributed over
// multiple command buffer submissions, which can improve execution
// performance.
size_t execute_threshold_node_count = 0;
// Execute node count used for the first command buffer submission during
// execute. This can be set lower than execute_threshold_node_count to submit
// a command buffer for execution earlier, which can improve performance by
// taking more advantage of parallelism between the CPU and GPU.
size_t execute_initial_threshold_node_count = 0;

vkapi::Adapter* external_adapter;

// Generate a default graph config with pre-configured settings
Expand Down
Loading