pytorch · facebook-github-bot · Jul 16, 2025 · Jul 14, 2025 · Jul 14, 2025 · Jul 15, 2025
diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp
@@ -507,8 +507,7 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
     compute_graph->prepare();
     compute_graph->prepare_pipelines();
 
-    compute_graph->encode_prepack();
-    compute_graph->prepack();
+    compute_graph->run_prepack();
 
     // If dynamic shapes are not expected, then the command buffer only needs to
     // be encoded once. Otherwise, wait until the first inference to encode the

diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -145,6 +145,15 @@ ComputeGraph::ComputeGraph(GraphConfig config)
   execute_descriptor_counts_.descriptor_combined_sampler_count = 0;
   execute_descriptor_counts_.descriptor_storage_image_count = 0;
 
+  // If certain graph config variables are not specified (0 means "not set"),
+  // then set them automatically. Each threshold is defaulted independently so
+  // that specifying only one of them neither discards the caller's value nor
+  // leaves the other one at 0.
+  //
+  // An integral constexpr is used instead of a floating point macro so that no
+  // implicit double -> size_t conversion happens on assignment.
+  constexpr size_t kDefaultPrepackThresholdNbytes = size_t{20} * 1024 * 1024;
+  if (config_.prepack_threshold_nbytes == 0) {
+    config_.prepack_threshold_nbytes = kDefaultPrepackThresholdNbytes;
+  }
+  // When no initial threshold is given, fall back to the regular threshold.
+  // With both fields unset this yields the same 20 MB / 20 MB defaults as
+  // before.
+  if (config_.prepack_initial_threshold_nbytes == 0) {
+    config_.prepack_initial_threshold_nbytes =
+        config_.prepack_threshold_nbytes;
+  }
+
   context_->set_cmd(/*reusable = */ true);
 }
 
@@ -750,6 +759,15 @@ void ComputeGraph::prepare_pipelines() {
       vkapi::ComputePipelineCache::Hasher>();
 }
 
+// Submits the Context's current command buffer, waits on the fence that was
+// attached to the submission, and finally flushes the Context. `final_use` is
+// passed through unchanged to context_->submit_cmd_to_gpu().
+void ComputeGraph::submit_current_cmd_and_wait(const bool final_use) {
+  // Obtain a fence from context_->fences() to track this submission.
+  vkapi::VulkanFence submit_fence(context_->fences().get_fence());
+  context_->submit_cmd_to_gpu(submit_fence.get_submit_handle(), final_use);
+
+  // Do not return to the caller until the wait on the fence has finished; the
+  // fence is handed back to context_->fences() afterwards.
+  submit_fence.wait();
+  context_->fences().return_fence(submit_fence);
+
+  context_->flush();
+}
+
 void ComputeGraph::encode_prepack() {
   for (std::unique_ptr<PrepackNode>& node : prepack_nodes_) {
     node->encode(this);
@@ -766,6 +784,28 @@ void ComputeGraph::prepack() const {
   context_->flush();
 }
 
+// Encodes every prepack node in order and submits the encoded work to the GPU,
+// waiting for each submission to finish. Whenever the staging byte counter
+// (staging_nbytes_in_cmd_) exceeds the active threshold, the command buffer
+// encoded so far is submitted before the next node is encoded, so the work may
+// be split across several submissions.
+void ComputeGraph::run_prepack() {
+  // Use size_t for the count and the running index so that no signed/unsigned
+  // comparison takes place, and test `node_idx + 1` against the count so that
+  // nothing is subtracted from an unsigned size (which would wrap for an empty
+  // node list).
+  const size_t num_nodes = prepack_nodes_.size();
+  size_t node_idx = 0;
+  bool submitted = false;
+  for (std::unique_ptr<PrepackNode>& node : prepack_nodes_) {
+    // Do not trigger on the first or last prepack node. The last node is
+    // covered by the unconditional submission after the loop.
+    const bool not_terminal = node_idx != 0 && node_idx + 1 != num_nodes;
+    // Until the first intermediate submission has happened, the initial
+    // threshold applies; afterwards the regular threshold is used.
+    const size_t threshold = submitted
+        ? config_.prepack_threshold_nbytes
+        : config_.prepack_initial_threshold_nbytes;
+    if (not_terminal && staging_nbytes_in_cmd_ > threshold) {
+      submit_current_cmd_and_wait(/*final_use=*/true);
+      staging_nbytes_in_cmd_ = 0;
+      // The Context was flushed by the call above; set_cmd() is invoked again
+      // before further nodes are encoded (presumably to begin a new command
+      // buffer - confirm against Context::set_cmd).
+      context_->set_cmd();
+      submitted = true;
+    }
+
+    node->encode(this);
+    ++node_idx;
+  }
+  // Submit whatever has been encoded since the last intermediate submission.
+  submit_current_cmd_and_wait(/*final_use=*/true);
+  staging_nbytes_in_cmd_ = 0;
+}
+
 void ComputeGraph::encode_execute() {
   context_->flush();
   context_->set_cmd(/*reusable = */ true);

diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h
@@ -194,6 +194,10 @@ class ComputeGraph final {
   size_t values_in_use_ = 0;
   size_t execute_count_ = 0;
 
+  // Represents the amount of staging buffer data that will be copied if the
+  // current Context's command buffer is submitted now.
+  size_t staging_nbytes_in_cmd_ = 0;
+
  public:
   //
   // Accessors
@@ -812,13 +816,34 @@ class ComputeGraph final {
   copy_into_staging(const ValueRef idx, const void* data, const size_t numel);
   void copy_from_staging(const ValueRef idx, void* data, const size_t numel);
 
+ protected:
+  // Command Buffer Management
+
+  /*
+   * Submits the current command buffer in the Context to the GPU for execution,
+   * and waits for it to complete before returning. This function will also
+   * flush the Context after execution.
+   */
+  void submit_current_cmd_and_wait(const bool final_use = false);
+
+ public:
   //
   // Graph Prepacking
   //
 
+  /*
+   * Adds `staging_bytes` to staging_nbytes_in_cmd_, the running count of
+   * staging buffer bytes tracked for the current command buffer.
+   */
+  inline void update_staging_nbytes_in_cmd(const size_t staging_bytes) {
+    staging_nbytes_in_cmd_ = staging_nbytes_in_cmd_ + staging_bytes;
+  }
+
   void encode_prepack();
   void prepack() const;
 
+  /*
+   * Executes prepacking operations to transfer model weight data from the CPU
+   * to GPU.
+   */
+  void run_prepack();
+
   //
   // Graph Execution
   //

diff --git a/backends/vulkan/runtime/graph/GraphConfig.h b/backends/vulkan/runtime/graph/GraphConfig.h
@@ -36,6 +36,20 @@ struct GraphConfig final {
   // Whether or not the ComputeGraph should expect input shapes to be dynamic
   bool expect_dynamic_shapes;
 
+  // Execution properties that determine specifics re: how command buffer
+  // submission is handled, etc. 0 means this field is not set.
+
+  // During prepacking, once this threshold is reached, submit the current
+  // command buffer for execution. This allows the work to be distributed over
+  // multiple command buffer submissions, which can improve model load
+  // performance and prevent crashes when loading large models.
+  size_t prepack_threshold_nbytes = 0;
+  // Threshold used for the first command buffer submission during prepacking.
+  // This can be set to be lower than prepack_threshold_nbytes to
+  // submit a command buffer for execution earlier which can improve performance
+  // by taking more advantage of parallelism between the CPU and GPU.
+  size_t prepack_initial_threshold_nbytes = 0;
+
   vkapi::Adapter* external_adapter;
 
   // Generate a default graph config with pre-configured settings

diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp
@@ -62,6 +62,7 @@ api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) {
   TensorRefPtr tref = graph->get_tref(tref_);
   size_t numel = utils::multiply_integers(tref->sizes);
   api::StagingBuffer staging(graph->context(), tref->dtype, numel);
+  graph->update_staging_nbytes_in_cmd(staging.buffer().mem_size_as_size_t());
   size_t nbytes = numel * vkapi::element_size(tref->dtype);
   staging.copy_from(tref->data, nbytes);
   return staging;

diff --git a/backends/vulkan/runtime/vk_api/memory/Buffer.h b/backends/vulkan/runtime/vk_api/memory/Buffer.h
@@ -138,6 +138,10 @@ class VulkanBuffer final {
     return buffer_properties_.size;
   }
 
+  // Returns the value of mem_size() converted to size_t; the conversion is
+  // delegated to utils::safe_downcast.
+  inline size_t mem_size_as_size_t() const {
+    const auto raw_mem_size = mem_size();
+    return utils::safe_downcast<size_t>(raw_mem_size);
+  }
+
   inline bool has_memory() const {
     return (memory_.allocation != VK_NULL_HANDLE);
   }