From 3e79c89455edac7ba7908ba8e34b3a2622662fa8 Mon Sep 17 00:00:00 2001
From: jorgep31415
Date: Tue, 3 Jun 2025 16:27:27 -0700
Subject: [PATCH 1/2] [ET-VK] Consolidate shader compilation into one
 vkCreateComputePipelines call

We target the QC Adreno driver implementation of Vulkan. The Vulkan API does
not enforce how QC actually uses the pipeline cache. As the plural naming of
`vkCreateComputePipelines` suggests, we observed that its `createInfoCount`,
`pCreateInfos`, and `pPipelines` arguments allow constructing multiple compute
pipelines in one invocation. QC's implementation makes better use of the
pipeline cache when the same number of compute pipelines is created in fewer
invocations of `vkCreateComputePipelines`.

We refactor ET-VK to accumulate the metadata necessary for pipeline
construction and invoke `vkCreateComputePipelines` only once. This decreases
model load time for a sample model from 1.7s to 1.0s, and down to 300ms once
@ssjia removes the noop shader.

Differential Revision: [D75763660](https://our.internmc.facebook.com/intern/diff/D75763660/)

[ghstack-poisoned]
---
 .../vulkan/runtime/graph/ComputeGraph.cpp     | 40 +++++++++++
 backends/vulkan/runtime/graph/ComputeGraph.h  | 11 +++
 .../vulkan/runtime/graph/ops/DispatchNode.cpp |  5 ++
 .../vulkan/runtime/graph/ops/DispatchNode.h   |  2 +
 .../vulkan/runtime/graph/ops/ExecuteNode.h    |  4 ++
 .../vulkan/runtime/graph/ops/PrepackNode.cpp  |  7 ++
 .../vulkan/runtime/graph/ops/PrepackNode.h    |  2 +
 backends/vulkan/runtime/vk_api/Pipeline.cpp   | 70 ++++++++++++++++++-
 backends/vulkan/runtime/vk_api/Pipeline.h     |  8 +++
 9 files changed, 148 insertions(+), 1 deletion(-)

diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
index 1222a9fc641..a7c5ce27976 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.cpp
+++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -552,6 +552,38 @@ void ComputeGraph::update_descriptor_counts(
   }
 }
 
+void ComputeGraph::update_pipeline_descriptors(
+    const vkapi::ShaderInfo& shader_info,
+    const utils::WorkgroupSize& local_workgroup_size,
+    const vkapi::SpecVarList& spec_vars,
+    const std::vector<PushConstantDataInfo>& push_constants) {
+  VkDescriptorSetLayout shader_layout =
+      context()->shader_layout_cache().retrieve(shader_info.kernel_layout);
+
+  uint32_t pc_offset = 0;
+  std::array<uint8_t, kMaxPushConstantSize> pc_data;
+  for (const auto& pc : push_constants) {
+    pc_offset += pc.write(pc_data.data(), pc_offset, kMaxPushConstantSize);
+  }
+
+  vkapi::SpecVarList spec_constants = {
+      SV(local_workgroup_size[0u]),
+      SV(local_workgroup_size[1u]),
+      SV(local_workgroup_size[2u])};
+
+  spec_constants.append(spec_vars);
+
+  const vkapi::ComputePipelineCache::Key desc = {
+      context()->pipeline_layout_cache().retrieve(shader_layout, pc_offset),
+      context()->shader_cache().retrieve(shader_info),
+      spec_constants};
+
+  auto it = pipeline_descriptors_.find(desc);
+  if (it == pipeline_descriptors_.cend()) {
+    pipeline_descriptors_.insert(desc);
+  }
+}
+
 utils::uvec3 ComputeGraph::create_global_wg_size(const ValueRef idx) {
   if (is_buffer_storage(idx)) {
     return {uint32_t(numel_of(idx)), 1u, 1u};
@@ -659,6 +691,14 @@ void ComputeGraph::prepare() {
     shared_object.allocate(this);
     shared_object.bind_users(this);
   }
+
+  for (std::unique_ptr<PrepackNode>& node : prepack_nodes_) {
+    node->prepare_pipelines(this);
+  }
+  for (std::unique_ptr<ExecuteNode>& node : execute_nodes_) {
+    node->prepare_pipelines(this);
+  }
+  context_->pipeline_cache().create_pipelines(pipeline_descriptors_);
 }
 
 void ComputeGraph::encode_prepack() {
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h
index fe546f26477..7a7bb75c2c7 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.h
+++ b/backends/vulkan/runtime/graph/ComputeGraph.h
@@ -185,6 +185,11 @@ class ComputeGraph final {
   std::vector<IOValueRef> inputs_;
   std::vector<IOValueRef> outputs_;
 
+  std::unordered_set<
+      vkapi::ComputePipelineCache::Key,
+      vkapi::ComputePipelineCache::Hasher>
+      pipeline_descriptors_;
+
  protected:
   size_t values_in_use_ = 0;
   size_t execute_count_ = 0;
@@ -704,6 +709,12 @@
       const vkapi::ShaderInfo& shader_info,
       bool execute);
 
+  void update_pipeline_descriptors(
+      const vkapi::ShaderInfo& shader_info,
+      const utils::WorkgroupSize& local_workgroup_size,
+      const vkapi::SpecVarList& spec_vars,
+      const std::vector<PushConstantDataInfo>& push_constants);
+
   void prepare();
 
   //
diff --git a/backends/vulkan/runtime/graph/ops/DispatchNode.cpp b/backends/vulkan/runtime/graph/ops/DispatchNode.cpp
index a0d3a4c2e5c..3421f133571 100644
--- a/backends/vulkan/runtime/graph/ops/DispatchNode.cpp
+++ b/backends/vulkan/runtime/graph/ops/DispatchNode.cpp
@@ -35,6 +35,11 @@ DispatchNode::DispatchNode(
   graph.update_descriptor_counts(shader, /*execute = */ true);
 }
 
+void DispatchNode::prepare_pipelines(ComputeGraph* graph) {
+  graph->update_pipeline_descriptors(
+      shader_, local_workgroup_size_, spec_vars_, push_constants_);
+}
+
 void DispatchNode::encode(ComputeGraph* graph) {
   if (!shader_) {
     return;
diff --git a/backends/vulkan/runtime/graph/ops/DispatchNode.h b/backends/vulkan/runtime/graph/ops/DispatchNode.h
index db95adfee39..b6eb8624c26 100644
--- a/backends/vulkan/runtime/graph/ops/DispatchNode.h
+++ b/backends/vulkan/runtime/graph/ops/DispatchNode.h
@@ -40,6 +40,8 @@ class DispatchNode : public ExecuteNode {
 
   ~DispatchNode() override = default;
 
+  void prepare_pipelines(ComputeGraph* graph) override;
+
   void encode(ComputeGraph* graph) override;
 
  protected:
diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.h b/backends/vulkan/runtime/graph/ops/ExecuteNode.h
index 0731722e13a..6a815b246ef 100644
--- a/backends/vulkan/runtime/graph/ops/ExecuteNode.h
+++ b/backends/vulkan/runtime/graph/ops/ExecuteNode.h
@@ -61,6 +61,10 @@ class ExecuteNode {
 
   virtual ~ExecuteNode() = default;
 
+  virtual void prepare_pipelines(ComputeGraph* graph) {
+    (void)graph;
+  }
+
   virtual void encode(ComputeGraph* graph) {
     (void)graph;
   }
diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp
index d84d893540c..328b31f20f7 100644
--- a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp
+++ b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp
@@ -67,6 +67,13 @@ api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) {
   return staging;
 }
 
+void PrepackNode::prepare_pipelines(ComputeGraph* graph) {
+  graph->update_pipeline_descriptors(
+      shader_, local_workgroup_size_, spec_vars_, push_constants_);
+  graph->update_pipeline_descriptors(
+      noop_shader_, utils::WorkgroupSize(1, 1, 1), {}, push_constants_);
+}
+
 void PrepackNode::encode(ComputeGraph* graph) {
   api::Context* const context = graph->context();
 
diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.h b/backends/vulkan/runtime/graph/ops/PrepackNode.h
index a45deb9ff70..8ce8ac9f773 100644
--- a/backends/vulkan/runtime/graph/ops/PrepackNode.h
+++ b/backends/vulkan/runtime/graph/ops/PrepackNode.h
@@ -40,6 +40,8 @@ class PrepackNode final {
 
   ~PrepackNode() = default;
 
+  void prepare_pipelines(ComputeGraph* graph);
+
   void encode(ComputeGraph* graph);
 
   inline void set_node_id(uint32_t node_id) {
diff --git a/backends/vulkan/runtime/vk_api/Pipeline.cpp b/backends/vulkan/runtime/vk_api/Pipeline.cpp
index 5dcb00168b2..287332ff76a 100644
--- a/backends/vulkan/runtime/vk_api/Pipeline.cpp
+++ b/backends/vulkan/runtime/vk_api/Pipeline.cpp
@@ -270,6 +270,9 @@ void swap(PipelineLayout& lhs, PipelineLayout& rhs) noexcept {
 // ComputePipeline
 //
 
+ComputePipeline::ComputePipeline(VkDevice device, VkPipeline handle)
+    : device_{device}, handle_{handle} {}
+
 ComputePipeline::ComputePipeline(
     VkDevice device,
     const ComputePipeline::Descriptor& descriptor,
@@ -444,12 +447,77 @@ ComputePipelineCache::~ComputePipelineCache() {
   pipeline_cache_ = VK_NULL_HANDLE;
 }
 
+void ComputePipelineCache::create_pipelines(
+    const std::unordered_set<Key, Hasher>& descriptors) {
+  std::lock_guard<std::mutex> lock(cache_mutex_);
+
+  const auto num_pipelines = descriptors.size();
+  std::vector<VkPipeline> pipelines(num_pipelines);
+
+  std::vector<std::vector<VkSpecializationMapEntry>> map_entries;
+  map_entries.reserve(num_pipelines);
+
+  std::vector<VkSpecializationInfo> specialization_infos;
+  specialization_infos.reserve(num_pipelines);
+
+  std::vector<VkPipelineShaderStageCreateInfo> shader_stage_create_infos;
+  shader_stage_create_infos.reserve(num_pipelines);
+
+  std::vector<VkComputePipelineCreateInfo> create_infos;
+  create_infos.reserve(num_pipelines);
+
+  for (auto& key : descriptors) {
+    map_entries.push_back(key.specialization_constants.generate_map_entries());
+
+    specialization_infos.push_back(VkSpecializationInfo{
+        key.specialization_constants.size(), // mapEntryCount
+        map_entries.back().data(), // pMapEntries
+        key.specialization_constants.data_nbytes(), // dataSize
+        key.specialization_constants.data(), // pData
+    });
+
+    shader_stage_create_infos.push_back(VkPipelineShaderStageCreateInfo{
+        VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, // sType
+        nullptr, // pNext
+        0u, // flags
+        VK_SHADER_STAGE_COMPUTE_BIT, // stage
+        key.shader_module, // module
+        "main", // pName
+        &specialization_infos.back(), // pSpecializationInfo
+    });
+
+    create_infos.push_back(VkComputePipelineCreateInfo{
+        VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, // sType
+        nullptr, // pNext
+        0u, // flags
+        shader_stage_create_infos.back(), // stage
+        key.pipeline_layout, // layout
+        VK_NULL_HANDLE, // basePipelineHandle
+        0u, // basePipelineIndex
+    });
+  }
+
+  VK_CHECK(vkCreateComputePipelines(
+      device_,
+      pipeline_cache_,
+      create_infos.size(),
+      create_infos.data(),
+      nullptr,
+      pipelines.data()));
+
+  uint32_t i = 0;
+  for (auto& key : descriptors) {
+    cache_.insert({key, ComputePipelineCache::Value(device_, pipelines[i])});
+    ++i;
+  }
+}
+
 VkPipeline ComputePipelineCache::retrieve(
     const ComputePipelineCache::Key& key) {
   std::lock_guard<std::mutex> lock(cache_mutex_);
-
   auto it = cache_.find(key);
   if (cache_.cend() == it) {
+    // Pipelines for dynamic shapes must be created individually
     it = cache_
              .insert(
                  {key,
diff --git a/backends/vulkan/runtime/vk_api/Pipeline.h b/backends/vulkan/runtime/vk_api/Pipeline.h
index 1e0fc1e28aa..74be28c56f7 100644
--- a/backends/vulkan/runtime/vk_api/Pipeline.h
+++ b/backends/vulkan/runtime/vk_api/Pipeline.h
@@ -19,6 +19,7 @@
 #include <mutex>
 #include <unordered_map>
+#include <unordered_set>
 
 #define SV(x) ::vkcompute::vkapi::SpecVar(x)
 
@@ -158,6 +159,8 @@
     SpecVarList specialization_constants;
   };
 
+  explicit ComputePipeline(VkDevice device, VkPipeline handle);
+
   explicit ComputePipeline(
       VkDevice device,
       const Descriptor& descriptor,
@@ -185,6 +188,10 @@
   // does not allow for move assignment. The swap function will
   // be used in the hash map.
   friend void swap(ComputePipeline& lhs, ComputePipeline& rhs) noexcept;
+
+  friend bool operator==(
+      const ComputePipeline::Descriptor& _1,
+      const ComputePipeline::Descriptor& _2);
 };
 
 class PipelineLayoutCache final {
@@ -293,6 +300,7 @@
   const std::string cache_data_path_;
 
  public:
+  void create_pipelines(const std::unordered_set<Key, Hasher>&);
   VkPipeline retrieve(const Key&);
   void purge();
 };

From f18956bbac84517ad81465d4a1e983631eedd1cb Mon Sep 17 00:00:00 2001
From: jorgep31415
Date: Tue, 3 Jun 2025 17:25:11 -0700
Subject: [PATCH 2/2] Update on "[ET-VK] Consolidate shader compilation into
 one vkCreateComputePipelines call"

We target the QC Adreno driver implementation of Vulkan. The Vulkan API does
not enforce how QC actually uses the pipeline cache. As the plural naming of
`vkCreateComputePipelines` suggests, we observed that its `createInfoCount`,
`pCreateInfos`, and `pPipelines` arguments allow constructing multiple compute
pipelines in one invocation. QC's implementation makes better use of the
pipeline cache when the same number of compute pipelines is created in fewer
invocations of `vkCreateComputePipelines`.

We refactor ET-VK to accumulate the metadata necessary for pipeline
construction and invoke `vkCreateComputePipelines` only once. This decreases
model load time for a sample model from 1.7s to 1.0s, and down to 300ms once
ssjia removes the noop shader.

Differential Revision: [D75763660](https://our.internmc.facebook.com/intern/diff/D75763660/)

[ghstack-poisoned]
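Editor's note: for readers unfamiliar with the batching pattern the commit message describes, the sketch below is an illustrative, self-contained example and is not ET-VK code. The function name `create_compute_pipelines_batched`, its parameters, and the error handling are made up for illustration; it assumes the caller already owns a valid `VkDevice`, a `VkPipelineCache`, and one compiled compute `VkShaderModule` plus matching `VkPipelineLayout` per pipeline. The point it demonstrates is exactly the one the patch relies on: `vkCreateComputePipelines` accepts `createInfoCount > 1`, so N pipelines can be requested from the driver in a single call instead of N separate calls.

```cpp
// Illustrative sketch only (not ET-VK code): build N compute pipelines with a
// single vkCreateComputePipelines call instead of one call per pipeline.
#include <vulkan/vulkan.h>

#include <stdexcept>
#include <vector>

std::vector<VkPipeline> create_compute_pipelines_batched(
    VkDevice device,
    VkPipelineCache pipeline_cache,
    const std::vector<VkShaderModule>& shader_modules,
    const std::vector<VkPipelineLayout>& pipeline_layouts) {
  // Assumes one pipeline layout per shader module, in matching order.
  const uint32_t count = static_cast<uint32_t>(shader_modules.size());

  std::vector<VkComputePipelineCreateInfo> create_infos(count);
  for (uint32_t i = 0; i < count; ++i) {
    VkPipelineShaderStageCreateInfo stage{};
    stage.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
    stage.stage = VK_SHADER_STAGE_COMPUTE_BIT;
    stage.module = shader_modules[i];
    stage.pName = "main"; // entry point of the compute shader

    VkComputePipelineCreateInfo info{};
    info.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
    info.stage = stage; // copied by value, so no dangling pointer here
    info.layout = pipeline_layouts[i];
    info.basePipelineHandle = VK_NULL_HANDLE;
    create_infos[i] = info;
  }

  // One driver call constructs all pipelines: createInfoCount > 1 is the
  // consolidation the commit message is about.
  std::vector<VkPipeline> pipelines(count, VK_NULL_HANDLE);
  const VkResult res = vkCreateComputePipelines(
      device,
      pipeline_cache,
      count,
      create_infos.data(),
      /*pAllocator=*/nullptr,
      pipelines.data());
  if (res != VK_SUCCESS) {
    throw std::runtime_error("vkCreateComputePipelines failed");
  }
  return pipelines;
}
```

The ET-VK change in PATCH 1/2 follows the same shape: during `ComputeGraph::prepare`, each `DispatchNode` and `PrepackNode` first registers its pipeline descriptor via `update_pipeline_descriptors`, and the accumulated set is then handed to `ComputePipelineCache::create_pipelines` for one batched driver call.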