From 6f5ae562a3e884fcd7464ce5748e4565f70df051 Mon Sep 17 00:00:00 2001
From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com>
Date: Tue, 17 Dec 2024 11:56:37 -0800
Subject: [PATCH 1/2] [ET-VK] Replace Uniform buffers with push constants for
 binary op

Pull Request resolved: https://github.com/pytorch/executorch/pull/7230

This diff replaces uniform buffers with push constants for the binary op in the Vulkan backend of ExecuTorch. The changes include updating the GLSL code to declare a push constant block instead of uniform buffers, and updating the C++ code to pass the tensor sizes, broadcast params and alpha to the shader as push constants.
ghstack-source-id: 258575398
@exported-using-ghexport

Differential Revision: [D66853542](https://our.internmc.facebook.com/intern/diff/D66853542/)
---
 .../vulkan/runtime/graph/ops/DispatchNode.h     |  9 ++++++---
 .../runtime/graph/ops/glsl/binary_op.glsl       | 13 ++++++++-----
 .../vulkan/runtime/graph/ops/impl/BinaryOp.cpp  | 17 ++++++++++-------
 .../vulkan/test/vulkan_compute_api_test.cpp     | 11 ++++-------
 4 files changed, 28 insertions(+), 22 deletions(-)
diff --git a/backends/vulkan/runtime/graph/ops/DispatchNode.h b/backends/vulkan/runtime/graph/ops/DispatchNode.h
index 958637218e2..7d04f7714e9 100644
--- a/backends/vulkan/runtime/graph/ops/DispatchNode.h
+++ b/backends/vulkan/runtime/graph/ops/DispatchNode.h
@@ -46,12 +46,29 @@ class PushConstantDataInfo {
     payload_.attr = attr;
   }
 
-  explicit PushConstantDataInfo(const void* data, uint32_t dataLen)
+  /*
+   * Builds a push constant entry from a raw byte blob. `dataLen` bytes are
+   * copied from `data`; `pushConstantLen`, when non-zero, overrides the size
+   * this entry reports, so a caller can pad a short payload out to the width
+   * the shader's push constant block expects. Both lengths are limited to 16
+   * bytes, the bound the existing check places on a single entry.
+   */
+  explicit PushConstantDataInfo(
+      const void* data,
+      uint32_t dataLen,
+      uint32_t pushConstantLen = 0)
       : tensorUniformData(nullptr) {
     VK_CHECK_COND(
         dataLen <= 16, "Single push constant data size must be <= 16 bytes");
-    payload_.dataSize = dataLen;
-    memcpy(payload_.data, data, payload_.dataSize);
+    // The override becomes the reported entry size, so it has to respect the
+    // same 16 byte limit that is enforced on dataLen above.
+    VK_CHECK_COND(
+        pushConstantLen <= 16,
+        "Single push constant padded size must be <= 16 bytes");
+    payload_.dataSize = pushConstantLen ? pushConstantLen : dataLen;
+    // Zero the reported range first so padding past dataLen is deterministic.
+    memset(payload_.data, 0, payload_.dataSize);
+    memcpy(payload_.data, data, dataLen);
   }
 
   /*
diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl
index be0e1bfa20a..62aa2f810dc 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl
@@ -19,11 +19,6 @@ layout(std430) buffer;
 ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
 ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
 ${layout_declare_tensor(B, "r", "t_other", DTYPE, STORAGE)}
-${layout_declare_ubo(B, "ivec4", "out_sizes")}
-${layout_declare_ubo(B, "ivec4", "in_sizes")}
-${layout_declare_ubo(B, "ivec4", "other_sizes")}
-${layout_declare_ubo(B, "ivec2", "broadcast_params")}
-${layout_declare_ubo(B, "float", "alpha")}
 
 #include "broadcasting_utils.h"
 #include "indexing_utils.h"
@@ -40,6 +35,14 @@ const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);
 ${layout_declare_spec_const(C, "int", "other_layout", "DEFAULT_LAYOUT")}
 const lowp ivec4 other_axis_map = unhash_axis_map(other_layout);
 
+layout(push_constant) uniform restrict Block {
+  ivec4 out_sizes;
+  ivec4 in_sizes;
+  ivec4 other_sizes;
+  ivec2 broadcast_params;
+  float alpha;
+};
+
 void main() {
   const ivec3 lpos = ivec3(gl_GlobalInvocationID);
   const ivec4 tidx = lpos_to_tidx(lpos, out_sizes, out_axis_map.w, packed_dim);
diff --git a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp
index 33f73cd6dad..7e88982aaee 100644
--- a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp
@@ -67,7 +67,10 @@ void add_binary_op_node(
     alpha_val = graph.extract_scalar<float>(alpha);
   }
 
-  const utils::ivec2 broadcast_params = create_broadcast_params(*t_in1, *t_in2);
+  const struct BinaryOpsParams {
+    const utils::ivec2 broadcast_params;
+    const float alpha_val;
+  } binary_ops_params{create_broadcast_params(*t_in1, *t_in2), alpha_val};
 
   std::string kernel_name("binary_");
   kernel_name.reserve(kShaderNameReserve);
@@ -83,16 +86,16 @@ void add_binary_op_node(
       {{out, vkapi::MemoryAccessType::WRITE},
        {{arg1, arg2}, vkapi::MemoryAccessType::READ}},
       // Shader params buffers
-      {t_out->sizes_ubo(),
-       t_in1->sizes_ubo(),
-       t_in2->sizes_ubo(),
-       graph.create_params_buffer(broadcast_params),
-       graph.create_params_buffer(alpha_val)},
+      {},
       // Specialization Constants
       {t_out->hashed_layout(), t_in1->hashed_layout(), t_in2->hashed_layout()},
       // Resizing Logic
       resize_binary_op_node,
-      {}));
+      {},
+      {{graph.sizes_pc_of(out),
+        graph.sizes_pc_of(arg1),
+        graph.sizes_pc_of(arg2),
+        PushConstantDataInfo(&binary_ops_params, sizeof(binary_ops_params))}}));
 }
 
 #define DEFINE_BINARY_OP_WITH_ALPHA_FN(op_name)                          \
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
index 77a0458d901..604ad26588d 100644
--- a/backends/vulkan/test/vulkan_compute_api_test.cpp
+++ b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -1601,9 +1601,7 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
   auto addFn = VK_GET_OP_FN("aten.add.Tensor");
   addFn(graph, {a.value, b.value, kDummyValueRef, c});
 
-  // +2: alpha UBO, broadcast UBO for arithmetic shader
-  // +1: t.sizes_ubo() for arithmetic shader output c
-  expected_vma_allocation_count += 3;
+  // no new allocations if binary op uses push constants
   EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
 
   IOValueRef d = graph.add_input_tensor(
@@ -1624,17 +1622,16 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
   auto mulFn = VK_GET_OP_FN("aten.mul.Tensor");
   mulFn(graph, {c, d.value, e});
 
-  // +2: alpha UBO, broadcast UBO for arithmetic shader
-  // +1: t.sizes_ubo() for arithmetic shader output e
-  expected_vma_allocation_count += 3;
+  // no new allocations if binary op uses push constants
   EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
 
   IOValueRef out = {};
   out.value = e;
   out.staging = graph.set_output_tensor(out.value);
 
+  // +1: staging buffer for the input tensor
   // +1: staging buffer for the output tensor
-  expected_vma_allocation_count += 1;
+  expected_vma_allocation_count += 2;
   EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count);
 
   graph.prepare();

From 13587c28155d86970bfe5a82678e4a7cdddd4db3 Mon Sep 17 00:00:00 2001
From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com>
Date: Tue, 17 Dec 2024 11:56:38 -0800
Subject: [PATCH 2/2] [ET-VK] Replace Uniform buffers with push constants for
 permute op

Pull Request resolved: https://github.com/pytorch/executorch/pull/7231

This diff replaces uniform buffers with push constants for the permute op in the Vulkan backend of ExecuTorch. The changes include updating the GLSL code to declare a push constant block instead of uniform buffers, and updating the C++ code to pass the output limits, sizes, output dims and channel info to the shader as push constants.
ghstack-source-id: 258575396
@exported-using-ghexport

Differential Revision: [D66890825](https://our.internmc.facebook.com/intern/diff/D66890825/)
---
 .../vulkan/runtime/graph/ops/glsl/permute.glsl | 12 +++---------
 .../vulkan/runtime/graph/ops/impl/Permute.cpp  | 18 +++++++-----------
 2 files changed, 10 insertions(+), 20 deletions(-)

diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute.glsl b/backends/vulkan/runtime/graph/ops/glsl/permute.glsl
index 5378099d03f..59d6aecdc15 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/permute.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/permute.glsl
@@ -19,15 +19,9 @@ layout(std430) buffer;
 layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
 layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} image_in;
 
-layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
-  ivec3 out_limits;
-};
-
-layout(set = 0, binding = 3) uniform PRECISION restrict Sizes {
+layout(push_constant) uniform PRECISION restrict Block {
+  ivec4 out_limits;
   ivec4 sizes;
-};
-
-layout(set = 0, binding = 4) uniform PRECISION restrict Block {
   // output dims
   ivec4 out_ndims;
   // x = output channels aligned to 4, y = input channels aligned to 4
@@ -41,7 +35,7 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 void main() {
   const u16vec3 pos = u16vec3(gl_GlobalInvocationID);
 
-  if (any(greaterThanEqual(pos, out_limits))) {
+  if (any(greaterThanEqual(pos, out_limits.xyz))) {
     return;
   }
 
diff --git a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp
index c107f288f34..a56925751e7 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp
@@ -75,13 +75,7 @@ void add_permute_node(
   int32_t out_c_aligned = utils::align_up_4(out_channels);
   int32_t in_c_aligned = utils::align_up_4(in_channels);
 
-  const struct Block final {
-    ivec4 out_ndims;
-    ivec2 ch_info;
-  } params{
-      out_dims,
-      {out_c_aligned, in_c_aligned},
-  };
+  const ivec2 ch_info = {out_c_aligned, in_c_aligned};
 
   graph.execute_nodes().emplace_back(new DispatchNode(
       graph,
@@ -90,14 +84,16 @@ void add_permute_node(
       graph.create_local_wg_size(out),
       {{out, vkapi::MemoryAccessType::WRITE},
        {in, vkapi::MemoryAccessType::READ}},
-      {t_out->logical_limits_ubo(),
-       t_out->sizes_ubo(),
-       graph.create_params_buffer(params)},
+      {},
       // Specialization Constants
       {},
       // Resizing Logic
       nullptr,
-      {}));
+      {},
+      {{graph.logical_limits_pc_of(out),
+        graph.sizes_pc_of(out),
+        PushConstantDataInfo(&out_dims, sizeof(out_dims)),
+        PushConstantDataInfo(&ch_info, sizeof(ch_info))}}));
 }
 
 void add_permute_node(