From cb69a5b02562e02994415fd2289601e3965385ac Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Wed, 28 May 2025 07:09:34 -0700 Subject: [PATCH 1/9] [ET-VK] De vectorise conv2d pw shader to improve perf. Pull Request resolved: https://github.com/pytorch/executorch/pull/11108 This diff optimizes the performance of the `conv2d_pw` shader by de-vectorizing its implementation. * The original vectorized implementation of the `conv2d_pw` shader has been replaced with a de-vectorized approach to improve performance. * The `sum` array has been redefined to hold `float` values instead of `vec4` to accommodate the de-vectorized computation. These changes seem to allow shader compiler to better optimize operations within the shader hence improving perf. ghstack-source-id: 286652100 @exported-using-ghexport Differential Revision: [D75307267](https://our.internmc.facebook.com/intern/diff/D75307267/) --- .../runtime/graph/ops/glsl/conv2d_pw.glsl | 50 +++++++++++++------ 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl index 468b91f0535..0ee7b94a59a 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl @@ -88,10 +88,18 @@ void main() { ipos[i] = pos[i] * stride - padding; } - vec4 sum[TILE_SIZE_X * TILE_SIZE_Y]; - sum[0] = texelFetch(t_bias, ivec2(gpos.z, 0), 0); - for (int i = 1; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { - sum[i] = sum[0]; + // Final output array where each element is a tensor value. + // Tuple of consecutive 4 elements represents a single output texel. + float sum[TILE_SIZE_X * TILE_SIZE_Y * 4]; + + const vec4 bias = texelFetch(t_bias, ivec2(gpos.z, 0), 0); + + // Initialize the output array with the bias value + for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y * 4; i += 4) { + sum[i] = bias.x; + sum[i + 1] = bias.y; + sum[i + 2] = bias.z; + sum[i + 3] = bias.w; } int z4 = 0; @@ -100,14 +108,26 @@ void main() { // During prepacking, the weight tensor has been permuted so that the // channel (IC) dim is along the x-axis, and the batch (OC) dim is along // the z-axis. - const vec4 ktex_0 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(0, 0)); - const vec4 ktex_1 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(1, 0)); - const vec4 ktex_2 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(2, 0)); - const vec4 ktex_3 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(3, 0)); + float kernel_values[4 * 4]; // 4 channels, 4 elements per channel + + // Load kernel values from texels to array + for (int i = 0; i < 4; ++i) { + const vec4 k_tex = texelFetch(t_kernel, ivec2(z + i, gpos.z), 0); + kernel_values[i * 4 + 0] = k_tex.x; + kernel_values[i * 4 + 1] = k_tex.y; + kernel_values[i * 4 + 2] = k_tex.z; + kernel_values[i * 4 + 3] = k_tex.w; + } -#pragma unroll for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { const vec4 in_tex = texelFetch(t_in, ivec3(ipos[i], z4), 0); + // Load the input texel into an array + float tex_values[4]; + tex_values[0] = in_tex.x; + tex_values[1] = in_tex.y; + tex_values[2] = in_tex.z; + tex_values[3] = in_tex.w; + // For 2x2 tile size algorithm works as follows. // To explain the calculations below, the contents of one in_tex and the // group of 4 texels loaded from t_kernel are shown: @@ -141,10 +161,12 @@ void main() { // // which is what is expressed in the following calculations. 
This is done // for each output position. - sum[i] = fma(in_tex.xxxx, ktex_0, sum[i]); - sum[i] = fma(in_tex.yyyy, ktex_1, sum[i]); - sum[i] = fma(in_tex.zzzz, ktex_2, sum[i]); - sum[i] = fma(in_tex.wwww, ktex_3, sum[i]); + for (int j = 0; j < 4; ++j) { + sum[i * 4 + j] = tex_values[0] * kernel_values[0 + j] + sum[i * 4 + j]; + sum[i * 4 + j] = tex_values[1] * kernel_values[4 + j] + sum[i * 4 + j]; + sum[i * 4 + j] = tex_values[2] * kernel_values[8 + j] + sum[i * 4 + j]; + sum[i * 4 + j] = tex_values[3] * kernel_values[12 + j] + sum[i * 4 + j]; + } } } @@ -152,7 +174,7 @@ void main() { const uint index = (shared_mem_stride * i) + gl_LocalInvocationIndex; const ivec3 pos = pos_shared[offset_pos_index(index)]; if (all(lessThan(pos, out_limits.xyz))) { - imageStore(t_out, pos, op(sum[i], out_min, out_max)); + imageStore(t_out, pos, op(vec4(sum[i * 4], sum[i * 4 + 1], sum[i * 4 + 2], sum[i * 4 + 3]), out_min, out_max)); } } } From 83ad3d323964efebb627d25c63a4622c5a54e503 Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Wed, 28 May 2025 07:09:37 -0700 Subject: [PATCH 2/9] [ET-VK] Remove the use of shared memory in conv2d pw to improve perf. Pull Request resolved: https://github.com/pytorch/executorch/pull/11110 This diff removes the use of shared memory in the conv2d pw (pointwise) operation to improve performance. ghstack-source-id: 286652103 Differential Revision: [D75316188](https://our.internmc.facebook.com/intern/diff/D75316188/) --- .../runtime/graph/ops/glsl/conv2d_pw.glsl | 32 +++++++------------ 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl index 0ee7b94a59a..552037247fd 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl @@ -14,7 +14,6 @@ #define TILE_SIZE_X ${TILE_SIZE_X} #define TILE_SIZE_Y ${TILE_SIZE_Y} -#define LOCAL_WG_SIZE 64 #define op(X, A, B) ${OPERATOR} @@ -39,11 +38,6 @@ layout(push_constant) uniform restrict Block { layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; -// For performance improvement, reduce register usage by caching positions in shared memory. -// Offset index by 1 every 16 points to avoid bank access conflict. -#define offset_pos_index(index) (index + ((index) >> 4)) -shared ivec3 pos_shared[offset_pos_index(LOCAL_WG_SIZE * TILE_SIZE_X * TILE_SIZE_Y)]; - /* * Computes a 2D pointwise convolution of an NxN output tile. Calculating an * output tile for pointwise convolution is more efficient because the kernel @@ -51,7 +45,6 @@ shared ivec3 pos_shared[offset_pos_index(LOCAL_WG_SIZE * TILE_SIZE_X * TILE_SIZE */ void main() { const ivec2 out_limits_scaled = (out_limits.xy + ivec2(TILE_SIZE_X - 1, TILE_SIZE_Y - 1)) / ivec2(TILE_SIZE_X, TILE_SIZE_Y); - const uint shared_mem_stride = LOCAL_WG_SIZE; const uint div_by_x = gl_GlobalInvocationID.x / out_limits_scaled.x; const ivec3 gpos = ivec3( @@ -59,33 +52,32 @@ void main() { div_by_x % out_limits_scaled.y, div_by_x / out_limits_scaled.y); + // If the top left position is out of bounds, then this invocation will have + // no work to do. 
+ if (gpos.z >= out_limits.z) { + return; + } + // Output position for TILE_SIZE = 2 // +--------+--------+ // | pos[0] | pos[1] | // +--------+--------+ // | pos[2] | pos[3] | // +--------+--------+ - ivec2 pos[TILE_SIZE_X * TILE_SIZE_Y]; + ivec3 pos[TILE_SIZE_X * TILE_SIZE_Y]; for (int y = 0, i = 0; y < TILE_SIZE_Y; ++y) { for (int x = 0; x < TILE_SIZE_X; ++x) { - pos[i] = ivec2(gpos.x * TILE_SIZE_X + x, gpos.y * TILE_SIZE_Y + y); - pos_shared[offset_pos_index((shared_mem_stride * i) + gl_LocalInvocationIndex)] = ivec3(pos[i], gpos.z); + pos[i] = ivec3(gpos.x * TILE_SIZE_X + x, gpos.y * TILE_SIZE_Y + y, gpos.z); i++; } } - // If the top left position is out of bounds, then this invocation will have - // no work to do. - if (gpos.z >= out_limits.z) { - return; - } - // Compute the index of the input texture that needs to be loaded for each // output position. Note that negative indices can be produced indicating that // the top-left element is in a region added by padding. ivec2 ipos[TILE_SIZE_X * TILE_SIZE_Y]; for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { - ipos[i] = pos[i] * stride - padding; + ipos[i] = pos[i].xy * stride - padding; } // Final output array where each element is a tensor value. @@ -171,10 +163,8 @@ void main() { } for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { - const uint index = (shared_mem_stride * i) + gl_LocalInvocationIndex; - const ivec3 pos = pos_shared[offset_pos_index(index)]; - if (all(lessThan(pos, out_limits.xyz))) { - imageStore(t_out, pos, op(vec4(sum[i * 4], sum[i * 4 + 1], sum[i * 4 + 2], sum[i * 4 + 3]), out_min, out_max)); + if (all(lessThan(pos[i], out_limits.xyz))) { + imageStore(t_out, pos[i], op(vec4(sum[i * 4], sum[i * 4 + 1], sum[i * 4 + 2], sum[i * 4 + 3]), out_min, out_max)); } } } From 843f7d7be27659bbfb505dc9dbbd61710120e19f Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Wed, 28 May 2025 07:09:39 -0700 Subject: [PATCH 3/9] [ET-VK] Tuning conv 2d pw op tile size to improve perf. Pull Request resolved: https://github.com/pytorch/executorch/pull/11112 This diff tunes the tile size for the conv 2d pw op to improve performance. The changes include updating the `TILE_SIZE_X` and `TILE_SIZE_Y` values in the `conv2d_pw.yaml` files and modifying the `Convolution.cpp` files to adjust the image extents calculation. The `TILE_SIZE_X` value is changed from 2 to 1, and the `TILE_SIZE_Y` value is changed from 2 to 4. 
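As a sanity check on the new extents, here is a minimal C++ sketch of the global workgroup size calculation with the 1x4 tile; `div_up` is assumed to behave like the `utils::div_up` helper used in `Convolution.cpp`, and the sample extents are made up:

```cpp
#include <iostream>

// Round-up integer division, assumed to match utils::div_up.
constexpr unsigned div_up(unsigned n, unsigned d) {
  return (n + d - 1) / d;
}

int main() {
  // Hypothetical output image extents (width, height, depth).
  const unsigned extents[3] = {128, 130, 8};
  // With TILE_SIZE_X = 1 and TILE_SIZE_Y = 4, each invocation now covers a
  // 1x4 column of output texels: the X extent is unchanged and the Y extent
  // shrinks by 4x (rounded up).
  const unsigned wg[3] = {
      div_up(extents[0], 1u), div_up(extents[1], 4u), extents[2]};
  std::cout << wg[0] << " x " << wg[1] << " x " << wg[2] << "\n"; // 128 x 33 x 8
}
```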
ghstack-source-id: 286652109 Differential Revision: [D75317820](https://our.internmc.facebook.com/intern/diff/D75317820/) --- backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.yaml | 4 ++-- backends/vulkan/runtime/graph/ops/impl/Convolution.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.yaml index 1f0e8fb71be..d4cb69d7648 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.yaml @@ -9,8 +9,8 @@ conv2d_pw: OPERATOR: X NDIM: 3 DTYPE: float - TILE_SIZE_X: 2 - TILE_SIZE_Y: 2 + TILE_SIZE_X: 1 + TILE_SIZE_Y: 4 generate_variant_forall: DTYPE: - VALUE: half diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index 381b9de0d6a..a0ac58ea9bc 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -305,8 +305,8 @@ utils::uvec3 create_conv2d_global_wg_size( if (method == Conv2dMethod::Pointwise) { const utils::uvec3 image_extents = graph.logical_limits_of(out); return { - utils::div_up(image_extents[0u], 2u), - utils::div_up(image_extents[1u], 2u), + utils::div_up(image_extents[0u], 1u), + utils::div_up(image_extents[1u], 4u), image_extents[2u]}; } else if (method == Conv2dMethod::Depthwise && stride_equals_dilation) { const utils::uvec3 image_extents = graph.create_global_wg_size(out); From c4780f3d1893cd810b9fcfab8e086ce9633c87d3 Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Wed, 28 May 2025 07:09:42 -0700 Subject: [PATCH 4/9] [ET-VK] Minor tuning for conv2d pw op to improve performance. Pull Request resolved: https://github.com/pytorch/executorch/pull/11113 The diff introduces minor tuning for the Conv2d pointwise (PW) operation in the Vulkan backend to improve performance. Conv 2d pw now issues a 2D dispatch instead of 1D, where dispatch axis y is now sized based on output texture's batch size. ghstack-source-id: 286652099 Differential Revision: [D75251145](https://our.internmc.facebook.com/intern/diff/D75251145/) --- backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl | 6 +++--- backends/vulkan/runtime/graph/ops/impl/Convolution.cpp | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl index 552037247fd..e44a41fc9bc 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl @@ -49,12 +49,12 @@ void main() { const uint div_by_x = gl_GlobalInvocationID.x / out_limits_scaled.x; const ivec3 gpos = ivec3( gl_GlobalInvocationID.x % out_limits_scaled.x, - div_by_x % out_limits_scaled.y, - div_by_x / out_limits_scaled.y); + div_by_x, + gl_GlobalInvocationID.y); // If the top left position is out of bounds, then this invocation will have // no work to do. 
- if (gpos.z >= out_limits.z) { + if (gpos.y >= out_limits_scaled.y || gpos.z >= out_limits.z) { return; } diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index a0ac58ea9bc..5250c3baef2 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -398,8 +398,10 @@ void add_conv2d_node( utils::uvec3 wg_size = create_conv2d_global_wg_size( graph, method, out, weight_data, stride_equals_dilation); - if (method == Conv2dMethod::Pointwise || method == Conv2dMethod::Depthwise) { + if (method == Conv2dMethod::Depthwise) { wg_size = {wg_size[0] * wg_size[1] * wg_size[2], 1, 1}; + } else if (method == Conv2dMethod::Pointwise) { + wg_size = {wg_size[0] * wg_size[1], wg_size[2], 1}; } vkapi::ParamsBindList param_buffers; From 8b3eba79836670a0a96404f4ed8a56a872153b1f Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Wed, 28 May 2025 07:09:44 -0700 Subject: [PATCH 5/9] [ET-VK] De vectorise positions in conv2d pw shader to improve perf. Pull Request resolved: https://github.com/pytorch/executorch/pull/11122 This improves the performance of the conv2d pw shader by de-vectorizing position storage. The optimization involved replacing the `ivec3 pos` array with a plain `int pos` array to store the position values. The `x` and `y` coordinates are now stored in separate elements of the array instead of being stored together in an `ivec3`. This change allows for more efficient memory access and computation. ghstack-source-id: 286652097 @exported-using-ghexport Differential Revision: [D75335802](https://our.internmc.facebook.com/intern/diff/D75335802/) --- .../runtime/graph/ops/glsl/conv2d_pw.glsl | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl index e44a41fc9bc..ed07979afc0 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl @@ -64,10 +64,11 @@ void main() { // +--------+--------+ // | pos[2] | pos[3] | // +--------+--------+ - ivec3 pos[TILE_SIZE_X * TILE_SIZE_Y]; + int pos[TILE_SIZE_X * TILE_SIZE_Y * 2]; for (int y = 0, i = 0; y < TILE_SIZE_Y; ++y) { for (int x = 0; x < TILE_SIZE_X; ++x) { - pos[i] = ivec3(gpos.x * TILE_SIZE_X + x, gpos.y * TILE_SIZE_Y + y, gpos.z); + pos[i * 2] = gpos.x * TILE_SIZE_X + x; + pos[i * 2 + 1] = gpos.y * TILE_SIZE_Y + y; i++; } } @@ -75,9 +76,10 @@ void main() { // Compute the index of the input texture that needs to be loaded for each // output position. Note that negative indices can be produced indicating that // the top-left element is in a region added by padding. - ivec2 ipos[TILE_SIZE_X * TILE_SIZE_Y]; + int ipos[TILE_SIZE_X * TILE_SIZE_Y * 2]; for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { - ipos[i] = pos[i].xy * stride - padding; + ipos[i * 2] = pos[i * 2] * stride.x - padding.x; + ipos[i * 2 + 1] = pos[i * 2 + 1] * stride.y - padding.y; } // Final output array where each element is a tensor value. 
@@ -112,7 +114,7 @@ void main() { } for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { - const vec4 in_tex = texelFetch(t_in, ivec3(ipos[i], z4), 0); + const vec4 in_tex = texelFetch(t_in, ivec3(ipos[i * 2], ipos[i * 2 + 1], z4), 0); // Load the input texel into an array float tex_values[4]; tex_values[0] = in_tex.x; @@ -163,8 +165,9 @@ void main() { } for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { - if (all(lessThan(pos[i], out_limits.xyz))) { - imageStore(t_out, pos[i], op(vec4(sum[i * 4], sum[i * 4 + 1], sum[i * 4 + 2], sum[i * 4 + 3]), out_min, out_max)); + const ivec3 pos_l = ivec3(pos[i * 2], pos[i * 2 + 1], gpos.z); + if (all(lessThan(pos_l, out_limits.xyz))) { + imageStore(t_out, pos_l, op(vec4(sum[i * 4], sum[i * 4 + 1], sum[i * 4 + 2], sum[i * 4 + 3]), out_min, out_max)); } } } From 73481011b97fa9d7800814d5f95c9f47f153f7ac Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Wed, 28 May 2025 07:09:47 -0700 Subject: [PATCH 6/9] [ET-VK] Minor unroll tuning to improve conv2d pw perf. Pull Request resolved: https://github.com/pytorch/executorch/pull/11134 This diff provides a minor unroll tuning to improve the performance of the conv2d pointwise (pw) operation in the Executorch Vulkan backend. ghstack-source-id: 286652101 @exported-using-ghexport Differential Revision: [D75420510](https://our.internmc.facebook.com/intern/diff/D75420510/) --- backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl index ed07979afc0..c090c5d344f 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl @@ -38,6 +38,8 @@ layout(push_constant) uniform restrict Block { layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; +#extension GL_EXT_control_flow_attributes : require + /* * Computes a 2D pointwise convolution of an NxN output tile. Calculating an * output tile for pointwise convolution is more efficient because the kernel @@ -105,7 +107,7 @@ void main() { float kernel_values[4 * 4]; // 4 channels, 4 elements per channel // Load kernel values from texels to array - for (int i = 0; i < 4; ++i) { + [[unroll]] for (int i = 0; i < 4; ++i) { const vec4 k_tex = texelFetch(t_kernel, ivec2(z + i, gpos.z), 0); kernel_values[i * 4 + 0] = k_tex.x; kernel_values[i * 4 + 1] = k_tex.y; From e275147af8482b36dfc7aece327b961cbcdcfdc8 Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Wed, 28 May 2025 07:09:48 -0700 Subject: [PATCH 7/9] [ET-VK] Tuning local workgroup size calculation for conv2d pw to improve performance. Pull Request resolved: https://github.com/pytorch/executorch/pull/11135 This diff adjusts the local workgroup size (`local_wg_size`) based on batch count (stored in `wg_size[1]`), to improve conv2d pw performance. * If `wg_size[1]` is a multiple of 8, `local_wg_size_y` is set to 8. * If `wg_size[1]` is a multiple of 4, `local_wg_size_y` is set to 4. * If `wg_size[1]` is a multiple of 2, `local_wg_size_y` is set to 2. * Otherwise, we default to `local_wg_size_y` = 1. The dispatch size in 2 dimensions is then calculate based on `{64 / local_wg_size_y, local_wg_size_y, 1}`. 
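A self-contained C++ sketch of this selection logic, with hypothetical values for `wg_size[1]` to show the resulting local sizes (it mirrors the logic in the diff below rather than reproducing it exactly):

```cpp
#include <cstdio>
#include <initializer_list>

struct LocalWgSize { unsigned x, y, z; };

// Pick the largest local_wg_size_y in {8, 4, 2} that evenly divides the Y
// dispatch extent, defaulting to 1, and keep the total local size at 64.
LocalWgSize pick_local_wg_size(unsigned wg_size_y) {
  unsigned y = 1;
  if (wg_size_y % 8 == 0) {
    y = 8;
  } else if (wg_size_y % 4 == 0) {
    y = 4;
  } else if (wg_size_y % 2 == 0) {
    y = 2;
  }
  return {64 / y, y, 1};
}

int main() {
  for (unsigned n : {1u, 2u, 4u, 8u, 6u}) {
    const LocalWgSize l = pick_local_wg_size(n);
    std::printf("wg_size[1] = %u -> {%u, %u, %u}\n", n, l.x, l.y, l.z);
  }
}
```

Keeping the product fixed at 64 invocations means the X dimension absorbs whatever Y does not use, so the total local size stays the same regardless of the batch count.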
ghstack-source-id: 286652105 @exported-using-ghexport Differential Revision: [D75420517](https://our.internmc.facebook.com/intern/diff/D75420517/) --- .../runtime/graph/ops/impl/Convolution.cpp | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index 5250c3baef2..ba1f50a23c1 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -404,6 +404,21 @@ void add_conv2d_node( wg_size = {wg_size[0] * wg_size[1], wg_size[2], 1}; } + utils::uvec3 local_wg_size; + if (method == Conv2dMethod::Pointwise) { + uint32_t local_wg_size_y = 1; + if (wg_size[1] % 8 == 0) { + local_wg_size_y = 8; + } else if (wg_size[1] % 4 == 0) { + local_wg_size_y = 4; + } else if (wg_size[1] % 2 == 0) { + local_wg_size_y = 2; + } + local_wg_size = {64 / local_wg_size_y, local_wg_size_y, 1}; + } else { + local_wg_size = graph.create_local_wg_size(wg_size); + } + vkapi::ParamsBindList param_buffers; std::vector push_constants; if (method == Conv2dMethod::Pointwise) { @@ -464,7 +479,7 @@ void add_conv2d_node( graph, shader, wg_size, - graph.create_local_wg_size(wg_size), + local_wg_size, // Inputs and Outputs {{out, vkapi::kWrite}, {{in, arg_weight, arg_bias}, vkapi::kRead}}, // Shader params buffers From fab78a41707c652339279e9bba611e30e0359f55 Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Wed, 28 May 2025 07:09:50 -0700 Subject: [PATCH 8/9] [ET-VK] De vectorise all vectors in conv2d pw shader to improve perf. Pull Request resolved: https://github.com/pytorch/executorch/pull/11136 This diff improves the performance of the conv2d pw shader by de-vectorizing all vectors. ghstack-source-id: 286652098 @exported-using-ghexport Differential Revision: [D75423245](https://our.internmc.facebook.com/intern/diff/D75423245/) --- .../runtime/graph/ops/glsl/conv2d_pw.glsl | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl index c090c5d344f..c218b8ac8cc 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl @@ -46,17 +46,14 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; * size is only 1x1, making it easier to re-use loaded texels from t_kernel. */ void main() { - const ivec2 out_limits_scaled = (out_limits.xy + ivec2(TILE_SIZE_X - 1, TILE_SIZE_Y - 1)) / ivec2(TILE_SIZE_X, TILE_SIZE_Y); + const int out_limits_scaled[2] = {out_limits.x + (TILE_SIZE_X - 1) * TILE_SIZE_X, out_limits.y + (TILE_SIZE_Y - 1) * TILE_SIZE_Y}; - const uint div_by_x = gl_GlobalInvocationID.x / out_limits_scaled.x; - const ivec3 gpos = ivec3( - gl_GlobalInvocationID.x % out_limits_scaled.x, - div_by_x, - gl_GlobalInvocationID.y); + const int div_by_x = int(gl_GlobalInvocationID.x / out_limits_scaled[0]); + const int out_pos[3] = {int(gl_GlobalInvocationID.x % out_limits_scaled[0]), div_by_x, int(gl_GlobalInvocationID.y)}; // If the top left position is out of bounds, then this invocation will have // no work to do. 
- if (gpos.y >= out_limits_scaled.y || gpos.z >= out_limits.z) { + if (out_pos[1] >= out_limits_scaled[1] || out_pos[2] >= out_limits.z) { return; } @@ -69,8 +66,8 @@ void main() { int pos[TILE_SIZE_X * TILE_SIZE_Y * 2]; for (int y = 0, i = 0; y < TILE_SIZE_Y; ++y) { for (int x = 0; x < TILE_SIZE_X; ++x) { - pos[i * 2] = gpos.x * TILE_SIZE_X + x; - pos[i * 2 + 1] = gpos.y * TILE_SIZE_Y + y; + pos[i * 2] = out_pos[0] * TILE_SIZE_X + x; + pos[i * 2 + 1] = out_pos[1] * TILE_SIZE_Y + y; i++; } } @@ -88,7 +85,7 @@ void main() { // Tuple of consecutive 4 elements represents a single output texel. float sum[TILE_SIZE_X * TILE_SIZE_Y * 4]; - const vec4 bias = texelFetch(t_bias, ivec2(gpos.z, 0), 0); + const vec4 bias = texelFetch(t_bias, ivec2(out_pos[2], 0), 0); // Initialize the output array with the bias value for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y * 4; i += 4) { @@ -108,7 +105,7 @@ void main() { // Load kernel values from texels to array [[unroll]] for (int i = 0; i < 4; ++i) { - const vec4 k_tex = texelFetch(t_kernel, ivec2(z + i, gpos.z), 0); + const vec4 k_tex = texelFetch(t_kernel, ivec2(z + i, out_pos[2]), 0); kernel_values[i * 4 + 0] = k_tex.x; kernel_values[i * 4 + 1] = k_tex.y; kernel_values[i * 4 + 2] = k_tex.z; @@ -167,7 +164,7 @@ void main() { } for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { - const ivec3 pos_l = ivec3(pos[i * 2], pos[i * 2 + 1], gpos.z); + const ivec3 pos_l = ivec3(pos[i * 2], pos[i * 2 + 1], out_pos[2]); if (all(lessThan(pos_l, out_limits.xyz))) { imageStore(t_out, pos_l, op(vec4(sum[i * 4], sum[i * 4 + 1], sum[i * 4 + 2], sum[i * 4 + 3]), out_min, out_max)); } From a5ac0562fb3365ace4d2542d4a33d001a3f98131 Mon Sep 17 00:00:00 2001 From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com> Date: Wed, 28 May 2025 07:09:52 -0700 Subject: [PATCH 9/9] [ET-VK] Creating specialized version of conv2d pw shader for X and Y stride = 1 and padding = 0. Pull Request resolved: https://github.com/pytorch/executorch/pull/11137 This diff creates a specialized version of the conv2d pw shader for X and Y stride equals 1 and padding equals 0. * It adds a new file `conv2d_pw_s1p0.glsl`, which implements the conv2d pw shader for X and Y stride equals 1 and padding equals 0. * It adds a new file `conv2d_pw_s1p0.yaml`, which defines the parameters and shader variants for the specialized conv2d pw shader. * The file `Convolution.cpp` is modified to add a new parameter `stride_1_padding_0` to the `conv2d` function, which enables the use of the specialized shader. ghstack-source-id: 286652107 @exported-using-ghexport Differential Revision: [D75423931](https://our.internmc.facebook.com/intern/diff/D75423931/) --- .../graph/ops/glsl/conv2d_pw_s1p0.glsl | 163 ++++++++++++++++++ .../graph/ops/glsl/conv2d_pw_s1p0.yaml | 21 +++ .../runtime/graph/ops/impl/Convolution.cpp | 12 +- 3 files changed, 193 insertions(+), 3 deletions(-) create mode 100644 backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.glsl create mode 100644 backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.yaml diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.glsl new file mode 100644 index 00000000000..36c7a61eb3d --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.glsl @@ -0,0 +1,163 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} + +#define TILE_SIZE_X ${TILE_SIZE_X} +#define TILE_SIZE_Y ${TILE_SIZE_Y} + +#define op(X, A, B) ${OPERATOR} + +#include "indexing_utils.h" + +layout(std430) buffer; + +${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")} +${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")} +${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")} +${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")} + +layout(push_constant) uniform restrict Block { + ivec4 out_limits; + ivec2 stride; + ivec2 padding; + int in_group_size; + int dummy_padding; + float out_min; + float out_max; +}; + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +#extension GL_EXT_control_flow_attributes : require + +/* + * Computes a 2D pointwise convolution of an NxN output tile. Calculating an + * output tile for pointwise convolution is more efficient because the kernel + * size is only 1x1, making it easier to re-use loaded texels from t_kernel. + */ +void main() { + const int out_limits_scaled[2] = {out_limits.x + (TILE_SIZE_X - 1) * TILE_SIZE_X, out_limits.y + (TILE_SIZE_Y - 1) * TILE_SIZE_Y}; + + const int div_by_x = int(gl_GlobalInvocationID.x / out_limits_scaled[0]); + const int out_pos[3] = {int(gl_GlobalInvocationID.x % out_limits_scaled[0]), div_by_x, int(gl_GlobalInvocationID.y)}; + + // If the top left position is out of bounds, then this invocation will have + // no work to do. + if (out_pos[1] >= out_limits_scaled[1] || out_pos[2] >= out_limits.z) { + return; + } + + // Output position for TILE_SIZE = 2 + // +--------+--------+ + // | pos[0] | pos[1] | + // +--------+--------+ + // | pos[2] | pos[3] | + // +--------+--------+ + int pos[TILE_SIZE_X * TILE_SIZE_Y * 2]; + for (int y = 0, i = 0; y < TILE_SIZE_Y; ++y) { + for (int x = 0; x < TILE_SIZE_X; ++x) { + pos[i * 2] = out_pos[0] * TILE_SIZE_X + x; + pos[i * 2 + 1] = out_pos[1] * TILE_SIZE_Y + y; + i++; + } + } + + // Final output array where each element is a tensor value. + // Tuple of consecutive 4 elements represents a single output texel. + float sum[TILE_SIZE_X * TILE_SIZE_Y * 4]; + + const vec4 bias = texelFetch(t_bias, ivec2(out_pos[2], 0), 0); + + // Initialize the output array with the bias value + for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y * 4; i += 4) { + sum[i] = bias.x; + sum[i + 1] = bias.y; + sum[i + 2] = bias.z; + sum[i + 3] = bias.w; + } + + int z4 = 0; + // Since the kernel is 1x1, we only have to loop over the depth dimension. + for (int z = 0; z < in_group_size; z += 4, ++z4) { + // During prepacking, the weight tensor has been permuted so that the + // channel (IC) dim is along the x-axis, and the batch (OC) dim is along + // the z-axis. 
+ float kernel_values[4 * 4]; // 4 channels, 4 elements per channel + + // Load kernel values from texels to array + [[unroll]] for (int i = 0; i < 4; ++i) { + const vec4 k_tex = texelFetch(t_kernel, ivec2(z + i, out_pos[2]), 0); + kernel_values[i * 4 + 0] = k_tex.x; + kernel_values[i * 4 + 1] = k_tex.y; + kernel_values[i * 4 + 2] = k_tex.z; + kernel_values[i * 4 + 3] = k_tex.w; + } + + for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { + const vec4 in_tex = texelFetch(t_in, ivec3(pos[i * 2], pos[i * 2 + 1], z4), 0); + // Load the input texel into an array + float tex_values[4]; + tex_values[0] = in_tex.x; + tex_values[1] = in_tex.y; + tex_values[2] = in_tex.z; + tex_values[3] = in_tex.w; + + // For 2x2 tile size algorithm works as follows. + // To explain the calculations below, the contents of one in_tex and the + // group of 4 texels loaded from t_kernel are shown: + // + // in_tex t_kernel + // -x-> ---x---> + // +---+ +----+----+----+----+ + // ^ | w | ^ | D0 | D1 | D2 | D3 | + // | +---+ | +----+----+----+----+ + // | | z | | | C0 | C1 | C2 | C3 | + // z +---+ z +----+----+----+----+ + // | | y | | | B0 | B2 | B2 | B3 | + // | +---+ | +----+----+----+----+ + // | x | | A0 | A1 | A2 | A3 | + // +---+ +----+----+----+----+ + // + // In the t_kernel graphic, cells sharing the same letter are from + // the same batch/output channel index, and the number denotes a unique + // channel index. To calculate the output texel, the following + // calculation is performed: + // + // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ + // | x | | D0 | | y | | D1 | | z | | D2 | | w | | D3 | + // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ + // | x | | C0 | | y | | C1 | | z | | C2 | | w | | C3 | + // +---+X+----+ + +---+X+----+ + +---+X+----+ + +---+X+----+ + // | x | | B0 | | y | | B1 | | z | | B2 | | w | | B3 | + // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ + // | x | | A0 | | y | | A1 | | z | | A2 | | w | | A3 | + // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ + // + // which is what is expressed in the following calculations. This is done + // for each output position. + for (int j = 0; j < 4; ++j) { + sum[i * 4 + j] = tex_values[0] * kernel_values[0 + j] + sum[i * 4 + j]; + sum[i * 4 + j] = tex_values[1] * kernel_values[4 + j] + sum[i * 4 + j]; + sum[i * 4 + j] = tex_values[2] * kernel_values[8 + j] + sum[i * 4 + j]; + sum[i * 4 + j] = tex_values[3] * kernel_values[12 + j] + sum[i * 4 + j]; + } + } + } + + for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { + const ivec3 pos_l = ivec3(pos[i * 2], pos[i * 2 + 1], out_pos[2]); + if (all(lessThan(pos_l, out_limits.xyz))) { + imageStore(t_out, pos_l, op(vec4(sum[i * 4], sum[i * 4 + 1], sum[i * 4 + 2], sum[i * 4 + 3]), out_min, out_max)); + } + } +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.yaml new file mode 100644 index 00000000000..ebfee11c405 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.yaml @@ -0,0 +1,21 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +conv2d_pw_s1p0: + parameter_names_with_default_values: + OPERATOR: X + NDIM: 3 + DTYPE: float + TILE_SIZE_X: 1 + TILE_SIZE_Y: 4 + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: conv2d_pw_s1p0 + - NAME: conv2d_pw_s1p0_clamp + OPERATOR: clamp(X, A, B) diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp index ba1f50a23c1..fbe4a61befc 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp @@ -127,7 +127,8 @@ vkapi::ShaderInfo get_conv2d_shader( const Conv2dMethod method, const ValueRef weight, const bool clamp_out = false, - const bool stride_equals_dilation = false) { + const bool stride_equals_dilation = false, + const bool stride_1_padding_0 = false) { std::string kernel_name; kernel_name.reserve(kShaderNameReserve); switch (method) { @@ -150,7 +151,7 @@ vkapi::ShaderInfo get_conv2d_shader( if (prepack_weights) { kernel_name = "conv2d"; } else { - kernel_name = "conv2d_pw"; + kernel_name = stride_1_padding_0 ? "conv2d_pw_s1p0" : "conv2d_pw"; } break; case Conv2dMethod::SlidingWindow: @@ -382,6 +383,10 @@ void add_conv2d_node( (kernel_params.stride[0] == kernel_params.dilation[0] && kernel_params.stride[1] == kernel_params.dilation[1]); + const bool stride_1_padding_0 = + (kernel_params.stride[0] == 1 && kernel_params.stride[1] == 1 && + kernel_params.padding[0] == 0 && kernel_params.padding[1] == 0); + OutputParams out_params = {out_min_val, out_max_val}; check_conv2d_params(kernel_params, transposed_val); @@ -393,7 +398,8 @@ void add_conv2d_node( method, weight_data, clamp_out, - stride_equals_dilation); + stride_equals_dilation, + stride_1_padding_0); utils::uvec3 wg_size = create_conv2d_global_wg_size( graph, method, out, weight_data, stride_equals_dilation);
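Taken together, the series replaces every `vec4` in the shader's hot loop with scalar arrays. To make that transformation concrete, here is a CPU-side C++ reference of the de-vectorized accumulation shared by `conv2d_pw` and `conv2d_pw_s1p0`; the flat array layout and the function signature are simplifications for illustration, not the packed texture layout the shaders actually read:

```cpp
#include <array>
#include <cstddef>
#include <vector>

// Scalar reference for one group of 4 output channels (oc4_index) of a 1x1
// pointwise convolution. Channels are grouped by 4 to mimic texel packing:
//   input:   [IC4][H][W][4]   flattened
//   weights: [OC4][IC4][4][4] flattened (kernel_values layout in the shader)
std::vector<float> pointwise_conv_reference(
    const std::vector<float>& input,
    const std::vector<float>& weights,
    const std::array<float, 4>& bias,
    int ic4, int h, int w, int oc4_index) {
  std::vector<float> out(static_cast<std::size_t>(h) * w * 4);
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      // Initialize the four output channels of this texel with the bias.
      float sum[4] = {bias[0], bias[1], bias[2], bias[3]};
      // 1x1 kernel: only loop over the (grouped-by-4) input channels.
      for (int z4 = 0; z4 < ic4; ++z4) {
        const float* in_tex =
            &input[((static_cast<std::size_t>(z4) * h + y) * w + x) * 4];
        const float* k =
            &weights[(static_cast<std::size_t>(oc4_index) * ic4 + z4) * 16];
        // De-vectorized equivalent of the original vectorized form:
        //   sum = fma(in_tex.xxxx, ktex_0, sum); ... fma(in_tex.wwww, ktex_3, sum);
        for (int j = 0; j < 4; ++j) {
          sum[j] += in_tex[0] * k[0 + j];
          sum[j] += in_tex[1] * k[4 + j];
          sum[j] += in_tex[2] * k[8 + j];
          sum[j] += in_tex[3] * k[12 + j];
        }
      }
      for (int j = 0; j < 4; ++j) {
        out[(static_cast<std::size_t>(y) * w + x) * 4 + j] = sum[j];
      }
    }
  }
  return out;
}
```

On the GPU the same arithmetic runs per invocation over a `TILE_SIZE_X` x `TILE_SIZE_Y` tile of output texels; the scalarization is what the commit messages credit with letting the shader compiler schedule the multiply-adds more effectively.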