diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
index c05c7e4450d..37d3505077f 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
@@ -38,6 +38,10 @@ ${layout_declare_ubo(8, "float", "out_min", "float", "out_max")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
+// shared memory to hold calculated positions, this would reduce register usage thus improving performance.
+// 64 is the number of threads in the local wg
+shared ivec3 pos_shared[64];
+
 /*
  * Computes a depthwise convolution. Each shader invocation calculates the
  * output at a single output location.
@@ -63,6 +67,8 @@ void main() {
     return;
   }
 
+  pos_shared[gl_LocalInvocationIndex] = pos;
+
   // Compute the index of the top-left element of the overlay region. Negative
   // indices indicate that the top-left element is in a region added by padding.
   const ivec2 ipos = pos.xy * stride - padding;
@@ -109,18 +115,19 @@ void main() {
       for (int j = 0; j < TILE_SIZE; j++, kx++) {
         prev_kernel_line[j] = texelFetch(t_kernel, ivec2(kx, pos.z), 0);
         for (int s = 0; s < BATCH_SIZE_X; s++) {
-            sum[0][s] = fma(in_texels[j + s], prev_kernel_line[j], sum[0][s]);
+          sum[0][s] = fma(in_texels[j + s], prev_kernel_line[j], sum[0][s]);
         }
       }
     }
   }
 
+  const ivec3 out_pos = pos_shared[gl_LocalInvocationIndex];
   for (int y = 0; y < BATCH_SIZE_Y; y++) {
     for (int x = 0; x < BATCH_SIZE_X; x++) {
-      if (any(greaterThanEqual(ivec3(pos.x + x, pos.y + y, pos.z), out_limits))) {
+      if (any(greaterThanEqual(ivec3(out_pos.x + x, out_pos.y + y, out_pos.z), out_limits))) {
         continue;
       }
-      imageStore(t_out, ivec3(pos.x + x, pos.y + y, pos.z), op(sum[y][x], out_min, out_max));
+      imageStore(t_out, ivec3(out_pos.x + x, out_pos.y + y, out_pos.z), op(sum[y][x], out_min, out_max));
     }
   }
 }