[ET-VK] Removed shared memory usage and simplified conv2d dw op shader to improve performance.

trivedivivek · trivedivivek · commit 7099bcdad6e3 · 2025-05-30T10:16:06.000-07:00
Pull Request resolved: #11178 This diff removes shared memory usage in the `conv2d_dw_output_tile.glsl` shader to improve performance. It also makes `sum` a one-dimensional array and moves bias application to just before the texel is stored. ghstack-source-id: 287222799 @exported-using-ghexport Differential Revision: [D75499165](https://our.internmc.facebook.com/intern/diff/D75499165/)
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
@@ -47,11 +47,6 @@ layout(push_constant) uniform restrict Block {
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-// For performance improvement, reduce register usage by caching positions in shared memory.
-// Offset index by 1 every 16 points to avoid bank access conflict.
-#define offset_pos_index(index) (index + ((index) >> 4))
-shared ivec3 pos_shared[offset_pos_index(LOCAL_WG_SIZE)];
-
 /*
  * Computes a depthwise convolution. Each shader invocation calculates the
  * output at a single output location.
@@ -77,8 +72,6 @@ void main() {
     return;
   }
 
-  pos_shared[offset_pos_index(gl_LocalInvocationIndex)] = pos;
-
   // Compute the index of the top-left element of the overlay region. Negative
   // indices indicate that the top-left element is in a region added by padding.
   const ivec2 ipos = pos.xy * stride - padding;
@@ -89,13 +82,10 @@ void main() {
   const ivec2 end = ipos + overlay_region.xy;
 
   // sum outputs
-  VEC4_T sum[BATCH_SIZE_Y][BATCH_SIZE_X];
+  VEC4_T sum[BATCH_SIZE_Y * BATCH_SIZE_X];
 
-  sum[0][0] = texelFetch(t_bias, ivec2(pos.z, 0), 0);
-  for (int y = 0; y < BATCH_SIZE_Y; y++) {
-    for (int x = 0; x < BATCH_SIZE_X; x++) {
-      sum[y][x] = sum[0][0];
-    }
+  for (int i = 0; i < BATCH_SIZE_Y * BATCH_SIZE_X; i++) {
+    sum[i] = VEC4_T(0);
   }
 
   // array to store input texels
@@ -115,7 +105,7 @@ void main() {
     if (i > 0) {
       for (int j = 0; j < TILE_SIZE; j++) {
         for (int s = 0; s < BATCH_SIZE_X; s++) {
-          sum[1][s] = fma(in_texels[j + s], prev_kernel_line[j], sum[1][s]);
+          sum[BATCH_SIZE_X + s] = fma(in_texels[j + s], prev_kernel_line[j], sum[BATCH_SIZE_X + s]);
         }
       }
     }
@@ -125,19 +115,19 @@ void main() {
       for (int j = 0; j < TILE_SIZE; j++, kx++) {
         prev_kernel_line[j] = texelFetch(t_kernel, ivec2(kx, pos.z), 0);
         for (int s = 0; s < BATCH_SIZE_X; s++) {
-          sum[0][s] = fma(in_texels[j + s], prev_kernel_line[j], sum[0][s]);
+          sum[s] = fma(in_texels[j + s], prev_kernel_line[j], sum[s]);
         }
       }
     }
   }
 
-  const ivec3 out_pos = pos_shared[offset_pos_index(gl_LocalInvocationIndex)];
+  const VEC4_T bias = texelFetch(t_bias, ivec2(pos.z, 0), 0);
   for (int y = 0; y < BATCH_SIZE_Y; y++) {
     for (int x = 0; x < BATCH_SIZE_X; x++) {
-      if (any(greaterThanEqual(ivec3(out_pos.x + x, out_pos.y + y, out_pos.z), out_limits.xyz))) {
-        continue;
+      const ivec3 out_pos = ivec3(pos.x + x, pos.y + y, pos.z);
+      if (all(lessThan(out_pos.xy, out_limits.xy))) {
+        imageStore(t_out, out_pos, op(sum[y * BATCH_SIZE_X + x] + bias, out_min, out_max));
       }
-      imageStore(t_out, ivec3(out_pos.x + x, out_pos.y + y, out_pos.z), op(sum[y][x], out_min, out_max));
     }
   }
 }

Original file line number	Diff line number	Diff line change
`@@ -47,11 +47,6 @@ layout(push_constant) uniform restrict Block {`
`47`	`47`
`48`	`48`	`layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;`
`49`	`49`
`50`		`-// For performance improvement, reduce register usage by caching positions in shared memory.`
`51`		`-// Offset index by 1 every 16 points to avoid bank access conflict.`
`52`		`-#define offset_pos_index(index) (index + ((index) >> 4))`
`53`		`-shared ivec3 pos_shared[offset_pos_index(LOCAL_WG_SIZE)];`
`54`		`-`
`55`	`50`	`/*`
`56`	`51`	`* Computes a depthwise convolution. Each shader invocation calculates the`
`57`	`52`	`* output at a single output location.`
`@@ -77,8 +72,6 @@ void main() {`
`77`	`72`	`return;`
`78`	`73`	`}`
`79`	`74`
`80`		`- pos_shared[offset_pos_index(gl_LocalInvocationIndex)] = pos;`
`81`		`-`
`82`	`75`	`// Compute the index of the top-left element of the overlay region. Negative`
`83`	`76`	`// indices indicate that the top-left element is in a region added by padding.`
`84`	`77`	`const ivec2 ipos = pos.xy * stride - padding;`
`@@ -89,13 +82,10 @@ void main() {`
`89`	`82`	`const ivec2 end = ipos + overlay_region.xy;`
`90`	`83`
`91`	`84`	`// sum outputs`
`92`		`- VEC4_T sum[BATCH_SIZE_Y][BATCH_SIZE_X];`
	`85`	`+ VEC4_T sum[BATCH_SIZE_Y * BATCH_SIZE_X];`
`93`	`86`
`94`		`- sum[0][0] = texelFetch(t_bias, ivec2(pos.z, 0), 0);`
`95`		`- for (int y = 0; y < BATCH_SIZE_Y; y++) {`
`96`		`- for (int x = 0; x < BATCH_SIZE_X; x++) {`
`97`		`- sum[y][x] = sum[0][0];`
`98`		`- }`
	`87`	`+ for (int i = 0; i < BATCH_SIZE_Y * BATCH_SIZE_X; i++) {`
	`88`	`+ sum[i] = VEC4_T(0);`
`99`	`89`	`}`
`100`	`90`
`101`	`91`	`// array to store input texels`
`@@ -115,7 +105,7 @@ void main() {`
`115`	`105`	`if (i > 0) {`
`116`	`106`	`for (int j = 0; j < TILE_SIZE; j++) {`
`117`	`107`	`for (int s = 0; s < BATCH_SIZE_X; s++) {`
`118`		`- sum[1][s] = fma(in_texels[j + s], prev_kernel_line[j], sum[1][s]);`
	`108`	`+ sum[BATCH_SIZE_X + s] = fma(in_texels[j + s], prev_kernel_line[j], sum[BATCH_SIZE_X + s]);`
`119`	`109`	`}`
`120`	`110`	`}`
`121`	`111`	`}`
`@@ -125,19 +115,19 @@ void main() {`
`125`	`115`	`for (int j = 0; j < TILE_SIZE; j++, kx++) {`
`126`	`116`	`prev_kernel_line[j] = texelFetch(t_kernel, ivec2(kx, pos.z), 0);`
`127`	`117`	`for (int s = 0; s < BATCH_SIZE_X; s++) {`
`128`		`- sum[0][s] = fma(in_texels[j + s], prev_kernel_line[j], sum[0][s]);`
	`118`	`+ sum[s] = fma(in_texels[j + s], prev_kernel_line[j], sum[s]);`
`129`	`119`	`}`
`130`	`120`	`}`
`131`	`121`	`}`
`132`	`122`	`}`
`133`	`123`
`134`		`- const ivec3 out_pos = pos_shared[offset_pos_index(gl_LocalInvocationIndex)];`
	`124`	`+ const VEC4_T bias = texelFetch(t_bias, ivec2(pos.z, 0), 0);`
`135`	`125`	`for (int y = 0; y < BATCH_SIZE_Y; y++) {`
`136`	`126`	`for (int x = 0; x < BATCH_SIZE_X; x++) {`
`137`		`- if (any(greaterThanEqual(ivec3(out_pos.x + x, out_pos.y + y, out_pos.z), out_limits.xyz))) {`
`138`		`- continue;`
	`127`	`+ const ivec3 out_pos = ivec3(pos.x + x, pos.y + y, pos.z);`
	`128`	`+ if (all(lessThan(out_pos.xy, out_limits.xy))) {`
	`129`	`+ imageStore(t_out, out_pos, op(sum[y * BATCH_SIZE_X + x] + bias, out_min, out_max));`
`139`	`130`	`}`
`140`		`- imageStore(t_out, ivec3(out_pos.x + x, out_pos.y + y, out_pos.z), op(sum[y][x], out_min, out_max));`
`141`	`131`	`}`
`142`	`132`	`}`
`143`	`133`	`}`