Update on "[ET-VK] Using shared memory to save position in conv2d dw output op."

trivedivivek · trivedivivek · commit 82889245ca23 · 2025-01-24T12:32:29.000-08:00
This diff introduces a change to conv2d dw op to save output positions in shared memory, which reduces register usage and improves performance. Differential Revision: [D68400890](https://our.internmc.facebook.com/intern/diff/D68400890/) [ghstack-poisoned]
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
@@ -20,6 +20,8 @@
 
 #define BATCH_SIZE_Y ${BATCH_SIZE_Y}
 
+#define LOCAL_WG_SIZE 64
+
 #define op(X, A, B) ${OPERATOR}
 
 #include "indexing_utils.h"
@@ -38,12 +40,10 @@ ${layout_declare_ubo(8, "float", "out_min", "float", "out_max")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-// macro to offset shared memory access index. Padding position index by 1 offset per 16 positions avoidd bank access conflict and thus improves performance.
+// To improve performance, cache positions in shared memory to reduce register usage.
+// Pad the index by 1 for every 16 positions to avoid shared memory bank conflicts.
 #define offset_pos_index(index) (index + ((index) >> 4))
-
-// shared memory to hold calculated positions, this would reduce register usage thus improving performance.
-// 64 is the number of threads in the local wg
-shared ivec3 pos_shared[offset_pos_index(64)];
+shared ivec3 pos_shared[offset_pos_index(LOCAL_WG_SIZE)];
 
 /*
  * Computes a depthwise convolution. Each shader invocation calculates the
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl
@@ -43,11 +43,9 @@ layout(push_constant) uniform restrict Block {
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-// macro to offset shared memory access index. Padding position index by 1 offset per 16 positions avoidd bank access conflict and thus improves performance.
+// To improve performance, cache positions in shared memory to reduce register usage.
+// Pad the index by 1 for every 16 positions to avoid shared memory bank conflicts.
 #define offset_pos_index(index) (index + ((index) >> 4))
-
-// shared memory to hold calculated positions, this would reduce register usage thus improving performance.
-// 64 is the number of threads in the local wg
 shared ivec3 pos_shared[offset_pos_index(LOCAL_WG_SIZE * TILE_SIZE_X * TILE_SIZE_Y)];
 
 /*