Update base for Update on "[ET-VK] Using TmpTensor for width packed versions of q_linear op shader to reduce memory usage."

trivedivivek · trivedivivek · commit 1c5ae1227575 · 2025-01-24T12:32:31.000-08:00
This diff uses temporary tensors (TmpTensor) in the width-packed versions of the q_linear op shader to reduce memory usage. Differential Revision: [D68561647](https://our.internmc.facebook.com/intern/diff/D68561647/) [ghstack-poisoned]
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
@@ -20,6 +20,8 @@
 
 #define BATCH_SIZE_Y ${BATCH_SIZE_Y}
 
+#define LOCAL_WG_SIZE 64
+
 #define op(X, A, B) ${OPERATOR}
 
 #include "indexing_utils.h"
@@ -47,12 +49,10 @@ layout(push_constant) uniform restrict Block {
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-// macro to offset shared memory access index. Padding position index by 1 offset per 16 positions avoidd bank access conflict and thus improves performance.
+// Cache the calculated positions in shared memory to reduce register usage, which improves performance.
+// The index is padded by 1 for every 16 positions to avoid shared memory bank conflicts.
 #define offset_pos_index(index) (index + ((index) >> 4))
-
-// shared memory to hold calculated positions, this would reduce register usage thus improving performance.
-// 64 is the number of threads in the local wg
-shared ivec3 pos_shared[offset_pos_index(64)];
+shared ivec3 pos_shared[offset_pos_index(LOCAL_WG_SIZE)];
 
 /*
  * Computes a depthwise convolution. Each shader invocation calculates the
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl
@@ -43,11 +43,9 @@ layout(push_constant) uniform restrict Block {
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-// macro to offset shared memory access index. Padding position index by 1 offset per 16 positions avoidd bank access conflict and thus improves performance.
+// Cache the calculated positions in shared memory to reduce register usage, which improves performance.
+// The index is padded by 1 for every 16 positions to avoid shared memory bank conflicts.
 #define offset_pos_index(index) (index + ((index) >> 4))
-
-// shared memory to hold calculated positions, this would reduce register usage thus improving performance.
-// 64 is the number of threads in the local wg
 shared ivec3 pos_shared[offset_pos_index(LOCAL_WG_SIZE * TILE_SIZE_X * TILE_SIZE_Y)];
 
 /*
diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -452,13 +452,13 @@ void add_conv2d_node(
       {{out, vkapi::MemoryAccessType::WRITE},
        {{in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}},
       // Shader params buffers
-      std::move(param_buffers),
+      param_buffers,
       // Specialization Constants
       {},
       // Resizing Logic
       resize_conv2d_node,
       {weight_data, stride, padding, dilation, transposed, output_padding},
-      std::move(push_constants)));
+      push_constants));
 }
 
 void add_conv1d_node(