[ET-VK][ez] Fix 8 bit linear compute shader dispatch

SS-JIA · SS-JIA · commit a41a618fd0b5 · 2025-03-26T09:00:43.000-07:00
Pull Request resolved: #9531 ## Context Currently, for the `q_8w_linear` shader, both the texture and the buffer variants use the same global work group and local work group setting. Specifically, the global work group is set to `{out.numel(), 1, 1}` and the local work group is set to `{64, 1, 1}`. However, I believe this results in very poor memory re-use for the texture shader. In this configuration: * Within a work group, each invocation will be requesting a different row of A - 64 rows of A requested in total * All invocations in the work group will be requesting the same row of B * One work group will load 65 unique rows from A and B Compare this to a local work group size of `{8, 8, 1}`: * Across the work group, 8 rows will be loaded from A and 8 rows will be loaded from B * One work group will load 16 unique rows total from A and B Evidently, there is better memory re-use in the latter work group configuration, as fewer unique rows are loaded. ## Changes Modify the `q_8w_linear` shader to use a `{8, 8, 1}` local work group size if possible. If `M` is small, then instead use `{4, 16, 1}` or `{2, 32, 1}` to reduce the number of inactive invocations. ghstack-source-id: 274198011 @exported-using-ghexport Differential Revision: [D71706489](https://our.internmc.facebook.com/intern/diff/D71706489/)
diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl
@@ -90,9 +90,10 @@ void main() {
 
 void main() {
   const u16vec2 out_pos = u16vec2(
-    gl_GlobalInvocationID.x / out_limits.y,
-    gl_GlobalInvocationID.x % out_limits.y);
-  if (out_pos.x >= out_limits.x) {
+    gl_GlobalInvocationID.x,
+    gl_GlobalInvocationID.y);
+
+  if (out_pos.x >= out_limits.x || out_pos.y >= out_limits.y) {
     return;
   }
 
diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp
@@ -114,15 +114,37 @@ void add_q_8w_linear_node(
          graph.sizes_ubo(mat1_W_packed)});
   }
 
-  // set global work group size to be 1 dimensional
-  const utils::uvec3 wg_size = {
-      static_cast<uint32_t>(graph.numel_of(out_W_packed)), 1, 1};
+  utils::uvec3 global_wg;
+  if (graph.is_buffer_storage(out)) {
+    global_wg = {static_cast<uint32_t>(graph.numel_of(out_W_packed)), 1, 1};
+  } else {
+    global_wg = graph.logical_limits_of(out_W_packed);
+  }
+
+  utils::uvec3 local_wg{8, 8, 1};
+  int32_t out_W = graph.size_at<int32_t>(-1, out_W_packed);
+
+  if (graph.is_buffer_storage(out_W_packed)) {
+    local_wg[0] = 64;
+    local_wg[1] = 1;
+    local_wg[2] = 1;
+  } else {
+    if (out_W % 8 != 0) {
+      if (out_W % 4 == 0) {
+        local_wg[0] = 4;
+        local_wg[1] = 16;
+      } else {
+        local_wg[0] = 2;
+        local_wg[1] = 32;
+      }
+    }
+  }
 
   graph.execute_nodes().emplace_back(new DispatchNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      wg_size,
-      graph.create_local_wg_size(wg_size),
+      global_wg,
+      local_wg,
       // Inputs and Outputs
       {{out_W_packed, vkapi::MemoryAccessType::WRITE},
        {{mat1_W_packed, q_mat2, scales}, vkapi::MemoryAccessType::READ}},