From 7e168c54598618316b008eb53e1d524673618fec Mon Sep 17 00:00:00 2001
From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com>
Date: Sun, 27 Apr 2025 20:43:37 -0700
Subject: [PATCH 1/5] [ET-VK] Modify quantized linear tiling shader to linearly
 dispatch work to improve thread occupancy and performance.

This diff changes the tiled 8-bit quantized linear matmul op to dispatch work linearly, which increases thread occupancy and improves performance.

Differential Revision: [D73751979](https://our.internmc.facebook.com/intern/diff/D73751979/)

[ghstack-poisoned]
---
 .../vulkan/runtime/graph/ops/glsl/q_8w_linear_tiled.glsl | 9 ++++++---
 .../vulkan/runtime/graph/ops/glsl/q_8w_linear_tiled.yaml | 4 ++--
 .../runtime/graph/ops/impl/QuantizedLinearInt8.cpp       | 8 ++++++--
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_tiled.glsl
index 8a8670b4bb3..5a345fe6599 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_tiled.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_tiled.glsl
@@ -36,13 +36,16 @@ layout(push_constant) uniform restrict Block {
   ivec4 weight_sizes;
 };
 
+#include "indexing_utils.h"
+
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
 void main() {
-  const uint out_row = gl_GlobalInvocationID.y * TILE_ROWS;
-  const uint out_col = gl_GlobalInvocationID.x << 2;
+  const uint out_size_x_div_4 = divup4(out_sizes.x);
+  const uint out_col = (gl_GlobalInvocationID.x % out_size_x_div_4) << 2;
+  const uint out_row = (gl_GlobalInvocationID.x / out_size_x_div_4) * TILE_ROWS;
 
-  if (out_col >= out_sizes.x || out_row >= out_sizes.y) {
+  if (out_row >= out_sizes.y) {
     return;
   }
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_tiled.yaml
index 1e8a5e1fe7d..941836b48c4 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_tiled.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_tiled.yaml
@@ -16,10 +16,10 @@ q_8w_linear_tiled:
     TILE_ROWS:
       - VALUE: 1
         SUFFIX: o4x1
+      - VALUE: 2
+        SUFFIX: o4x2
       - VALUE: 4
         SUFFIX: o4x4
-      - VALUE: 6
-        SUFFIX: o4x6
   shader_variants:
     - NAME: q_8w_linear_tiled_texture3d_texture3d_texture2d_texture2d_float
     - NAME: q_8w_linear_tiled_buffer_buffer_texture2d_texture2d_float
diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearInt8.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearInt8.cpp
index 4a10f469be0..50a9828a0f8 100644
--- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearInt8.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearInt8.cpp
@@ -182,8 +182,8 @@ void add_q_8w_linear_tiled_node(
   const int64_t M = utils::val_at(-2, mat1_sizes);
   int out_tile_nrows = 4;
   if (M % 6 == 0) {
-    kernel_name += "_o4x6";
-    out_tile_nrows = 6;
+    kernel_name += "_o4x2";
+    out_tile_nrows = 2;
   } else if (M % 4 == 0) {
     kernel_name += "_o4x4";
     out_tile_nrows = 4;
@@ -197,6 +197,10 @@ void add_q_8w_linear_tiled_node(
 
   utils::uvec3 global_wg_size = graph.logical_limits_of(out);
   global_wg_size[1] = global_wg_size[1] / out_tile_nrows;
+  if (!use_coop_algorithm) {
+    global_wg_size[0] *= global_wg_size[1];
+    global_wg_size[1] = 1;
+  }
 
   utils::uvec3 local_wg_size{64, 1, 1};
   if (use_coop_algorithm) {

From 16732ccc88c44f33ca7e77091ec679035dce3870 Mon Sep 17 00:00:00 2001
From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com>
Date: Mon, 28 Apr 2025 06:52:54 -0700
Subject: [PATCH 2/5] Update on "[ET-VK] Modify quantized linear tiling shader
 to linearly dispatch work to improve thread occupancy and performance."

This diff changes the tiled 8-bit quantized linear matmul op to dispatch work linearly, which increases thread occupancy and improves performance.

Differential Revision: [D73751979](https://our.internmc.facebook.com/intern/diff/D73751979/)

[ghstack-poisoned]

From 6bac0171ba367c36e266822925db6e74ea670725 Mon Sep 17 00:00:00 2001
From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com>
Date: Mon, 28 Apr 2025 10:52:20 -0700
Subject: [PATCH 3/5] Update on "[ET-VK] Modify quantized linear tiling shader
 to linearly dispatch work to improve thread occupancy and performance."

This diff changes the tiled 8-bit quantized linear matmul op to dispatch work linearly, which increases thread occupancy and improves performance.

Differential Revision: [D73751979](https://our.internmc.facebook.com/intern/diff/D73751979/)

[ghstack-poisoned]
---
 .../runtime/graph/ops/glsl/q_8w_linear_coop.glsl      |  9 ++++++---
 .../runtime/graph/ops/glsl/q_8w_linear_tiled.glsl     |  6 +++---
 .../runtime/graph/ops/impl/QuantizedLinearInt8.cpp    | 11 +++++------
 3 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_coop.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_coop.glsl
index c8ccbacffc1..3ad9e759910 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_coop.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_coop.glsl
@@ -38,18 +38,21 @@ layout(push_constant) uniform restrict Block {
   ivec4 weight_sizes;
 };
 
+#include "indexing_utils.h"
+
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
 shared VEC4_T partial_c[NGROUPS][NWORKERS][TILE_ROWS];
 
 void main() {
-  const uint out_row = gl_GlobalInvocationID.y * TILE_ROWS;
-  const uint out_col = gl_GlobalInvocationID.x << 2;
+  const uint out_width_ntexels = divup4(out_sizes.x);
+  const uint out_col = (gl_GlobalInvocationID.x % out_width_ntexels) << 2;
+  const uint out_row = (gl_GlobalInvocationID.x / out_width_ntexels) * TILE_ROWS;
 
   const int gid = int(gl_LocalInvocationID.x); // group id
   const int wid = int(gl_LocalInvocationID.z); // worker id
 
-  if (out_col >= out_sizes.x || out_row >= out_sizes.y) {
+  if (out_row >= out_sizes.y) {
     return;
   }
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_tiled.glsl
index 5a345fe6599..6d7995a77f0 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_tiled.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/q_8w_linear_tiled.glsl
@@ -41,9 +41,9 @@ layout(push_constant) uniform restrict Block {
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
 void main() {
-  const uint out_size_x_div_4 = divup4(out_sizes.x);
-  const uint out_col = (gl_GlobalInvocationID.x % out_size_x_div_4) << 2;
-  const uint out_row = (gl_GlobalInvocationID.x / out_size_x_div_4) * TILE_ROWS;
+  const uint out_width_ntexels = divup4(out_sizes.x);
+  const uint out_col = (gl_GlobalInvocationID.x % out_width_ntexels) << 2;
+  const uint out_row = (gl_GlobalInvocationID.x / out_width_ntexels) * TILE_ROWS;
 
   if (out_row >= out_sizes.y) {
     return;
diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearInt8.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearInt8.cpp
index 50a9828a0f8..f57ac1a042d 100644
--- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearInt8.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearInt8.cpp
@@ -195,12 +195,11 @@ void add_q_8w_linear_tiled_node(
     out_tile_nrows = 4;
   }
 
-  utils::uvec3 global_wg_size = graph.logical_limits_of(out);
-  global_wg_size[1] = global_wg_size[1] / out_tile_nrows;
-  if (!use_coop_algorithm) {
-    global_wg_size[0] *= global_wg_size[1];
-    global_wg_size[1] = 1;
-  }
+  utils::uvec3 out_limits = graph.logical_limits_of(out);
+  utils::uvec3 global_wg_size = {
+      out_limits[0] * (utils::div_up(out_limits, out_tile_nrows)),
+      1,
+      out_limit[2]};
 
   utils::uvec3 local_wg_size{64, 1, 1};
   if (use_coop_algorithm) {

From ef1ea857461cdaf18f632900d3b6a7d35023a578 Mon Sep 17 00:00:00 2001
From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com>
Date: Mon, 28 Apr 2025 11:07:43 -0700
Subject: [PATCH 4/5] Update on "[ET-VK] Modify quantized linear tiling shader
 to linearly dispatch work to improve thread occupancy and performance."

This diff changes the tiled 8-bit quantized linear matmul op to dispatch work linearly, which increases thread occupancy and improves performance.

Differential Revision: [D73751979](https://our.internmc.facebook.com/intern/diff/D73751979/)

[ghstack-poisoned]
---
 .../vulkan/runtime/graph/ops/impl/QuantizedLinearInt8.cpp   | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearInt8.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearInt8.cpp
index f57ac1a042d..d7156ebef90 100644
--- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearInt8.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearInt8.cpp
@@ -180,7 +180,7 @@ void add_q_8w_linear_tiled_node(
 
   std::vector<int64_t> mat1_sizes = graph.sizes_of(mat1);
   const int64_t M = utils::val_at(-2, mat1_sizes);
-  int out_tile_nrows = 4;
+  uint32_t out_tile_nrows = 4;
   if (M % 6 == 0) {
     kernel_name += "_o4x2";
     out_tile_nrows = 2;
@@ -197,9 +197,9 @@ void add_q_8w_linear_tiled_node(
 
   utils::uvec3 out_limits = graph.logical_limits_of(out);
   utils::uvec3 global_wg_size = {
-      out_limits[0] * (utils::div_up(out_limits, out_tile_nrows)),
+      out_limits[0] * (utils::div_up(out_limits[1], out_tile_nrows)),
       1,
-      out_limit[2]};
+      out_limits[2]};
 
   utils::uvec3 local_wg_size{64, 1, 1};
   if (use_coop_algorithm) {

From ab1de82c6bd08f19e667f6411f24fe3dcf40aaae Mon Sep 17 00:00:00 2001
From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com>
Date: Mon, 28 Apr 2025 12:29:46 -0700
Subject: [PATCH 5/5] Update on "[ET-VK] Modify quantized linear tiling shader
 to linearly dispatch work to improve thread occupancy and performance."

This diff changes the tiled 8-bit quantized linear matmul op to dispatch work linearly, which increases thread occupancy and improves performance.

Differential Revision: [D73751979](https://our.internmc.facebook.com/intern/diff/D73751979/)

[ghstack-poisoned]