Commit 54f5ffe

pytorchbot and ssjia authored
[ET-VK] Improve q8 matmul by increasing TILE_N4 (#14610)
This PR was created by the merge bot to help merge the original PR into the main branch.
ghstack PR number: #14597 by @SS-JIA ^ Please use this as the source of truth for the PR details, comments, and reviews
ghstack PR base: https://github.com/pytorch/executorch/tree/gh/SS-JIA/331/base
ghstack PR head: https://github.com/pytorch/executorch/tree/gh/SS-JIA/331/head
Merge bot PR base: https://github.com/pytorch/executorch/tree/gh/SS-JIA/329/orig
Merge bot PR head: https://github.com/pytorch/executorch/tree/gh/SS-JIA/331/orig
Differential Revision: [D83253129](https://our.internmc.facebook.com/intern/diff/D83253129/)
@diff-train-skip-merge
Co-authored-by: ssjia <[email protected]>
1 parent 681680e commit 54f5ffe

File tree

3 files changed: +6 −2 lines changed


backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_int8_int8_compute.glslh

Lines changed: 1 addition & 1 deletion
@@ -75,7 +75,7 @@ void accumulate_out_tile_with_int_accum(
         input_zp_vec * weight_sums.data[n4] + accum.data[m][n4];
     out_tile.data[m][n4] =
-        fma(VEC4_T(accum_adjusted),
-            VEC4_T(input_q_scale * weight_scales.data[0]),
+        fma(VEC4_T(accum_adjusted),
+            VEC4_T(input_q_scale * weight_scales.data[n4]),
             out_tile.data[m][n4]);
   }
 }

backends/vulkan/runtime/graph/ops/glsl/linear_q8ta_q8csw_tiled.yaml

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ linear_q8ta_q8csw_tiled:
     PACKED_INT8_INPUT_STORAGE: buffer
     WEIGHT_STORAGE: texture2d
     TILE_M4: 1
-    TILE_N4: 1
+    TILE_N4: 2
     TILE_K4: 1
   generate_variant_forall:
     DTYPE:

backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp

Lines changed: 4 additions & 0 deletions
@@ -77,6 +77,10 @@ utils::uvec3 quantized_linear_global_wg_size(
     M_per_tile = 1;
   }
 
+  if (shader.kernel_name.find("q8ta_q8csw_tiled") != std::string::npos) {
+    N_per_tile = 8;
+  }
+
   const uint32_t num_N_tiles = utils::div_up(N, N_per_tile);
   const uint32_t num_M_tiles = utils::div_up(M, M_per_tile);
