Commit 3f57577

trivedivivek authored and lucylq committed
Converting all uint16 to int in quantized mat mul shader to improve perf.
Differential Revision: D84777696
Pull Request resolved: #15193
1 parent 419727c commit 3f57577
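
The shader change follows one pattern throughout: uint16_t index arithmetic and u16vec* texel coordinates become plain 32-bit int and ivec* (the #extension GL_EXT_control_flow_attributes requirement is removed as well), presumably because 16-bit index math can require extra conversions on some GPUs. As a rough illustration of the resulting indexing style, here is a minimal, self-contained compute-shader sketch; the bindings, tensor names (t_in, t_out, out_sizes, in_sizes) and the divup4 helper are stand-ins for this example, not the real linear_qcsnw_tiled.glsl kernel.

#version 450
// Illustrative sketch only: shows the int/ivec indexing style adopted by this
// commit. Bindings, names and the divup4 helper are assumptions for the
// example, not the actual kernel.

layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

layout(set = 0, binding = 0) uniform sampler3D t_in;   // input tensor as a 3D texture
layout(set = 0, binding = 1, std430) buffer OutBuf { vec4 data[]; } t_out;
layout(push_constant) uniform Sizes {
  ivec4 out_sizes;
  ivec4 in_sizes;
};

int divup4(int x) { return (x + 3) / 4; }

void main() {
  // All index math in 32-bit int, casting the uint invocation ID once.
  const int global_wg_x = divup4(out_sizes.x);
  const int out_txcol = int(gl_GlobalInvocationID.x) % global_wg_x;
  const int out_row = int(gl_GlobalInvocationID.x) / global_wg_x;

  if (out_row >= out_sizes.y) {
    return;
  }

  vec4 acc = vec4(0.0);
  for (int pos = 0, txpos = 0; pos < in_sizes.x; pos += 4, ++txpos) {
    // Texel coordinates are ivec3 rather than u16vec3.
    acc += texelFetch(t_in, ivec3(txpos, out_row, 0), 0);
  }
  t_out.data[out_row * global_wg_x + out_txcol] = acc;
}

The real shader keeps its TILE_ROWS / TILE_TXCOLS tiling and its Python-template $if branches; the hunks below mostly just swap the index and coordinate types.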

File tree: 3 files changed (+27, -23 lines)


backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.glsl

Lines changed: 14 additions & 18 deletions
@@ -21,8 +21,6 @@ ${define_required_extensions(DTYPE)}
 $if WEIGHT_STORAGE == "buffer":
   ${define_required_extensions("int8")}
 
-#extension GL_EXT_control_flow_attributes : require
-
 layout(std430) buffer;
 
 ${layout_declare_tensor(B, "w", "t_out", DTYPE, OUT_STORAGE, is_scalar_array=False)}
@@ -49,20 +47,18 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 void main() {
   // txcol stands for "texel column". One txcol corresponds to 4 scalar columns.
   $if TILE_TXCOLS > 1:
-    const uint16_t global_wg_x = uint16_t(divup(out_sizes.x, 4 * TILE_TXCOLS));
-    const uint16_t out_txcol = uint16_t(
-        (gl_GlobalInvocationID.x % global_wg_x) * TILE_TXCOLS);
+    const int global_wg_x = divup(out_sizes.x, 4 * TILE_TXCOLS);
+    const int out_txcol = (int(gl_GlobalInvocationID.x) % global_wg_x) * TILE_TXCOLS;
   $else:
-    const uint16_t global_wg_x = uint16_t(divup4(out_sizes.x));
-    const uint16_t out_txcol = uint16_t(gl_GlobalInvocationID.x % global_wg_x);
+    const int global_wg_x = divup4(out_sizes.x);
+    const int out_txcol = int(gl_GlobalInvocationID.x) % global_wg_x;
 
-  const uint16_t out_row = uint16_t(
-      (gl_GlobalInvocationID.x / global_wg_x) * TILE_ROWS);
+  const int out_row = (int(gl_GlobalInvocationID.x) / global_wg_x) * TILE_ROWS;
 
   $if QUANT_NBITS == 4:
-    const uint16_t weight_txcol = uint16_t(out_txcol / 2);
+    const int weight_txcol = out_txcol / 2;
 
-  if (out_row >= uint16_t(out_sizes.y)) {
+  if (out_row >= int(out_sizes.y)) {
     return;
   }
 
@@ -73,9 +69,9 @@ void main() {
       sums[r][${c}] = VEC4_T(0.0);
   }
 
-  for (uint16_t pos = uint16_t(0), txpos = uint16_t(0);
-       pos < uint16_t(in_sizes.x);
-       pos += uint16_t(4), txpos += uint16_t(1)) {
+  for (int pos = 0, txpos = 0;
+       pos < in_sizes.x;
+       pos += 4, txpos += 1) {
 
     T mat1[TILE_ROWS][4];
 
@@ -91,7 +87,7 @@ void main() {
       mat1[i][2] = tmp.z;
       mat1[i][3] = tmp.w;
     $else:
-      VEC4_T tmp = VEC4_T(texelFetch(t_in, u16vec3(txpos, out_row + i, 0), 0));
+      VEC4_T tmp = VEC4_T(texelFetch(t_in, ivec3(txpos, out_row + i, 0), 0));
       mat1[i][0] = tmp.x;
       mat1[i][1] = tmp.y;
       mat1[i][2] = tmp.z;
@@ -117,7 +113,7 @@ void main() {
         packed_weight_tex = t_weight[qmat2_bufi + ${c}]
       $else:
         packed_weight_tex = texelFetch(
-            t_weight, u16vec2(weight_txcol + ${c}, pos + r), 0);
+            t_weight, ivec2(weight_txcol + ${c}, pos + r), 0);
 
       qmat2[${c}] = (VEC4_T(packed_weight_tex >> 4) - 8.0);
       qmat2[${c + 1}] = (VEC4_T(packed_weight_tex & 0x0F) - 8.0);
@@ -128,7 +124,7 @@ void main() {
        qmat2[${c}] = t_weight[qmat2_bufi + ${c}];
       $else:
         qmat2[${c}] = VEC4_T(
-            texelFetch(t_weight, u16vec2(out_txcol + ${c}, pos + r), 0));
+            texelFetch(t_weight, ivec2(out_txcol + ${c}, pos + r), 0));
 
     for (int tr = 0; tr < TILE_ROWS; ++tr) {
       $for c in range(TILE_TXCOLS):
@@ -143,7 +139,7 @@ void main() {
     scales[${c}] = VEC4_T(t_scales[out_txcol + ${c}]);
   $else:
     scales[${c}] = VEC4_T(
-        texelFetch(t_scales, u16vec2(out_txcol + ${c}, 0), 0));
+        texelFetch(t_scales, ivec2(out_txcol + ${c}, 0), 0));
 
   // Store to output tensor
   $if OUT_STORAGE == "buffer":

extension/llm/runner/text_llm_runner.cpp

Lines changed: 8 additions & 4 deletions
@@ -98,7 +98,10 @@ Error TextLLMRunner::generate(
   std::function<void(const std::string&)> wrapped_callback =
       [token_callback, config](const std::string& piece) {
         if (!config.warming) {
+          // llm::safe_printf("\033[32m");
           llm::safe_printf(piece.c_str());
+          // llm::safe_printf("\033[0m\n");
+          // \033[32mThis text is green.\033[0m\n
           fflush(stdout);
         }
         if (token_callback) {
@@ -169,6 +172,11 @@ Error TextLLMRunner::generate(
   stats_->first_token_ms = time_in_ms();
   stats_->prompt_eval_end_ms = time_in_ms();
 
+  RUNNER_ET_LOG(
+      config.warming,
+      "RSS after prompt prefill: %f MiB (0 if unsupported)",
+      get_rss_bytes() / 1024.0 / 1024.0);
+
   // print the first token from prefill. No prev_token so use cur_token for it.
   auto decode_result = tokenizer_->decode(cur_token, cur_token);
   if (!decode_result.ok()) {
@@ -179,10 +187,6 @@ Error TextLLMRunner::generate(
     return ::executorch::runtime::Error::InvalidArgument;
   }
   wrapped_callback(std::move(*decode_result));
-  RUNNER_ET_LOG(
-      config.warming,
-      "RSS after prompt prefill: %f MiB (0 if unsupported)",
-      get_rss_bytes() / 1024.0 / 1024.0);
 
   // start the main loop
   prompt_tokens.push_back(cur_token);

extension/llm/runner/text_token_generator.h

Lines changed: 5 additions & 1 deletion
@@ -128,9 +128,13 @@ class ET_EXPERIMENTAL TextTokenGenerator {
       if (eos_ids_->find(cur_token) != eos_ids_->end()) {
         printf("\n");
         ET_LOG(Info, "\nReached to the end of generation");
-        break;
+        return pos - start_pos;
       }
     }
+    ET_LOG(
+        Info,
+        "\nFinished generation. Generated %" PRIi32 " tokens.",
+        start_pos + max_new_tokens);
     return pos - start_pos;
   }
 

0 commit comments
