
Commit 5746ba9

[webgpu] Add back missing code comments for flash decoding (microsoft#25879)
Restore accidentally removed comments when using WGSL template.
1 parent 69ec7b1 commit 5746ba9

File tree

3 files changed: +61 −0 lines changed


onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_qkt.wgsl.template

Lines changed: 23 additions & 0 deletions

@@ -6,6 +6,29 @@
 #param tile_size_k_vec
 #param sub_tile_count
 
+// Note that this shader adopts a similar algorithm to the dp4a generation shader.
+//
+// This algorithm computes the dot products of keys with queries in parallel,
+// processing the k (head_size) dimension at each step amongst tile_size_k_vec
+// threads, and utilizing the remaining threads in the workgroup to process
+// additional rows of |present_key| in parallel (so that the values in shared
+// memory (tile_q) for |q| can be reused). For each load of q, the
+// tile_size_k_vec threads also reload |present_key| tile_size/sub_tile_count
+// times to compute partial dot products of other |present_key| rows, in order
+// to complete all tile_size |present_key| rows in this workgroup while also
+// reusing the loaded in-register values of |q|.
+
+// 1. Each workgroup processes one row of |q| and tile_size rows of |present_key|
+//
+// 2. Computation Process:
+//    - Reads a [tile_size][tile_size_k_vec] block of |present_key| data at a
+//      time
+//    - Each thread within the workgroup computes dot products of 4 A*B elements,
+//      since each k represents 4 elements of |present_key|
+//    - Stores intermediate results in shared memory (inner_qk_values)
+//    - Iterates through the columns (head_size_vec), accumulating results in
+//      inner_qk_values
+//    - Performs a final reduction sum over inner_qk_values for the output
 var<workgroup> tile_q: array<q_value_t, tile_size_k_vec>;
 var<workgroup> inner_qk_values: array<array<q_element_t, tile_size_k_vec>, tile_size>;
 var<workgroup> tile_qk: array<q_element_t, tile_size>;
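The restored comments above can be illustrated with a scalar sketch. This is a hypothetical Python helper (`decode_qkt_tile` is not part of the codebase), not the actual WGSL; the sequential loops stand in for threads that run in parallel on the GPU:

```python
# Scalar sketch of the QKT tiling: one workgroup computes the dot products of
# a single |q| row against tile_size rows of |present_key|. Partial sums are
# kept per (row, "thread") like the inner_qk_values shared array, then reduced.
def decode_qkt_tile(q, present_key, tile_size_k_vec=4):
    tile_size = len(present_key)
    head_size = len(q)
    assert head_size % 4 == 0
    head_size_vec = head_size // 4          # each "k" holds 4 elements
    # Shared-memory analogue: per-row, per-thread partial sums.
    inner_qk_values = [[0.0] * tile_size_k_vec for _ in range(tile_size)]
    # Walk the head_size_vec columns, tile_size_k_vec at a time.
    for base in range(0, head_size_vec, tile_size_k_vec):
        for t in range(tile_size_k_vec):    # one "thread" per 4-element chunk
            k = base + t
            if k >= head_size_vec:
                break
            q_vec = q[4 * k:4 * k + 4]      # tile_q: loaded once, reused per row
            for row in range(tile_size):    # remaining threads cover the rows
                key_vec = present_key[row][4 * k:4 * k + 4]
                inner_qk_values[row][t] += sum(a * b for a, b in zip(q_vec, key_vec))
    # Final reduction sum over each row's partials -> tile_qk.
    return [sum(partials) for partials in inner_qk_values]
```

The result equals an ordinary matrix-vector product `present_key @ q`; the tiling only changes how the work is distributed.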

onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_split_vx.wgsl.template

Lines changed: 27 additions & 0 deletions

@@ -6,6 +6,33 @@
 #param tile_size_k_vec
 #param sub_tile_count
 
+// Note that this shader adopts a similar algorithm to the dp4a generation shader.
+//
+// This algorithm computes the dot products of v with qk in parallel, processing
+// the head_size dimension at each step amongst tile_size_k_vec threads, and
+// utilizing the remaining threads in the workgroup to process additional rows
+// of |present_value| in parallel (so that the values in shared memory (tile_qk)
+// for |qk| can be reused). The tile_size_k_vec threads also reload
+// |present_value| tile_size/sub_tile_count times to compute partial dot
+// products of other |present_value| rows, in order to complete all tile_size
+// |present_value| rows in this workgroup while also reusing the values in
+// tile_qk.
+//
+// The difference from FlashAttentionDecodeQKTProgram is that the dot products
+// go through the rows (total_sequence_length) of |present_value| instead of the
+// columns (head_size_vec), and each workgroup only calculates the current
+// tile_size's dot products instead of iterating over the whole
+// |total_sequence_length| row. That is why this is a split shader; the final
+// reduction is done in FlashAttentionDecodeReduceProgram.
+
+// TODO: Ideally, there should only be two shaders, FlashAttentionDecodeSplitVx
+// and FlashAttentionDecodeVxReduce, which could also reduce the intermediate
+// memory. FlashAttentionDecodeQKT could be merged into the split shader, with
+// the final softmax adjustment done in the reduce shader. However, when the
+// total sequence length exceeds some value, the result becomes garbage. Since
+// this can't be resolved in a short time, it is left as a TODO to fix in the
+// future.
+
 var<workgroup> tile_qk: array<present_value_element_t, tile_size>;
 var<workgroup> tile_output: array<present_value_value_t, head_size_vec>;
 var<workgroup> qkv_values: array<array<present_value_value_t, tile_size_k_vec>, sub_tile_count>;
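The split-then-reduce structure described above can be sketched in scalar Python. These are hypothetical helpers (`decode_split_vx` and `decode_vx_reduce` are illustration names, not the codebase's), and the loops model threads that run concurrently in the real shaders:

```python
# Scalar sketch of the split shader: each "workgroup" weights only its own
# tile_size rows of |present_value| by the matching qk values, producing one
# partial output per tile. A separate reduce step sums the partials, mirroring
# the split shader / reduce shader division of work.
def decode_split_vx(qk, present_value, tile_size):
    total_seq_len = len(present_value)
    head_size = len(present_value[0])
    partials = []
    # One "workgroup" per tile_size rows; no workgroup iterates the
    # whole total_sequence_length.
    for start in range(0, total_seq_len, tile_size):
        out = [0.0] * head_size
        for row in range(start, min(start + tile_size, total_seq_len)):
            weight = qk[row]                 # tile_qk value, reused per column
            for col in range(head_size):     # columns shared amongst threads
                out[col] += weight * present_value[row][col]
        partials.append(out)                 # intermediate split output
    return partials

def decode_vx_reduce(partials):
    # Final reduction across splits, per output column.
    head_size = len(partials[0])
    return [sum(p[col] for p in partials) for col in range(head_size)]
```

Composing the two gives the same result as the full weighted sum over all of total_sequence_length; the split only bounds how much each workgroup touches.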

onnxruntime/contrib_ops/webgpu/bert/flash_attention_decode_vx_reduce.wgsl.template

Lines changed: 11 additions & 0 deletions

@@ -3,6 +3,17 @@
 
 #param tile_size
 
+// Inputs are splits of the GQA output, split into num_total_seq_length_tiles
+// rows. This shader sums these splits across the row dimension to arrive at
+// the final result; each row is head_size columns wide. The reduction achieves
+// maximum parallelism by first splitting the task into tile_size columns that
+// each workgroup is responsible for. Then, within each workgroup, the summation
+// over num_total_seq_length_tiles for those tile_size columns is further split
+// in two ways: first across the row dimension, giving WORKGROUP_SIZE/TILE_SIZE
+// parallel summations of TILE_SIZE rows each, and then across the column
+// dimension, where each thread is responsible for one column of the TILE_SIZE
+// columns the workgroup is responsible for.
+
 var<workgroup> tile_input: array<array<output_value_t, tile_size>, tile_size>;
 
 $MAIN {
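The two-way split of the reduction can be sketched as follows. This is a hypothetical scalar helper (`vx_reduce_workgroup` is an illustration name), with sequential loops standing in for the parallel threads of one workgroup:

```python
# Scalar sketch of the reduce shader's two-stage sum: one "workgroup" owns
# tile_size output columns starting at col_base and must sum all split rows
# for them.
def vx_reduce_workgroup(splits, col_base, tile_size, workgroup_size):
    rows_per_pass = workgroup_size // tile_size
    num_splits = len(splits)
    # Stage 1: WORKGROUP_SIZE/TILE_SIZE "thread rows" each sum a strided
    # subset of the split rows into the tile_input shared-memory analogue.
    tile_input = [[0.0] * tile_size for _ in range(rows_per_pass)]
    for r in range(rows_per_pass):
        for s in range(r, num_splits, rows_per_pass):
            for c in range(tile_size):
                tile_input[r][c] += splits[s][col_base + c]
    # Stage 2: one thread per column sums the stage-1 partial rows.
    return [sum(tile_input[r][c] for r in range(rows_per_pass))
            for c in range(tile_size)]
```

Every split row lands in exactly one stage-1 stride, so the two stages together equal a plain column-wise sum over all splits.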
