Skip to content

Commit a9c2428

Browse files
committed
refactor: move tuning scripts to a new dir
build: add tuning for llama vsm; refactor: move deepseek tuning to new dir
1 parent 5a3edef commit a9c2428

File tree

5 files changed

+412
-90
lines changed

5 files changed

+412
-90
lines changed

lightllm/models/llama/triton_kernel/gqa_flash_decoding.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ def gqa_token_decode_attention_flash_decoding(
1212
from .gqa_flash_decoding_stage1 import flash_decode_stage1
1313
from .gqa_flash_decoding_stage2 import flash_decode_stage2
1414

15-
o_tensor = alloc_tensor_func(q.shape, q.dtype, q.device) if out is None else out
15+
o_tensor = alloc_tensor_func(q.shape, dtype=q.dtype, device=q.device) if out is None else out
1616

1717
mid_o = alloc_tensor_func(
1818
[batch_size, q_head_num, max_len_in_batch // BLOCK_SEQ + 1, head_dim], dtype=torch.float32, device="cuda"

lightllm/models/llama/triton_kernel/gqa_flash_decoding_vsm.py

Lines changed: 33 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -44,12 +44,12 @@ def try_to_get_best_config(
4444
return config
4545
else:
4646
config = {
47-
"BLOCK_N": 16,
47+
"BLOCK_N": 64,
4848
"BLOCK_Q_HEAD": 16,
4949
"stage1_num_warps": 4,
5050
"stage1_num_stages": 2,
5151
"stage2_num_warps": 4,
52-
"stage2_num_stages": 2,
52+
"stage2_num_stages": 1,
5353
}
5454
return config
5555

@@ -150,38 +150,45 @@ def _kernel_gqa_token_decode_attention_flash_decoding_vsm_stage1(
150150
mid_o_logexpsum: [q_head_num, total_seq_block_num]
151151
"""
152152
sm_id = tl.program_id(0).to(tl.int64)
153-
block_size = tl.load(block_size, eviction_policy="evict_last")
153+
block_size = tl.load(block_size)
154154

155155
out_batch_start_index = tl.cast(0, tl.int64)
156156
q_head_off = tl.arange(0, Q_HEAD_NUM)
157157
d_off = tl.arange(0, BLOCK_DMODEL)
158158

159-
for cur_batch in tl.range(0, batch_size, 1):
160-
cur_req_idx = tl.load(b_req_idx + cur_batch, eviction_policy="evict_last")
161-
cur_seq_len = tl.load(b_seq_len + cur_batch, eviction_policy="evict_last")
159+
for cur_batch in range(0, batch_size):
160+
cur_req_idx = tl.load(b_req_idx + cur_batch)
161+
cur_seq_len = tl.load(b_seq_len + cur_batch)
162162

163163
cur_num_of_blocks = tl.cdiv(cur_seq_len, block_size)
164164
cur_num_of_kv_head_pairs = cur_num_of_blocks * kv_head_num
165165

166-
loop_sm_id = sm_id
167-
while loop_sm_id < cur_num_of_kv_head_pairs:
168-
cur_block_idx = loop_sm_id // kv_head_num
169-
cur_kv_head_idx = loop_sm_id % kv_head_num
166+
# loop_sm_id = sm_id
167+
while sm_id < cur_num_of_kv_head_pairs:
168+
cur_block_idx = sm_id % cur_num_of_blocks
169+
cur_kv_head_idx = sm_id // cur_num_of_blocks
170+
# cur_block_idx = sm_id // kv_head_num
171+
# cur_kv_head_idx = sm_id % kv_head_num
170172

171-
cur_q_start = cur_kv_head_idx * gqa_group_size
172-
cur_q_range = cur_q_start + q_head_off
173+
cur_q_range = cur_kv_head_idx * gqa_group_size + q_head_off
173174
cur_q_mask = q_head_off < gqa_group_size
174-
q_off = cur_batch * stride_q_bs + cur_q_range[:, None] * stride_q_h + d_off[None, :]
175-
q_tensor = tl.load(q + q_off, mask=cur_q_mask[:, None], other=0.0) # shape: [Q_HEAD_NUM, BLOCK_DMODEL]
176175

177176
cur_kv_start = cur_block_idx * block_size
178-
cur_kv_end = tl.minimum(cur_kv_start + block_size, cur_seq_len)
177+
178+
q_off = cur_batch * stride_q_bs + cur_q_range[:, None] * stride_q_h + d_off[None, :]
179+
q_tensor = tl.load(
180+
q + q_off,
181+
mask=cur_q_mask[:, None],
182+
other=0.0,
183+
) # shape: [Q_HEAD_NUM, BLOCK_DMODEL]
179184

180185
sum_exp = tl.zeros([Q_HEAD_NUM], dtype=tl.float32)
181186
max_exp = tl.zeros([Q_HEAD_NUM], dtype=tl.float32) - float("inf")
182187
accumu = tl.zeros([Q_HEAD_NUM, BLOCK_DMODEL], dtype=tl.float32)
183188

184-
for chunk_idx in tl.range(0, tl.cdiv(cur_kv_end - cur_kv_start, BLOCK_N), 1, num_stages=NUM_STAGES):
189+
cur_total_chunk = tl.cdiv(tl.minimum(cur_kv_start + block_size, cur_seq_len) - cur_kv_start, BLOCK_N)
190+
191+
for chunk_idx in tl.range(0, cur_total_chunk, 1, num_stages=NUM_STAGES):
185192
cur_chunk_start = cur_kv_start + chunk_idx * BLOCK_N
186193
cur_chunk_range = cur_chunk_start + tl.arange(0, BLOCK_N)
187194
cur_chunk_mask = cur_chunk_range < cur_seq_len
@@ -196,10 +203,10 @@ def _kernel_gqa_token_decode_attention_flash_decoding_vsm_stage1(
196203
k_off = (
197204
cur_kv_loc[None, :] * stride_k_bs + cur_kv_head_idx * stride_k_h + d_off[:, None]
198205
) # shape: [BLOCK_DMODEL, BLOCK_N]
206+
v_off = cur_kv_loc[:, None] * stride_v_bs + cur_kv_head_idx * stride_v_h + d_off[None, :]
199207
k_tensor = tl.load(k + k_off, mask=cur_chunk_mask[None, :], other=0.0)
208+
200209
att_tensor = tl.dot(q_tensor, k_tensor) # shape: [Q_HEAD_NUM, BLOCK_N]
201-
v_off = cur_kv_loc[:, None] * stride_v_bs + cur_kv_head_idx * stride_v_h + d_off[None, :]
202-
v_tensor = tl.load(v + v_off, mask=cur_chunk_mask[:, None], other=0.0) # shape: [BLOCK_N, BLOCK_DMODEL]
203210
att_tensor *= softmax_scale
204211
att_tensor = tl.where(cur_chunk_mask[None, :], att_tensor, float("-inf"))
205212

@@ -209,7 +216,8 @@ def _kernel_gqa_token_decode_attention_flash_decoding_vsm_stage1(
209216
exp_logic = tl.exp(att_tensor - new_max[:, None])
210217
log_scale = tl.exp(max_exp - new_max)
211218
accumu *= log_scale[:, None]
212-
accumu += tl.dot(exp_logic, v_tensor.to(accumu.dtype))
219+
v_tensor = tl.load(v + v_off, mask=cur_chunk_mask[:, None], other=0.0) # shape: [BLOCK_N, BLOCK_DMODEL]
220+
accumu += tl.dot(exp_logic.to(v_tensor.dtype), v_tensor)
213221

214222
sum_exp = sum_exp * log_scale + tl.sum(exp_logic, axis=1)
215223
max_exp = new_max
@@ -223,12 +231,14 @@ def _kernel_gqa_token_decode_attention_flash_decoding_vsm_stage1(
223231
cur_q_range * stride_mid_o_logexpsum_h
224232
+ (out_batch_start_index + cur_block_idx) * stride_mid_o_logexpsum_seq
225233
)
234+
max_exp = max_exp + tl.log(sum_exp)
226235
tl.store(
227236
mid_o_logexpsum + off_mid_o_logexpsum,
228-
max_exp + tl.log(sum_exp),
237+
max_exp,
229238
mask=cur_q_mask,
230239
)
231-
loop_sm_id += num_sm
240+
sm_id += num_sm
241+
sm_id -= cur_num_of_kv_head_pairs
232242
out_batch_start_index += cur_num_of_blocks
233243

234244

@@ -276,7 +286,7 @@ def gqa_token_decode_attention_flash_decoding_vsm_stage1(
276286
*mid_o.stride(),
277287
*mid_o_logexpsum.stride(),
278288
BLOCK_N=run_config["BLOCK_N"],
279-
Q_HEAD_NUM=max(run_config["BLOCK_Q_HEAD"], triton.next_power_of_2(q_head_num)),
289+
Q_HEAD_NUM=max(16, triton.next_power_of_2(gqa_group_size)),
280290
BLOCK_DMODEL=q.shape[-1],
281291
NUM_STAGES=run_config["stage1_num_stages"],
282292
num_stages=run_config["stage1_num_stages"],
@@ -424,7 +434,7 @@ def gqa_token_decode_attention_flash_decoding_vsm(
424434
out_dtype=q.dtype,
425435
)
426436

427-
if not out:
437+
if out is None:
428438
out = alloc_tensor_func(q.shape, dtype=q.dtype, device=q.device)
429439

430440
num_vsm = emstimate_stage1_vsm(

test/kernel/alignment/llama_gqa_decode_vsm.py

Lines changed: 69 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import unittest
22
import random
33
import torch
4+
from tqdm import tqdm
45
from lightllm.common.basemodel.infer_struct import InferStateInfo
56
from lightllm.common.req_manager import ReqManager
67
from lightllm.models.llama.triton_kernel.gqa_flash_decoding_vsm import (
@@ -20,81 +21,83 @@ def test_vsm_gqa_decoding_align(self):
2021
torch.backends.cudnn.deterministic = True
2122
torch.backends.cudnn.benchmark = False
2223

23-
bs_list = range(1, 40)
24+
bs_list = [1, 8, 16, 32, 64, 128, 256]
2425
group_size_list = [16, 32, 64]
25-
seq_len_list = [128, 512, 1024, 2048]
26+
seq_len_list = [128, 512, 1024, 2048, 4096, 8192]
2627
q_head_dim_list = [64, 128]
27-
q_head_num_list = [16, 32, 64]
28+
q_head_num_list = [8, 16, 32]
2829

29-
for bs in bs_list:
30-
for group_size in group_size_list:
31-
for seq_len_m in seq_len_list:
32-
for q_head_dim in q_head_dim_list:
33-
for q_head_num in q_head_num_list:
34-
if q_head_num < group_size:
35-
continue
36-
kv_head_num = q_head_num // group_size
37-
q_head_dim = q_head_dim
38-
kv_head_dim = q_head_dim
39-
seq_len = (torch.zeros(bs, dtype=torch.int32) + seq_len_m).to(torch.int32)
40-
total_token_in_the_batch = seq_len.sum().item()
41-
rounded_total_token_in_the_batch = (total_token_in_the_batch + 128 - 1) // 128 * 128
30+
def get_test_configs():
31+
for bs in bs_list:
32+
for group_size in group_size_list:
33+
for seq_len_m in seq_len_list:
34+
for q_head_dim in q_head_dim_list:
35+
for q_head_num in q_head_num_list:
36+
if q_head_num < group_size:
37+
continue
38+
yield bs, group_size, seq_len_m, q_head_dim, q_head_num
4239

43-
q_shape = [bs, q_head_num, q_head_dim]
44-
kv_shape = [
45-
rounded_total_token_in_the_batch,
46-
kv_head_num,
47-
kv_head_dim,
48-
]
49-
qkv_dtype = torch.float16
40+
for bs, group_size, seq_len_m, q_head_dim, q_head_num in tqdm(list(get_test_configs())):
41+
kv_head_num = q_head_num // group_size
42+
q_head_dim = q_head_dim
43+
kv_head_dim = q_head_dim
44+
seq_len = (torch.zeros(bs, dtype=torch.int32) + seq_len_m).to(torch.int32)
45+
total_token_in_the_batch = seq_len.sum().item()
46+
rounded_total_token_in_the_batch = (total_token_in_the_batch + 128 - 1) // 128 * 128
5047

51-
q, k, v = (
52-
torch.randn(q_shape, dtype=qkv_dtype, device="cuda"),
53-
torch.randn(kv_shape, dtype=qkv_dtype, device="cuda"),
54-
torch.randn(kv_shape, dtype=qkv_dtype, device="cuda"),
55-
)
56-
q, k, v = q / 10, k / 10, v / 10
48+
q_shape = [bs, q_head_num, q_head_dim]
49+
kv_shape = [
50+
rounded_total_token_in_the_batch,
51+
kv_head_num,
52+
kv_head_dim,
53+
]
54+
qkv_dtype = torch.float16
5755

58-
req_to_token_index = torch.zeros((bs, 2048)) - 1
59-
token_index = torch.arange(rounded_total_token_in_the_batch)
56+
q, k, v = (
57+
torch.randn(q_shape, dtype=qkv_dtype, device="cuda"),
58+
torch.randn(kv_shape, dtype=qkv_dtype, device="cuda"),
59+
torch.randn(kv_shape, dtype=qkv_dtype, device="cuda"),
60+
)
61+
q, k, v = q / 10, k / 10, v / 10
6062

61-
total_count = 0
62-
for i in range(bs):
63-
req_to_token_index[i, : seq_len[i]] = token_index[
64-
total_count : total_count + seq_len[i]
65-
]
66-
total_count += seq_len[i]
63+
req_to_token_index = torch.zeros((bs, seq_len_m)) - 1
64+
token_index = torch.arange(rounded_total_token_in_the_batch)
6765

68-
req_to_token_index = req_to_token_index.long().cuda()
66+
total_count = 0
67+
for i in range(bs):
68+
req_to_token_index[i, : seq_len[i]] = token_index[total_count : total_count + seq_len[i]]
69+
total_count += seq_len[i]
6970

70-
b_req_idx = torch.arange(bs, device="cuda")
71-
infer_state = InferStateInfo()
72-
infer_state.req_manager = ReqManager(bs, 2048, None)
73-
infer_state.req_manager.req_to_token_indexs = req_to_token_index
74-
infer_state.b_req_idx = b_req_idx.cuda()
75-
infer_state.b_seq_len = seq_len.cuda()
76-
infer_state.max_len_in_batch = 2048
77-
infer_state.batch_size = bs
78-
infer_state.q_head_num = q_head_num
79-
infer_state.q_head_dim = q_head_dim
80-
infer_state.kv_head_num = kv_head_num
81-
infer_state.softmax_scale = 1 / (q_head_dim ** 0.5)
82-
infer_state.total_token_num = torch.tensor(
83-
[total_token_in_the_batch], dtype=torch.int32
84-
).cuda()
85-
new_out = gqa_token_decode_attention_flash_decoding_vsm(q, k, v, infer_state)
86-
old_out = gqa_token_decode_attention_flash_decoding(
87-
q,
88-
infer_state,
89-
infer_state.q_head_num,
90-
infer_state.q_head_dim,
91-
k,
92-
v,
93-
)
94-
cos_sim = (
95-
torch.nn.functional.cosine_similarity(new_out, old_out, dim=-1).mean().cpu().item()
96-
)
97-
self.assertGreaterEqual(cos_sim, 0.99)
71+
req_to_token_index = req_to_token_index.long().cuda()
72+
73+
b_req_idx = torch.arange(bs, device="cuda")
74+
infer_state = InferStateInfo()
75+
infer_state.req_manager = ReqManager(bs, 2048, None)
76+
infer_state.req_manager.req_to_token_indexs = req_to_token_index
77+
infer_state.b_req_idx = b_req_idx.cuda()
78+
infer_state.b_seq_len = seq_len.cuda()
79+
infer_state.max_len_in_batch = seq_len_m
80+
infer_state.batch_size = bs
81+
infer_state.q_head_num = q_head_num
82+
infer_state.q_head_dim = q_head_dim
83+
infer_state.kv_head_num = kv_head_num
84+
infer_state.softmax_scale = 1 / (q_head_dim ** 0.5)
85+
infer_state.total_token_num = torch.tensor([total_token_in_the_batch], dtype=torch.int32).cuda()
86+
new_out = gqa_token_decode_attention_flash_decoding_vsm(q, k, v, infer_state)
87+
old_out = gqa_token_decode_attention_flash_decoding(
88+
q,
89+
infer_state,
90+
infer_state.q_head_num,
91+
infer_state.q_head_dim,
92+
k,
93+
v,
94+
)
95+
cos_sim = torch.nn.functional.cosine_similarity(new_out, old_out, dim=-1).mean().cpu().item()
96+
self.assertGreaterEqual(
97+
cos_sim,
98+
0.9,
99+
f"bs={bs}, group_size={group_size}, seq_len={seq_len_m}, q_head_dim={q_head_dim}, q_head_num={q_head_num}",
100+
)
98101

99102

100103
if __name__ == "__main__":
File renamed without changes.

0 commit comments

Comments
 (0)