 import torch
 import triton
 import triton.language as tl
+from typing import Optional
+from lightllm.common.kernel_config import KernelConfigs
+from frozendict import frozendict
+from functools import lru_cache
+from typing import Dict
+
+
+class GQADiverseDecodeStage1KernelConfig(KernelConfigs):
+    kernel_name: str = "_fwd_kernel_flash_decode_diverse_stage1:v1"
+
+    @classmethod
+    @lru_cache(maxsize=200)
+    def try_to_get_best_config(
+        cls,
+        batch_size: int,
+        avg_seq_len_in_batch: int,
+        gqa_group_size: int,
+        q_head_dim: int,
+        block_seq: int,
+        out_dtype: str,
+    ) -> dict:
+        key_params = {
+            "gqa_group_size": gqa_group_size,
+            "q_head_dim": q_head_dim,
+            "block_seq": block_seq,
+            "out_dtype": str(out_dtype),
+        }
+        key_params = frozendict(key_params)
+
+        finded_config = cls.get_the_config(key_params)
+
+        if finded_config:
+            # Tuned configs are keyed by an avg-seq-len bucket first and a
+            # batch-size bucket second; pick the closest bucket at each level.
+            batch_size_config: dict = finded_config[
+                min(
+                    finded_config.keys(),
+                    key=lambda x: abs(int(x) - avg_seq_len_in_batch),
+                )
+            ]
+            config = batch_size_config[min(batch_size_config.keys(), key=lambda x: abs(int(x) - batch_size))]
+
+            return config
+        else:
+            config = {
+                "BLOCK_N": 16,
+                "num_warps": 2,
+                "num_stages": 2,
+            }
+            return config
+
+    @classmethod
+    def save_config(
+        cls,
+        gqa_group_size: int,
+        q_head_dim: int,
+        block_seq: int,
+        out_dtype: str,
+        config_json: Dict[int, Dict[int, Dict]],
+    ):
+        key_params = {
+            "gqa_group_size": gqa_group_size,
+            "q_head_dim": q_head_dim,
+            "block_seq": block_seq,
+            "out_dtype": str(out_dtype),
+        }
+        key_params = frozendict(key_params)
+
+        return cls.store_config(key_params, config_json)
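+
+
+# Illustrative note (values below are hypothetical, not taken from any tuned-config
+# store): save_config expects config_json nested as
+# {avg_seq_len_bucket: {batch_size_bucket: kernel_config}}, mirroring the two-level
+# nearest-bucket lookup in try_to_get_best_config, e.g.
+# {1024: {8: {"BLOCK_N": 32, "num_warps": 4, "num_stages": 3}}}.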
 
 
 @triton.jit
-def _fwd_kernel_flash_decode_stage1(
+def _fwd_kernel_flash_decode_diverse_stage1(
     Q,
     stride_qbs,
     stride_qh,
@@ -160,6 +227,7 @@ def flash_decode_stage1(
     mid_out_logsumexp: torch.Tensor,
     block_seq: int,
     max_batch_group_size: int,
+    run_config: Optional[dict] = None,
 ):
     """
     This kernel is a GQA operator customized for diverse generation, where b_mark_shared_group is a tensor of shape (batch_size,),
@@ -169,9 +237,27 @@ def flash_decode_stage1(
     Each non-zero position in b_mark_shared_group indicates how many preceding requests it forms a shared-prefix group with. Requests
     belonging to the same shared-prefix group necessarily hold identical values in the corresponding b_shared_seq_len.
     """
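+    # Illustrative reading of the description above (hypothetical values): with
+    # b_mark_shared_group = [0, 0, 2, 0], request 2 forms a shared-prefix group with
+    # the 2 preceding requests (requests 0-2), while request 3 stands alone; every
+    # request in that group holds the same value in b_shared_seq_len.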
+    if not run_config:
+        # Estimate the average sequence length in the batch; max_len_in_batch is
+        # currently used as the estimate both inside and outside CUDA graph capture.
+        if torch.cuda.is_current_stream_capturing():
+            avg_seq_len_in_batch = max_len_in_batch
+        else:
+            avg_seq_len_in_batch = max_len_in_batch
+
+        run_config = GQADiverseDecodeStage1KernelConfig.try_to_get_best_config(
+            batch_size=int(q.shape[0]),
+            avg_seq_len_in_batch=avg_seq_len_in_batch,
+            gqa_group_size=int(q.shape[1] // k.shape[1]),
+            q_head_dim=int(q.shape[2]),
+            block_seq=block_seq,
+            out_dtype=q.dtype,
+        )
+
+    # Tuning parameters resolved from the config: BLOCK_N is the KV tile width per
+    # inner step (it must evenly divide BLOCK_SEQ); num_warps and num_stages are
+    # Triton launch hints.
+    BLOCK_N = run_config["BLOCK_N"]
+    num_warps = run_config["num_warps"]
+    num_stages = run_config["num_stages"]
+
     assert q.dim() == 3 and k.dim() == 3 and v.dim() == 3
     BLOCK_SEQ = block_seq
-    BLOCK_N = 16
     assert BLOCK_SEQ % BLOCK_N == 0
     # shape constraints
     Lq, Lk = q.shape[-1], k.shape[-1]
@@ -189,7 +275,7 @@ def flash_decode_stage1(
     if BLOCK_HEAD * BLOCK_BATCH < 16:
         BLOCK_BATCH = 16 // BLOCK_HEAD
 
-    _fwd_kernel_flash_decode_stage1[grid](
+    _fwd_kernel_flash_decode_diverse_stage1[grid](
         Q=q,
         stride_qbs=q.stride(0),
         stride_qh=q.stride(1),
@@ -227,7 +313,7 @@ def flash_decode_stage1(
         BLOCK_N=BLOCK_N,
         BLOCK_BATCH=BLOCK_BATCH,
         KV_QUANT_GROUP_SIZE=KV_QUANT_GROUP_SIZE,
-        num_warps=2,
-        num_stages=2,
+        num_warps=num_warps,
+        num_stages=num_stages,
     )
     return