add deepseekv3 rotary config

hiworldwzj · hiworldwzj · commit 4046deb3a7e6 · 2025-08-17T12:51:18.000+08:00
diff --git a/lightllm/models/deepseek2/triton_kernel/rotary_emb.py b/lightllm/models/deepseek2/triton_kernel/rotary_emb.py
@@ -42,18 +42,18 @@ def _rotary_kernel(
         sin = tl.load(Sin + off_dimcos_sin)
 
         for q_head_index in tl.static_range(0, HEAD_Q, step=1):
-            off_q0 = (seq_index * stride_qbs + q_head_index * stride_qh + dim_range0 * stride_qd)
-            off_q1 = (seq_index * stride_qbs + q_head_index * stride_qh + dim_range1 * stride_qd)
+            off_q0 = seq_index * stride_qbs + q_head_index * stride_qh + dim_range0 * stride_qd
+            off_q1 = seq_index * stride_qbs + q_head_index * stride_qh + dim_range1 * stride_qd
             q0 = tl.load(Q + off_q0)
             q1 = tl.load(Q + off_q1)
             out_q0 = q0 * cos - q1 * sin
             out_q1 = q0 * sin + q1 * cos
             tl.store(Q + off_q0, out_q0)
             tl.store(Q + off_q1, out_q1)
-        
+
         for k_head_index in tl.static_range(0, HEAD_K, step=1):
-            off_k0 = (seq_index * stride_kbs + k_head_index * stride_kh + dim_range0 * stride_kd)
-            off_k1 = (seq_index * stride_kbs + k_head_index * stride_kh + dim_range1 * stride_kd)
+            off_k0 = seq_index * stride_kbs + k_head_index * stride_kh + dim_range0 * stride_kd
+            off_k1 = seq_index * stride_kbs + k_head_index * stride_kh + dim_range1 * stride_kd
 
             k0 = tl.load(K + off_k0)
             k1 = tl.load(K + off_k1)
@@ -67,21 +67,38 @@ def _rotary_kernel(
 
 
 @torch.no_grad()
-def rotary_emb_fwd(q, k, cos, sin):
+def rotary_emb_fwd(q, k, cos, sin, **run_config):
+    """Launch ``_rotary_kernel`` for ``q`` and ``k`` using tuned or caller-supplied launch settings.
+
+    ``q`` is read as (tokens, heads, head_dim) and ``k`` as (tokens, heads, ...). ``cos`` and
+    ``sin`` must share the token dimension, and ``head_dim`` must be a power of two (asserted).
+
+    ``run_config`` optionally overrides the launch settings; when given it must contain
+    ``BLOCK_SEQ``, ``num_warps`` and ``num_stages``. When it is empty the settings come from
+    ``DeepseekV3RotaryKernelConfig.try_to_get_best_config`` for this shape and dtype.
+    """
     total_len = q.shape[0]
     head_num_q, head_num_k = q.shape[1], k.shape[1]
     head_dim = q.shape[2]
     assert q.shape[0] == cos.shape[0] and q.shape[0] == sin.shape[0], f"q shape {q.shape} cos shape {cos.shape}"
     assert k.shape[0] == cos.shape[0] and k.shape[0] == sin.shape[0], f"k shape {k.shape} cos shape {cos.shape}"
     assert triton.next_power_of_2(head_dim) == head_dim
 
-    if total_len <= 512:
-        BLOCK_SEQ = 1
-    else:
-        BLOCK_SEQ = 16
+    from .rotary_emb_config import DeepseekV3RotaryKernelConfig
+
+    if not run_config:
+        run_config = DeepseekV3RotaryKernelConfig.try_to_get_best_config(
+            M=total_len,
+            Q_HEAD_NUM=head_num_q,
+            K_HEAD_NUM=head_num_k,
+            HEAD_DIM=head_dim,
+            # The keyword must be `dtype`: that is the parameter name try_to_get_best_config declares.
+            dtype=str(q.dtype),
+        )
 
-    num_warps = 1
-    num_stages = 3
+    BLOCK_SEQ = run_config["BLOCK_SEQ"]
+    num_warps = run_config["num_warps"]
+    num_stages = run_config["num_stages"]
 
     grid = (triton.cdiv(total_len, BLOCK_SEQ),)
     _rotary_kernel[grid](
diff --git a/lightllm/models/deepseek2/triton_kernel/rotary_emb_config.py b/lightllm/models/deepseek2/triton_kernel/rotary_emb_config.py
@@ -0,0 +1,63 @@
+import os
+from frozendict import frozendict
+from functools import lru_cache
+from lightllm.common.kernel_config import KernelConfigs
+from lightllm.utils.log_utils import init_logger
+
+logger = init_logger(__name__)
+
+
+class DeepseekV3RotaryKernelConfig(KernelConfigs):
+    """Lookup and storage of launch configs for the DeepSeek-V3 rotary embedding kernel.
+
+    Both methods build the same frozendict key from (M, Q_HEAD_NUM, K_HEAD_NUM, HEAD_DIM,
+    dtype) and delegate to the inherited ``get_the_config`` / ``store_config``.
+    """
+
+    kernel_name: str = "deepseek_v3_rotary_emb_kernel"
+
+    @classmethod
+    @lru_cache(maxsize=200)
+    def try_to_get_best_config(
+        cls,
+        M: int,
+        Q_HEAD_NUM: int,
+        K_HEAD_NUM: int,
+        HEAD_DIM: int,
+        dtype: str,
+    ) -> dict:
+        """Return the launch config to use for this shape and dtype.
+
+        When ``get_the_config`` finds stored configs, the one whose integer key is closest to
+        ``M`` is returned. Otherwise a default is returned: ``BLOCK_SEQ`` 1 for ``M <= 256``,
+        else 16, with ``num_warps`` and ``num_stages`` both 1. Calls are memoised by ``lru_cache``.
+        """
+        # NOTE(review): M is part of the lookup key and also drives the closest-key pick below;
+        # confirm that keying the stored configs on the exact M is intended.
+        lookup_key = frozendict(
+            {"M": M, "Q_HEAD_NUM": Q_HEAD_NUM, "K_HEAD_NUM": K_HEAD_NUM, "HEAD_DIM": HEAD_DIM, "dtype": str(dtype)}
+        )
+        tuned = cls.get_the_config(lookup_key)
+        if not tuned:
+            block_seq = 1 if M <= 256 else 16
+            return {"BLOCK_SEQ": block_seq, "NUM_STAGE": 1, "num_warps": 1, "num_stages": 1}
+
+        closest = min(tuned.keys(), key=lambda stored_m: abs(int(stored_m) - M))
+        return tuned[closest]
+
+    @classmethod
+    def save_config(
+        cls,
+        M: int,
+        Q_HEAD_NUM: int,
+        K_HEAD_NUM: int,
+        HEAD_DIM: int,
+        dtype: str,
+        config_json: dict,
+    ):
+        """Store ``config_json`` under the key for this shape and dtype via ``store_config``."""
+        # Same key layout as try_to_get_best_config, so both methods address the same entry.
+        store_key = frozendict(
+            {"M": M, "Q_HEAD_NUM": Q_HEAD_NUM, "K_HEAD_NUM": K_HEAD_NUM, "HEAD_DIM": HEAD_DIM, "dtype": str(dtype)}
+        )
+        return cls.store_config(store_key, config_json)
diff --git a/lightllm/utils/tuning_utils.py b/lightllm/utils/tuning_utils.py
@@ -54,7 +54,7 @@ def mp_tuning(func, args: Dict[str, Any]):
                 best_cost_time = _cost_time
                 best_config = _config
 
-    logger.info(f"best config {best_config} best cost time {best_cost_time}")
+    logger.info(f"args: {args} best config {best_config} best cost time {best_cost_time}")
     return best_config
 
 
diff --git a/test/kernel/deepseekv3_rotary_emb_tuning.py b/test/kernel/deepseekv3_rotary_emb_tuning.py
@@ -0,0 +1,242 @@
+import os
+import torch
+import time
+import torch.multiprocessing as mp
+import itertools
+from lightllm.models.deepseek2.triton_kernel.rotary_emb import rotary_emb_fwd
+from lightllm.models.deepseek2.triton_kernel.rotary_emb_config import DeepseekV3RotaryKernelConfig
+from lightllm.utils.watchdog_utils import Watchdog
+from typing import List
+from lightllm.utils.log_utils import init_logger
+
+logger = init_logger(__name__)
+
+
+def set_seed():
+    """Seed torch (CPU and CUDA), random and numpy with the fixed value 42."""
+    import random
+    import numpy as np
+    import torch
+
+    fixed_seed = 42
+    for seed_fn in (torch.manual_seed, random.seed, np.random.seed):
+        seed_fn(fixed_seed)
+    if not torch.cuda.is_available():
+        return
+    torch.cuda.manual_seed(fixed_seed)
+    torch.cuda.manual_seed_all(fixed_seed)
+
+
+@torch.no_grad()
+def test_kernel(
+    M: int,
+    Q_HEAD_NUM: int,
+    K_HEAD_NUM: int,
+    HEAD_DIM: int,
+    dtype: torch.dtype,
+    test_count: int,
+    **config,
+):
+    """Time ``test_count`` captured ``rotary_emb_fwd`` launches under one launch ``config``.
+
+    The launches are recorded into a CUDA graph, replayed once untimed, then replayed again
+    between CUDA events; that second replay's elapsed time (ms) is returned.
+    """
+    set_seed()
+    q = torch.randn((M, Q_HEAD_NUM, HEAD_DIM), device="cuda", dtype=dtype) / 10
+    k = torch.randn((M, K_HEAD_NUM, HEAD_DIM), device="cuda", dtype=dtype) / 10
+    cos = torch.randn((M, HEAD_DIM // 2), device="cuda", dtype=dtype)
+    sin = torch.randn((M, HEAD_DIM // 2), device="cuda", dtype=dtype)
+
+    # One private copy of the inputs per captured launch, taken before the warm-up call below.
+    captured_inputs = [(q.clone(), k.clone(), cos.clone(), sin.clone()) for _ in range(test_count)]
+
+    # warm_up
+    rotary_emb_fwd(q=q, k=k, cos=cos, sin=sin, **config)
+
+    graph = torch.cuda.CUDAGraph()
+    with torch.cuda.graph(graph):
+        for q_i, k_i, cos_i, sin_i in captured_inputs:
+            rotary_emb_fwd(q=q_i, k=k_i, cos=cos_i, sin=sin_i, **config)
+
+    graph.replay()
+    torch.cuda.synchronize()
+
+    timer_begin = torch.cuda.Event(enable_timing=True)
+    timer_end = torch.cuda.Event(enable_timing=True)
+    timer_begin.record()
+    graph.replay()
+    timer_end.record()
+    timer_end.synchronize()
+    cost_time = timer_begin.elapsed_time(timer_end)
+
+    logger.info(str(config))
+    logger.info(f"bf16 {M} cost time: {cost_time} ms")
+    return cost_time
+
+
+def worker(
+    M: int,
+    Q_HEAD_NUM: int,
+    K_HEAD_NUM: int,
+    HEAD_DIM: int,
+    dtype: torch.dtype,
+    test_count: int,
+    test_configs,
+    queue,
+):
+    """Benchmark every entry of ``test_configs`` in order, putting each cost time on ``queue``.
+
+    A ``Watchdog(timeout=10)`` is started first and receives a heartbeat after each config.
+    If anything raises, the traceback is logged and the process exits via ``sys.exit(-1)``,
+    so configs after the failing one produce no queue entry.
+    Results are put in the same order as ``test_configs``.
+    """
+    import sys
+
+    shape_args = dict(M=M, Q_HEAD_NUM=Q_HEAD_NUM, K_HEAD_NUM=K_HEAD_NUM, HEAD_DIM=HEAD_DIM, dtype=dtype)
+
+    dog = Watchdog(timeout=10)
+    dog.start()
+    try:
+        for t_config in test_configs:
+            cost_time = test_kernel(test_count=test_count, **shape_args, **t_config)
+            dog.heartbeat()
+            queue.put(cost_time)  # Put result in queue
+    except Exception as ex:
+        # logger.exception logs the message at ERROR level with the traceback, so a
+        # separate logger.error call for the same message is not needed.
+        logger.exception(str(ex))
+        sys.exit(-1)
+
+
+def get_test_configs(split_id, split_count):
+    """Yield this shard's slice of the BLOCK_SEQ x num_warps x num_stages search grid.
+
+    Candidates are dealt round-robin: candidate ``i`` goes to the shard with
+    ``i % split_count == split_id``.
+    """
+    grid = itertools.product([1, 2, 4, 8, 16, 32], [1, 2, 4, 8], [1, 2, 3, 4, 5])
+    for index, (BLOCK_SEQ, num_warps, num_stages) in enumerate(grid):
+        if index % split_count == split_id:
+            yield {
+                "BLOCK_SEQ": BLOCK_SEQ,
+                "num_warps": num_warps,
+                "num_stages": num_stages,
+            }
+
+
+def _run_batch_and_collect(worker_args, test_configs, queue, best_config, best_cost_time):
+    """Run ``test_configs`` in one child process and fold its timings into the running best.
+
+    Timings are matched to configs by position, and each matched config is removed from the
+    front of ``test_configs`` (the list is mutated in place). If the queue is empty while
+    configs remain, the next 16 configs are dropped and the rest stay in ``test_configs``
+    for a later batch.
+
+    Args:
+        worker_args: leading positional arguments for ``worker`` (shape, dtype, test_count).
+        test_configs: pending configs; mutated in place.
+        queue: the queue ``worker`` puts its timings on.
+        best_config, best_cost_time: the best result seen so far.
+
+    Returns the updated ``(best_config, best_cost_time)``.
+    """
+    from queue import Empty
+
+    child = mp.Process(target=worker, args=(*worker_args, test_configs, queue))
+    child.start()
+    child.join()
+    while len(test_configs) != 0:
+        try:
+            cost_time = queue.get_nowait()
+        except Empty:
+            # Fewer results than configs. NOTE(review): presumably the worker died part-way;
+            # skipping 16 configs looks like a heuristic to get past the failing region.
+            del test_configs[0:16]
+            logger.info(f"cur best : {best_config} {best_cost_time}")
+            break
+        logger.info(f"get {test_configs[0]} cost_time: {cost_time}")
+        if cost_time < best_cost_time:
+            best_config = test_configs[0]
+            best_cost_time = cost_time
+            logger.info(f"cur best : {best_config} {best_cost_time}")
+        del test_configs[0:1]
+    return best_config, best_cost_time
+
+
+def tuning_configs(
+    device_id: int,  # used for multi-process tuning
+    device_count: int,
+    M: int,
+    Q_HEAD_NUM: int,
+    K_HEAD_NUM: int,
+    HEAD_DIM: int,
+    dtype: torch.dtype,
+    test_count: int,
+):
+    """Search this device's share of the config grid and return ``(best_config, best_cost_time)``.
+
+    ``CUDA_VISIBLE_DEVICES`` is set to ``device_id`` and the candidates come from
+    ``get_test_configs(device_id, device_count)``. They are benchmarked by ``worker`` in
+    child processes: a batch is launched whenever 256 candidates have accumulated, and
+    whatever is still pending afterwards is run until the list is empty.
+
+    Results are compared with ``<`` against an initial bound of 10000000, so
+    ``best_config`` stays ``None`` if no timing came in below that bound. The final
+    best is also logged.
+    """
+    os.environ["CUDA_VISIBLE_DEVICES"] = str(device_id)
+    best_config, best_cost_time = None, 10000000
+    queue = mp.Queue()
+    worker_args = (M, Q_HEAD_NUM, K_HEAD_NUM, HEAD_DIM, dtype, test_count)
+    test_configs = []
+    for t_config in get_test_configs(device_id, device_count):
+        test_configs.append(t_config)
+        if len(test_configs) < 256:
+            continue
+        best_config, best_cost_time = _run_batch_and_collect(
+            worker_args, test_configs, queue, best_config, best_cost_time
+        )
+
+    # Run whatever is still pending; each pass removes at least one config, so this ends.
+    while len(test_configs) != 0:
+        best_config, best_cost_time = _run_batch_and_collect(
+            worker_args, test_configs, queue, best_config, best_cost_time
+        )
+
+    logger.info(f"M {M} {best_config} best cost: {best_cost_time}")
+    return best_config, best_cost_time
+
+
+if __name__ == "__main__":
+    # Tune the DeepSeek-V3 rotary kernel for a fixed list of token counts and save the
+    # winning launch config for each of them.
+    torch.multiprocessing.set_start_method("spawn")
+    from lightllm.utils.tuning_utils import mp_tuning
+
+    # for deepseekv3 600B
+    q_head_num, k_head_num, head_dim = 128, 1, 64
+    dtype = torch.bfloat16
+
+    for m in (1, 128, 256, 1024, 2048, 4096, 8192):
+        tuning_args = {
+            "M": m,
+            "Q_HEAD_NUM": q_head_num,
+            "K_HEAD_NUM": k_head_num,
+            "HEAD_DIM": head_dim,
+            "dtype": dtype,
+            "test_count": 20,
+        }
+        # mp_tuning drives tuning_configs with these arguments and returns the best config.
+        ans = mp_tuning(tuning_configs, tuning_args)
+
+        # A fresh single-entry mapping {m: ans} is saved on every iteration.
+        DeepseekV3RotaryKernelConfig.save_config(
+            M=m,
+            Q_HEAD_NUM=q_head_num,
+            K_HEAD_NUM=k_head_num,
+            HEAD_DIM=head_dim,
+            dtype=str(dtype),
+            config_json={m: ans},
+        )