
Commit 0a79f6e

fix
1 parent a6b2f08 commit 0a79f6e

5 files changed: 17 additions and 8 deletions

lightllm/common/fused_moe/grouped_fused_moe.py

Lines changed: 2 additions & 2 deletions
@@ -450,7 +450,7 @@ def grouped_matmul_kernel(
     return
 
 
-def get_grouped_matmul_static_key(
+def _get_grouped_matmul_static_key(
     expert_weights: torch.Tensor,
     topk_num: int,
     out: torch.Tensor,
@@ -489,7 +489,7 @@ def get_grouped_matmul_static_key(
         for bn in [16, 32, 64, 128]
         for bk in [16, 32, 64, 128]
     ],
-    static_key_func=get_grouped_matmul_static_key,
+    static_key_func=_get_grouped_matmul_static_key,
     run_key_func=lambda token_num_mul_topk_num: str(nearest_power_of_2(token_num_mul_topk_num)),
 )
 def grouped_matmul(
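
The decorator arguments shown above split the tuning cache key in two: static_key_func captures call-invariant properties (weight shapes, topk, dtypes) that select a tuned-config table, while run_key_func buckets the per-call size through nearest_power_of_2 so nearby sizes reuse the same config. The same helper rename (leading underscore to mark the function as module-private) and decorator rewiring repeats in the next two files. A minimal self-contained sketch of that split, using placeholder names and a local power-of-two helper rather than lightllm's actual autotuner API:

import torch


def _next_power_of_2(n: int) -> int:
    # Local stand-in for lightllm's nearest_power_of_2 bucketing helper.
    return 1 << max(n - 1, 0).bit_length()


def _example_static_key(weights: torch.Tensor, out: torch.Tensor) -> str:
    # Call-invariant part of the cache key: depends on model shapes/dtypes only.
    return f"expert_num={weights.shape[0]},hidden={weights.shape[-1]},dtype={out.dtype}"


def _example_run_key(token_num_mul_topk_num: int) -> str:
    # Per-call part of the cache key: the varying size, bucketed to a power of two.
    return str(_next_power_of_2(token_num_mul_topk_num))


weights = torch.empty(8, 256, 1024)                  # (expert_num, inter_dim, hidden_dim), illustrative
out = torch.empty(300, 1024, dtype=torch.bfloat16)
print(_example_static_key(weights, out))             # expert_num=8,hidden=1024,dtype=torch.bfloat16
print(_example_run_key(300 * 8))                     # "4096": 2400 rounds up to the next power of two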

lightllm/common/fused_moe/moe_sum_reduce.py

Lines changed: 2 additions & 2 deletions
@@ -48,7 +48,7 @@ def _moe_sum_reduce_kernel(
     tl.store(store_t_ptr, accumulator.to(input_ptr.dtype.element_ty), mask=offs_dim < dim_end)
 
 
-def get_static_key(input, output):
+def _get_static_key(input, output):
     return f"topk_num={input.shape[1]},hidden_dim={input.shape[2]},out_dtype={output.dtype}"
 
 
@@ -61,7 +61,7 @@ def get_static_key(input, output):
         for bm in [1, 2, 4, 8, 16, 32]
         for bd in [64, 128, 256, 512, 1024]
     ],
-    static_key_func=get_static_key,
+    static_key_func=_get_static_key,
     run_key_func=lambda input: str(nearest_power_of_2(input.shape[0])),
 )
 def moe_sum_reduce(input: torch.Tensor, output: torch.Tensor, run_config: Dict = None):
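
Here the static key is fully visible in the diff: it depends only on the topk and hidden dimensions of the 3-D input plus the output dtype, so the token count can vary without invalidating cached configs; the token count is handled by the power-of-two run key instead. For example, with illustrative shapes, evaluating the same f-string as _get_static_key above:

import torch

# (token_num, topk_num, hidden_dim), presumably reduced over the topk dimension;
# the parameter names mirror the diff, so `input` shadows the builtin here.
input = torch.empty(300, 8, 7168, dtype=torch.bfloat16)
output = torch.empty(300, 7168, dtype=torch.bfloat16)

print(f"topk_num={input.shape[1]},hidden_dim={input.shape[2]},out_dtype={output.dtype}")
# -> topk_num=8,hidden_dim=7168,out_dtype=torch.bfloat16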

lightllm/common/quantization/triton_quant/fp8/fp8w8a8_block_gemm_kernel.py

Lines changed: 2 additions & 2 deletions
@@ -165,7 +165,7 @@ def get_test_configs():
     return fp8_gemm_configs
 
 
-def get_static_key(A, B, block_size, dtype):
+def _get_static_key(A, B, block_size, dtype):
     M, K = A.shape
     _, N = B.shape
     return {
@@ -179,7 +179,7 @@ def get_static_key(A, B, block_size, dtype):
 @autotune(
     name="w8a8_block_fp8_matmul:v1",
     configs=get_test_configs(),
-    static_key_func=get_static_key,
+    static_key_func=_get_static_key,
     run_key_func=lambda M: str(nearest_power_of_2(M)),
 )
 def w8a8_block_fp8_matmul(

lightllm/common/triton_utils/autotuner.py

Lines changed: 0 additions & 2 deletions
@@ -89,8 +89,6 @@ def __init__(
         warmup=None,
         rep=None,
     ):
-        # Whether to print autotune logs
-        self.print_autotune = os.environ.get("LIGHTLLM_TRITON_PRINT_AUTOTUNE", "0") == "1"
         # Whether to use this autotune decorator
         self.disable_autotune = os.environ.get("DISABLE_AUTOTUNE_DECORATOR", "0") == "1"

lightllm/models/deepseek2/triton_kernel/rotary_emb.py

Lines changed: 11 additions & 0 deletions
@@ -132,6 +132,17 @@ def rotary_emb_fwd(q, k, cos, sin, run_config=None):
     assert k.shape[0] == cos.shape[0] and k.shape[0] == sin.shape[0], f"k shape {k.shape} cos shape {cos.shape}"
     assert triton.next_power_of_2(head_dim) == head_dim
 
+    from .rotary_emb_config import DeepseekV3RotaryKernelConfig
+
+    if not run_config:
+        run_config = DeepseekV3RotaryKernelConfig.try_to_get_best_config(
+            M=total_len,
+            Q_HEAD_NUM=head_num_q,
+            K_HEAD_NUM=head_num_k,
+            HEAD_DIM=head_dim,
+            dtype=str(q.dtype),
+        )
+
     BLOCK_SEQ = run_config["BLOCK_SEQ"]
     HEAD_PARALLEL_NUM = run_config["HEAD_PARALLEL_NUM"]
     num_warps = run_config["num_warps"]
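
The added lines give rotary_emb_fwd the same fallback behaviour as the other tuned kernels: an explicitly passed run_config wins, otherwise the kernel asks DeepseekV3RotaryKernelConfig for the best config stored for the current sizes and dtype. A minimal self-contained sketch of that lookup-with-fallback pattern (ExampleKernelConfig, its storage, and the default values are placeholders, not the actual lightllm implementation; only the config keys BLOCK_SEQ, HEAD_PARALLEL_NUM, and num_warps are taken from the diff):

from typing import Dict, Optional


class ExampleKernelConfig:
    # Placeholder for a per-kernel store of previously tuned configs.
    _store: Dict[str, Dict] = {}

    @classmethod
    def try_to_get_best_config(cls, **static_key) -> Dict:
        key = ",".join(f"{k}={v}" for k, v in sorted(static_key.items()))
        # Fall back to a conservative default when nothing was tuned for this key.
        return cls._store.get(key, {"BLOCK_SEQ": 16, "HEAD_PARALLEL_NUM": 1, "num_warps": 4})


def example_rotary_launch(total_len: int, head_dim: int, run_config: Optional[Dict] = None) -> Dict:
    # Same control flow as the added lines: a caller-supplied config takes priority.
    if not run_config:
        run_config = ExampleKernelConfig.try_to_get_best_config(
            M=total_len, HEAD_DIM=head_dim, dtype="torch.bfloat16"
        )
    return run_config


print(example_rotary_launch(total_len=256, head_dim=64))
# {'BLOCK_SEQ': 16, 'HEAD_PARALLEL_NUM': 1, 'num_warps': 4}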
