Commit 2853168

fix
1 parent b9cc80d commit 2853168

File tree

23 files changed: +217 -1492 lines changed

lightllm/common/basemodel/basemodel.py

Lines changed: 4 additions & 1 deletion

@@ -723,6 +723,7 @@ def _check_max_len_infer(self):
         return
 
     def autotune_layers(self):
+        # Controls how many layers autotune runs over, to adapt to different models
         return self.config.get("first_k_dense_replace", 0) + 1
 
     @final
@@ -749,7 +750,7 @@ def _autotune_warmup(self):
         for input_len in warmup_lengths:
             try:
                 logger.info(f"autotune warmup for length {input_len}")
-                dummy_input_ids = torch.ones(input_len, dtype=torch.int32, device="cuda")
+                dummy_input_ids = torch.randint(0, 10000, (input_len,), dtype=torch.int32, device="cuda")
                 b_req_idx = torch.tensor([self.req_manager.alloc()], dtype=torch.int32, device="cuda")
                 mem_indexes = self.mem_manager.alloc(len(dummy_input_ids)).cuda()
                 b_seq_len = torch.ones(1, dtype=torch.int32, device="cuda")
@@ -777,11 +778,13 @@ def _autotune_warmup(self):
                 del model_output
                 self.req_manager.free_all()
                 self.mem_manager.free_all()
+                torch.cuda.empty_cache()
                 logger.info(f"autotune warmup for length {input_len} ok")
             except Exception as e:
                 logger.warning(f"autotune warmup for length {input_len} failed: {str(e)}")
                 self.req_manager.free_all()
                 self.mem_manager.free_all()
+                torch.cuda.empty_cache()
         self.layers_num = layer_num_bak
         torch.distributed.barrier()
         os.environ["LIGHTLLM_TRITON_AUTOTUNE"] = "0"
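
Editor's note: this hunk switches the warmup input from all-ones to random token ids and releases the CUDA caching allocator between warmup lengths. A minimal sketch of that pattern, assuming a CUDA device; warmup_once, forward_fn, and vocab_size are illustrative stand-ins, not LightLLM APIs.

import torch

def warmup_once(forward_fn, input_len: int, vocab_size: int = 10000) -> bool:
    # Run one warmup forward pass with random token ids; return True on success.
    try:
        # Random ids exercise more realistic paths than the old all-ones input.
        dummy_input_ids = torch.randint(0, vocab_size, (input_len,), dtype=torch.int32, device="cuda")
        out = forward_fn(dummy_input_ids)
        del out
        return True
    except Exception as exc:
        print(f"warmup for length {input_len} failed: {exc}")
        return False
    finally:
        # Mirror the commit's empty_cache() calls: drop cached blocks so the next
        # (possibly larger) warmup length starts from a clean allocator state.
        torch.cuda.empty_cache()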

lightllm/common/fused_moe/grouped_fused_moe.py

Lines changed: 22 additions & 10 deletions

@@ -30,6 +30,7 @@
     get_device_sm_shared_mem_num,
     get_device_warp_size,
 )
+from .moe_kernel_configs import MoeGroupedGemmKernelConfig
 from .moe_silu_and_mul import silu_and_mul_fwd
 from .moe_sum_reduce import moe_sum_reduce
 from lightllm.common.quantization.triton_quant.fp8.fp8act_quant_kernel import per_token_group_quant_fp8
@@ -117,7 +118,7 @@ def moe_align1_kernel(
     experts_topk_weight_stride0,
     experts_topk_weight_stride1,
     TOKEN_BLOCK_SIZE: tl.constexpr,
-    NUM_STAGE: tl.constexpr,
+    num_stages: tl.constexpr,
 ):
 
     expert_id = tl.program_id(axis=0)
@@ -126,7 +127,7 @@ def moe_align1_kernel(
 
     pre_sum = 0
 
-    for start_loc in tl.range(0, experts_info_n, TOKEN_BLOCK_SIZE, num_stages=NUM_STAGE):
+    for start_loc in tl.range(0, experts_info_n, TOKEN_BLOCK_SIZE, num_stages=num_stages):
         n_range = start_loc + off_n
         topk_weights_data = tl.load(topk_weights + n_range, mask=n_range < experts_info_n, other=0)
         expert_data = tl.load(
@@ -212,7 +213,7 @@ def moe_align1(
         experts_weight_info.stride(0),
         experts_weight_info.stride(1),
         TOKEN_BLOCK_SIZE=TOKEN_BLOCK_SIZE,
-        NUM_STAGE=4,
+        num_stages=4,
         num_warps=8,
         num_stages=1,
     )
@@ -478,8 +479,8 @@ def get_grouped_matmul_static_key(
             "BLOCK_SIZE_N": bn,
             "BLOCK_SIZE_K": bk,
             "GROUP_SIZE_M": gm,
-            "NUM_WARPS": nw,
-            "NUM_STAGE": ns,
+            "num_warps": nw,
+            "num_stages": ns,
         }
         for ns in [1, 2, 3, 4, 5]
         for gm in [1, 2, 4, 8]
@@ -493,8 +494,8 @@ def get_grouped_matmul_static_key(
         "BLOCK_SIZE_N": 64,
         "BLOCK_SIZE_K": 32,
         "GROUP_SIZE_M": 8,
-        "NUM_WARPS": 4,
-        "NUM_STAGE": 1,
+        "num_warps": 4,
+        "num_stages": 1,
     },
     static_key_func=get_grouped_matmul_static_key,
     run_key_func=lambda token_num_mul_topk_num: str(nearest_power_of_2(token_num_mul_topk_num)),
@@ -536,7 +537,6 @@ def grouped_matmul(
     assert expert_to_token_num.is_contiguous()
     assert expert_to_weights.is_contiguous()
     assert expert_weights.is_contiguous()
-    assert run_config is not None
 
     # for deepseek_v3 block-wise quant
     block_size_n = 0
@@ -546,12 +546,24 @@ def grouped_matmul(
         block_size_n = expert_weights.shape[1] // expert_to_weights_scale.shape[1]
         block_size_k = expert_weights.shape[2] // expert_to_weights_scale.shape[2]
 
+    if run_config is None:
+        run_config = MoeGroupedGemmKernelConfig.try_to_get_best_config(
+            M=token_inputs.shape[0],
+            N=n,
+            K=k,
+            topk_num=topk_num,
+            expert_num=expert_num,
+            mul_routed_weight=mul_routed_weight,
+            use_fp8_w8a8=use_fp8_w8a8,
+            out_dtype=str(out.dtype),
+        )
+
     BLOCK_SIZE_M = run_config["BLOCK_SIZE_M"]
     BLOCK_SIZE_N = run_config["BLOCK_SIZE_N"]
     BLOCK_SIZE_K = run_config["BLOCK_SIZE_K"]
     GROUP_SIZE_M = run_config["GROUP_SIZE_M"]
-    num_warps = run_config["NUM_WARPS"]
-    num_stages = run_config["NUM_STAGE"]
+    num_warps = run_config["num_warps"]
+    num_stages = run_config["num_stages"]
 
     if block_size_k != 0:
         # If block-wise quantization is used, the tile size must not exceed the block size
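
Editor's note: the hunks above rename NUM_WARPS/NUM_STAGE to Triton's lowercase num_warps/num_stages launch keys and make run_config optional, falling back to MoeGroupedGemmKernelConfig.try_to_get_best_config. A minimal sketch of that fallback pattern; the default table and resolve_run_config are hypothetical stand-ins for the real tuned-config lookup.

from typing import Dict, Optional

# Hypothetical default; the real lookup is keyed on M, N, K, topk_num, expert_num, etc.
_DEFAULT_GEMM_CONFIG: Dict[str, int] = {
    "BLOCK_SIZE_M": 64,
    "BLOCK_SIZE_N": 64,
    "BLOCK_SIZE_K": 32,
    "GROUP_SIZE_M": 8,
    "num_warps": 4,   # lowercase keys now match Triton's launch meta-parameters
    "num_stages": 1,
}

def resolve_run_config(run_config: Optional[Dict[str, int]],
                       tuned: Dict[str, Dict[str, int]],
                       run_key: str) -> Dict[str, int]:
    # Prefer an explicit caller config, then a tuned entry for this shape key,
    # then a safe default, so the kernel no longer asserts run_config is not None.
    if run_config is not None:
        return run_config
    return tuned.get(run_key, _DEFAULT_GEMM_CONFIG)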

lightllm/common/fused_moe/moe_silu_and_mul.py

Lines changed: 7 additions & 1 deletion

@@ -79,18 +79,24 @@ def _silu_and_mul_kernel_fast(
 def silu_and_mul_fwd(input: torch.Tensor, output: torch.Tensor, run_config=None):
     assert input.is_contiguous()
     assert output.is_contiguous()
-    assert run_config is not None
+
     stride_input_m = input.stride(0)
     stride_input_n = input.stride(1)
     stride_output_m = output.stride(0)
     stride_output_n = output.stride(1)
     size_m = input.shape[0]
     size_n = input.shape[-1] // 2
 
+    if not run_config:
+        run_config = MoeSiluAndMulKernelConfig.try_to_get_best_config(M=size_m, N=size_n, out_dtype=str(output.dtype))
+
     BLOCK_M = run_config["BLOCK_M"]
     BLOCK_N = run_config["BLOCK_N"]
     num_warps = run_config["num_warps"]
     NUM_STAGES = run_config["NUM_STAGES"]
+    # limit the grid size to avoid the invalid argument error of triton
+    while triton.cdiv(size_m, BLOCK_M) > 8192:
+        BLOCK_M *= 2
 
     grid = (
         triton.cdiv(size_n, BLOCK_N),
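
Editor's note: besides the same run_config fallback, this hunk caps the launch grid by doubling BLOCK_M until the number of row blocks is at most 8192. A minimal sketch of that clamp; 8192 is simply the threshold the commit uses, not a limit derived here.

import math

def clamp_block_m(size_m: int, block_m: int, max_row_blocks: int = 8192) -> int:
    # Grow BLOCK_M until ceil(size_m / BLOCK_M) fits under max_row_blocks,
    # mirroring the while-loop added above to avoid over-large Triton grids.
    while math.ceil(size_m / block_m) > max_row_blocks:
        block_m *= 2
    return block_m

# Example: ten million rows with BLOCK_M=64 would need 156250 row blocks,
# so the clamp doubles BLOCK_M up to 2048 (4883 row blocks).
assert clamp_block_m(10_000_000, 64) == 2048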

lightllm/common/fused_moe/moe_sum_reduce.py

Lines changed: 5 additions & 0 deletions

@@ -72,6 +72,11 @@ def moe_sum_reduce(input: torch.Tensor, output: torch.Tensor, run_config: Dict =
     token_num, topk_num, hidden_dim = input.shape
     assert output.shape[0] == token_num and output.shape[1] == hidden_dim
 
+    if not run_config:
+        run_config = MoeSumReduceKernelConfig.try_to_get_best_config(
+            M=token_num, topk_num=topk_num, hidden_dim=hidden_dim, out_dtype=str(output.dtype)
+        )
+
     BLOCK_M = run_config["BLOCK_M"]
     BLOCK_DIM = run_config["BLOCK_DIM"]
     NUM_STAGE = run_config["NUM_STAGE"]
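
Editor's note: with the same fallback in moe_sum_reduce, callers can now omit run_config. A hypothetical usage sketch, assuming a CUDA device and that the import path below is unchanged; the shapes follow the asserts in the hunk.

import torch
from lightllm.common.fused_moe.moe_sum_reduce import moe_sum_reduce

token_num, topk_num, hidden_dim = 8, 4, 1024
x = torch.randn(token_num, topk_num, hidden_dim, dtype=torch.bfloat16, device="cuda")
out = torch.empty(token_num, hidden_dim, dtype=torch.bfloat16, device="cuda")
# No run_config: the kernel resolves one via MoeSumReduceKernelConfig.try_to_get_best_config.
moe_sum_reduce(x, out)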

lightllm/common/triton_utils/all_kernel_configs/triton_3.3.1/NVIDIA H200/grouped_matmul:v1/K=192,N=4096,expert_num=128,mul_routed_weight=True,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=False.json

Lines changed: 0 additions & 122 deletions
This file was deleted.
