Skip to content

Commit 32651c7

Browse files
committed
Merge branch 'ds_moe' of https://github.com/ModelTC/lightllm into ds_moe
2 parents (10313cb + d8a143f) — commit 32651c7

File tree

3 files changed

+12
-3
lines changed

3 files changed

+12
-3
lines changed

lightllm/common/fused_moe/moe_silu_and_mul.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ def _get_silu_and_mul_configs():
6868
{"BLOCK_M": bm, "BLOCK_N": bn, "num_warps": nw, "NUM_STAGES": ns}
6969
for ns in [1, 2, 4]
7070
for nw in [1, 4, 8]
71-
for bm in [32, 64, 128, 256]
71+
for bm in [1, 8, 32, 64, 128, 256]
7272
for bn in [32, 64, 128, 256]
7373
]
7474

lightllm/common/quantization/deepgemm_quant.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,5 +66,14 @@ def apply(self, input_tensor, weights, bias=None, out=None, workspace=None, use_
6666

6767
if out is None:
6868
out = alloc_func((m, n), dtype=input_tensor.dtype, device=input_tensor.device)
69-
deep_gemm.gemm_fp8_fp8_bf16_nt([qinput_tensor, input_scale], [qweight.t(), weight_scale.t()], out)
69+
_deepgemm_fp8_nt((qinput_tensor, input_scale), (qweight.t(), weight_scale.t()), out)
7070
return out
71+
72+
73+
def _deepgemm_fp8_nt(a_tuple, b_tuple, out):
74+
if HAS_DEEPGEMM:
75+
if hasattr(deep_gemm, "gemm_fp8_fp8_bf16_nt"):
76+
return deep_gemm.gemm_fp8_fp8_bf16_nt([a_tuple[0], a_tuple[1]], [b_tuple[0], b_tuple[1]], out)
77+
if hasattr(deep_gemm, "fp8_gemm_nt"):
78+
return deep_gemm.fp8_gemm_nt((a_tuple[0], a_tuple[1]), (b_tuple[0], b_tuple[1]), out)
79+
raise RuntimeError("deep_gemm does not provide fp8 NT GEMM kernel in this version")

lightllm/common/triton_utils/autotuner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ def _try_load_cache(self, static_key):
192192
self.cached_configs[static_key] = orjson.loads(f.read())
193193
return
194194

195-
def _bench(self, *args, n_repeat=3, n_retries=5, **kwargs):
195+
def _bench(self, *args, n_repeat=3, n_retries=3, **kwargs):
196196
from triton.compiler.errors import CompileTimeAssertionFailure
197197
from triton.runtime.errors import OutOfResources, PTXASError
198198

0 commit comments

Comments (0)