
Commit 25b877e

fix
1 parent cc2860e commit 25b877e

7 files changed: +58 additions, -53 deletions

lightllm/common/fused_moe/grouped_fused_moe.py

Lines changed: 8 additions & 4 deletions
@@ -471,9 +471,8 @@ def _get_grouped_matmul_static_key(
     )
 
 
-@autotune(
-    name="grouped_matmul:v1",
-    configs=[
+def _get_grouped_matmul_configs():
+    return [
         {
             "BLOCK_SIZE_M": bm,
             "BLOCK_SIZE_N": bn,
@@ -488,7 +487,12 @@ def _get_grouped_matmul_static_key(
         for bm in [16, 32, 64, 128]
         for bn in [16, 32, 64, 128]
         for bk in [16, 32, 64, 128]
-    ],
+    ]
+
+
+@autotune(
+    name="grouped_matmul:v1",
+    configs=_get_grouped_matmul_configs,
     static_key_func=_get_grouped_matmul_static_key,
     run_key_func=lambda token_num_mul_topk_num: str(nearest_power_of_2(token_num_mul_topk_num)),
 )
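
Note: this is the core refactor repeated across the MoE kernels in this commit. The config grid moves out of the decorator call into a module-level factory, and @autotune now receives the function object instead of an eagerly built list, so the comprehension only runs if tuning is actually triggered. A minimal sketch of the pattern with stand-in names (autotune_sketch, _example_configs, grouped_matmul_stub are hypothetical; lightllm's real decorator is more involved):

def autotune_sketch(name, configs):
    # Stand-in decorator: stores `configs` as given and only materializes the
    # grid when the wrapped function is actually called/tuned.
    def decorator(fn):
        def wrapper(*args, **kwargs):
            grid = configs() if callable(configs) else configs
            _ = grid  # a real tuner would benchmark each candidate config here
            return fn(*args, **kwargs)
        return wrapper
    return decorator


def _example_configs():
    # Built lazily, not at import time.
    return [
        {"BLOCK_SIZE_M": bm, "BLOCK_SIZE_N": bn}
        for bm in (16, 32, 64, 128)
        for bn in (16, 32, 64, 128)
    ]


@autotune_sketch(name="grouped_matmul:v1", configs=_example_configs)
def grouped_matmul_stub(*args, **kwargs):
    pass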

lightllm/common/fused_moe/moe_silu_and_mul.py

Lines changed: 8 additions & 4 deletions
@@ -63,15 +63,19 @@ def _silu_and_mul_kernel_fast(
     )
 
 
-@autotune(
-    name="silu_and_mul_fwd:v1",
-    configs=[
+def _get_silu_and_mul_configs():
+    return [
         {"BLOCK_M": bm, "BLOCK_N": bn, "num_warps": nw, "NUM_STAGES": ns}
         for ns in [1, 2, 4]
         for nw in [1, 4, 8]
         for bm in [32, 64, 128, 256]
         for bn in [32, 64, 128, 256]
-    ],
+    ]
+
+
+@autotune(
+    name="silu_and_mul_fwd:v1",
+    configs=_get_silu_and_mul_configs,
     static_key_func=lambda input, output: f"N={input.shape[-1] // 2},out_dtype={output.dtype}",
     run_key_func=lambda input: str(nearest_power_of_2(input.shape[0])),
 )
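
For orientation on what a chosen config does: by the usual Triton convention, lowercase num_warps is a launch option, while uppercase keys such as BLOCK_M, BLOCK_N, and NUM_STAGES are constexpr kernel parameters. A hypothetical wrapper illustrating that split (an assumption about the kernel signature, not lightllm's actual launch code):

def launch_with_config(kernel, grid, run_config, **tensor_args):
    # Hypothetical: separate the tuned config into the Triton launch option
    # `num_warps` and constexpr kernel parameters (BLOCK_M, BLOCK_N, NUM_STAGES).
    cfg = dict(run_config)
    num_warps = cfg.pop("num_warps", 4)
    return kernel[grid](**tensor_args, num_warps=num_warps, **cfg)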

lightllm/common/fused_moe/moe_sum_reduce.py

Lines changed: 8 additions & 4 deletions
@@ -52,15 +52,19 @@ def _get_static_key(input, output):
     return f"topk_num={input.shape[1]},hidden_dim={input.shape[2]},out_dtype={output.dtype}"
 
 
-@autotune(
-    name="moe_sum_reduce:v1",
-    configs=[
+def _get_moe_sum_reduce_configs():
+    return [
         {"BLOCK_M": bm, "BLOCK_DIM": bd, "NUM_STAGE": ns, "num_warps": nw}
         for ns in [1, 2, 4]
         for nw in [1, 2, 4, 8, 16]
         for bm in [1, 2, 4, 8, 16, 32]
         for bd in [64, 128, 256, 512, 1024]
-    ],
+    ]
+
+
+@autotune(
+    name="moe_sum_reduce:v1",
+    configs=_get_moe_sum_reduce_configs,
     static_key_func=_get_static_key,
     run_key_func=lambda input: str(nearest_power_of_2(input.shape[0])),
 )
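
For scale: the two comprehensions that are fully visible in these diffs expand to sizeable grids, which is part of why building them lazily in a factory is attractive. A quick arithmetic check of the candidate counts implied by the ranges above:

import math

# Candidate counts implied by the comprehension bounds shown in the diffs.
silu_and_mul = math.prod([3, 3, 4, 4])    # NUM_STAGES x num_warps x BLOCK_M x BLOCK_N
moe_sum_reduce = math.prod([3, 5, 6, 5])  # NUM_STAGE x num_warps x BLOCK_M x BLOCK_DIM
assert (silu_and_mul, moe_sum_reduce) == (144, 450)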

lightllm/common/quantization/triton_quant/fp8/fp8w8a8_block_gemm_kernel.py

Lines changed: 1 addition & 1 deletion
@@ -178,7 +178,7 @@ def _get_static_key(A, B, block_size, dtype):
 
 @autotune(
     name="w8a8_block_fp8_matmul:v1",
-    configs=get_test_configs(),
+    configs=get_test_configs,
     static_key_func=_get_static_key,
     run_key_func=lambda M: str(nearest_power_of_2(M)),
 )
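
The only change in this file is dropping the call parentheses: configs=get_test_configs() built the list at import time, while configs=get_test_configs hands the decorator a callable to invoke later. A hedged sketch of how a consumer can accept either form (resolve_configs is a hypothetical helper, not necessarily what Autotuner does internally):

from typing import Callable, Dict, List, Union

ConfigList = List[Dict[str, int]]


def resolve_configs(configs: Union[ConfigList, Callable[[], ConfigList]]) -> ConfigList:
    # Invoke a zero-argument callable lazily; pass a prebuilt list through unchanged,
    # so both `configs=get_test_configs` and `configs=get_test_configs()` keep working.
    return list(configs()) if callable(configs) else list(configs)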

lightllm/common/triton_utils/autotuner.py

Lines changed: 21 additions & 39 deletions
@@ -13,6 +13,7 @@
 from lightllm.utils.log_utils import init_logger
 import traceback
 from typing import Callable, Optional, Union, List
+from lightllm.utils.envs_utils import is_triton_autotune_enabled, disable_triton_autotune
 
 logger = init_logger(__name__)
 
@@ -45,16 +46,6 @@ def decorator(fn):
     return decorator
 
 
-def is_triton_autotune_enabled():
-    # Whether Triton autotune is enabled (read-only check)
-    return os.environ.get("LIGHTLLM_TRITON_AUTOTUNE", "0") == "1"
-
-
-def disable_triton_autotune():
-    # Disable Triton autotune (setter)
-    os.environ["LIGHTLLM_TRITON_AUTOTUNE"] = "0"
-
-
 class Autotuner:
     @staticmethod
     def _get_param_names(func):
@@ -148,8 +139,6 @@ def _post_hook(kwargs, exception):
         os.makedirs(self.cache_dir, exist_ok=True)
 
         self._loaded_static_keys = set()
-        self.sorted_cached_configs = {}
-        self.early_stop_cnt = 0
 
     @lru_cache(maxsize=None)
     def _ensure_cache_loaded(self, static_key: str):
@@ -160,15 +149,11 @@ def _ensure_cache_loaded(self, static_key: str):
         try:
             with open(cache_file, "rb") as f:
                 self.cached_configs[static_key] = orjson.loads(f.read())
-                self.sorted_cached_configs[static_key] = [
-                    (int(k), v) for k, v in self.cached_configs[static_key].items()
-                ]
-                self.sorted_cached_configs[static_key].sort(key=lambda x: x[0])
         except Exception:
             self.cached_configs[static_key] = {}
         self._loaded_static_keys.add(static_key)
 
-    def _bench(self, *args, n_repeat=5, n_retries=1, current_best_ms=None, **kwargs):
+    def _bench(self, *args, n_repeat=5, n_retries=1, **kwargs):
         from triton.compiler.errors import CompileTimeAssertionFailure
         from triton.runtime.errors import OutOfResources, PTXASError
 
@@ -234,26 +219,25 @@ def _benchmark(_run_key):
            rank_id = get_global_rank()
            _best_config = self.default_config
            best_time = float("inf")
-           self.early_stop_cnt = 0
+
            bar = tqdm(
                self.configs,
-               desc=f"Autotuning {self.name} for {_run_key}, es:{self.early_stop_cnt / len(self.configs):.2%}",
+               desc=f"Autotuning {self.name} for {_run_key}",
                position=get_global_rank(),
                dynamic_ncols=True,
            )
            enum_configs = enumerate(bar)
            for i, config in enum_configs:
                kwargs_with_config = kwargs.copy()
                kwargs_with_config["run_config"] = config
-               run_time = self._bench(*args, current_best_ms=best_time, **kwargs_with_config)
+               run_time = self._bench(*args, **kwargs_with_config)
                if run_time < best_time:
                    best_time = run_time
                    _best_config = config
                bar.set_description(
-                   f"Autotuning {self.name} [rank:{rank_id}] \
-                   for {_run_key}, es:{self.early_stop_cnt / len(self.configs):.2%}, \
-                   best_time: {best_time:.5f}"
+                   f"Autotuning {self.name} [rank:{rank_id}] for {_run_key}, best_time: {best_time:.5f}"
                )
+
            world_size = dist.get_world_size() if dist.is_initialized() else 1
            if world_size > 1:
                local_best = torch.tensor([best_time], device="cuda")
@@ -268,24 +252,21 @@ def _benchmark(_run_key):
            if static_key not in self.cached_configs:
                self.cached_configs[static_key] = {}
            self.cached_configs[static_key][run_key] = _best_config
-           self.sorted_cached_configs[static_key] = [(int(k), v) for k, v in self.cached_configs[static_key].items()]
-           self.sorted_cached_configs[static_key].sort(key=lambda x: x[0])
 
+           # save configs to file
            if not dist.is_initialized() or get_global_rank() == 0:
-               if os.environ.get("LIGHTLLM_TRITON_AUTOTUNE", "0") == "1":
-                   cache_file = os.path.join(self.cache_dir, f"{static_key}.json")
-                   with open(cache_file, "wb") as f:
-                       fcntl.flock(f, fcntl.LOCK_EX)
-                       try:
-                           f.write(
-                               orjson.dumps(
-                                   self.cached_configs[static_key], option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS
+               cache_file = os.path.join(self.cache_dir, f"{static_key}.json")
+               with open(cache_file, "wb") as f:
+                   fcntl.flock(f, fcntl.LOCK_EX)
+                   try:
+                       f.write(
+                           orjson.dumps(
+                               self.cached_configs[static_key], option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS
                            )
-                   finally:
-                       fcntl.flock(f, fcntl.LOCK_UN)
-                   if self.print_autotune:
-                       logger.info(f"Saved configs for {self.name} - {static_key} - {run_key}")
+                       )
+                   finally:
+                       fcntl.flock(f, fcntl.LOCK_UN)
+               logger.info(f"Saved configs for {self.name} - {static_key} - {run_key}")
 
            kwargs["run_config"] = self.cached_configs[static_key][run_key]
            full_nargs = {**self.nargs, **kwargs}
@@ -294,7 +275,8 @@ def _benchmark(_run_key):
        if static_key not in self.cached_configs:
            if not is_triton_autotune_enabled():
                logger.warning(
-                   f"No kernel config for {self.name} in {self.cache_dir}/{static_key}, using default config",
+                   f"No kernel config for {self.name} - {static_key}, \
+                       using default config. Use `LIGHTLLM_TRITON_AUTOTUNE=1` to enable autotune.",
                )
            self.cached_configs[static_key] = {}
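
Beyond removing the early-stop bookkeeping (early_stop_cnt, sorted_cached_configs, the current_best_ms argument to _bench), the save path now runs whenever rank 0 finishes tuning: the LIGHTLLM_TRITON_AUTOTUNE guard around the write and the print_autotune guard around the log line are gone. Condensed into a standalone sketch, the lock-and-serialize pattern from the new code looks roughly like this (directory, key, and dict contents are placeholders):

import fcntl
import os

import orjson


def save_configs(cache_dir: str, static_key: str, configs: dict) -> None:
    # Mirrors the write path in the diff: exclusive flock around an orjson dump,
    # released in a finally block so a failed dump cannot leave the file locked.
    os.makedirs(cache_dir, exist_ok=True)
    cache_file = os.path.join(cache_dir, f"{static_key}.json")
    with open(cache_file, "wb") as f:
        fcntl.flock(f, fcntl.LOCK_EX)
        try:
            f.write(orjson.dumps(configs, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS))
        finally:
            fcntl.flock(f, fcntl.LOCK_UN)


# Example with placeholder values:
# save_configs("/tmp/lightllm_autotune", "N=128,out_dtype=torch.bfloat16", {"64": {"BLOCK_M": 64}})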

lightllm/models/deepseek2/triton_kernel/rotary_emb.py

Lines changed: 2 additions & 1 deletion
@@ -120,10 +120,11 @@ def get_static_key(q, k):
 
 @autotune(
     name="rotary_emb_fwd:v1",
-    configs=get_test_configs(),
+    configs=get_test_configs,
     static_key_func=get_static_key,
     run_key_func=lambda q: str(nearest_power_of_2(q.shape[0])),
 )
+@torch.no_grad()
 def rotary_emb_fwd(q, k, cos, sin, run_config=None):
     total_len = q.shape[0]
     head_num_q, head_num_k = q.shape[1], k.shape[1]
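
The added @torch.no_grad() sits below @autotune, and decorators apply bottom-up, so the function handed to the autotuner is already wrapped in the no-grad context; any benchmarking the outer decorator performs also runs with gradients disabled. A minimal sketch of that ordering with a stand-in outer decorator (outer_decorator and check_grad_mode are hypothetical, not lightllm code):

import functools

import torch


def outer_decorator(fn):
    # Stand-in for @autotune: `fn` received here is already the no-grad-wrapped
    # function, so calling it (e.g. to benchmark it) keeps gradients off.
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        return fn(*args, **kwargs)
    return wrapper


@outer_decorator
@torch.no_grad()
def check_grad_mode() -> bool:
    return torch.is_grad_enabled()


assert check_grad_mode() is False  # grad mode is disabled inside the wrapped call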

lightllm/utils/envs_utils.py

Lines changed: 10 additions & 0 deletions
@@ -149,6 +149,16 @@ def get_kv_quant_calibration_inference_count():
     return int(os.getenv("LIGHTLLM_KV_QUANT_CALIBRARTION_INFERENCE_COUNT", 4000))
 
 
+def is_triton_autotune_enabled():
+    # Whether Triton autotune is enabled (read-only check)
+    return os.environ.get("LIGHTLLM_TRITON_AUTOTUNE", "0") == "1"
+
+
+def disable_triton_autotune():
+    # Disable Triton autotune (setter)
+    os.environ["LIGHTLLM_TRITON_AUTOTUNE"] = "0"
+
+
 g_model_init_done = False
 
 
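
The two helpers are verbatim moves from autotuner.py (see the deletion above), which is what lets autotuner.py import them from envs_utils. A small usage snippet, assuming a lightllm checkout with this commit applied; the enable-then-disable scenario itself is hypothetical:

import os

from lightllm.utils.envs_utils import disable_triton_autotune, is_triton_autotune_enabled

# Hypothetical scenario: enable autotune for a warmup pass, then switch it off.
os.environ["LIGHTLLM_TRITON_AUTOTUNE"] = "1"
assert is_triton_autotune_enabled()

disable_triton_autotune()
assert not is_triton_autotune_enabled()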
