Force warmup of Triton autotune configs at startup. (#1043)

sufubao · hiworldwzj · web-flow · commit 64756173201c · 2025-09-05T17:16:00.000+08:00
Co-authored-by: hiworldwzj <30762946+hiworldwzj@users.noreply.github.com>
diff --git a/lightllm/common/basemodel/basemodel.py b/lightllm/common/basemodel/basemodel.py
@@ -7,6 +7,7 @@
 import torch
 import torch.nn.functional as F
 from typing import final
+from tqdm import tqdm
 
 from lightllm.common.basemodel.layer_weights.hf_load_utils import load_hf_weights
 from lightllm.common.basemodel.infer_struct import InferStateInfo
@@ -26,7 +27,8 @@
 from lightllm.common.basemodel.batch_objs import ModelInput, ModelOutput
 from lightllm.common.triton_utils.autotuner import AutotuneLevel
 from lightllm.utils.custom_kernel_utis import pad2dim_tensor_to_new_batch
-from lightllm.utils.envs_utils import set_model_init_status, set_triton_autotune_level, get_triton_autotune_level
+from lightllm.utils.envs_utils import set_model_init_status
+from lightllm.common.triton_utils.autotuner import Autotuner
 from lightllm.utils.infer_utils import post_empty_cache
 
 logger = init_logger(__name__)
@@ -732,9 +734,7 @@ def autotune_layers(self):
     @torch.no_grad()
     @post_empty_cache
     def _autotune_warmup(self):
-        if get_triton_autotune_level() not in [AutotuneLevel.ADAPTIVE_AUTOTUNE, AutotuneLevel.FORCE_AUTOTUNE]:
-            return
-
+        Autotuner.start_autotune_warmup()
         torch.distributed.barrier()
 
         warmup_lengths = [1, 8, 16, 32, 64, 100, 128, 256, 1024, 2048, 4096]
@@ -748,9 +748,8 @@ def _autotune_warmup(self):
 
         layer_num_bak = self.layers_num
         self.layers_num = self.autotune_layers()
-        for input_len in warmup_lengths:
+        for input_len in tqdm(warmup_lengths, desc="warming up"):
             try:
-                logger.info(f"autotune warmup for length {input_len}")
                 rand_gen = torch.Generator(device="cuda")
                 rand_gen.manual_seed(input_len)
                 dummy_input_ids = torch.randint(
@@ -785,7 +784,6 @@ def _autotune_warmup(self):
                 self.mem_manager.free_all()
                 gc.collect()
                 torch.cuda.empty_cache()
-                logger.info(f"autotune warmup for length {input_len} ok")
             except Exception as e:
                 logger.warning(f"autotune warmup for length {input_len} failed: {str(e)}")
                 logger.exception(str(e))
@@ -795,7 +793,7 @@ def _autotune_warmup(self):
                 torch.cuda.empty_cache()
         self.layers_num = layer_num_bak
         torch.distributed.barrier()
-        set_triton_autotune_level(AutotuneLevel.USE_AUTOTUNE_HIS_CONFIG)
+        Autotuner.end_autotune_warmup()
 
     @final
     @torch.no_grad()
diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_ep.py b/lightllm/common/basemodel/layer_weights/meta_weights/fused_moe_weight_ep.py
@@ -21,9 +21,9 @@
 )
 from lightllm.common.fused_moe.deepep_scatter_gather import ep_scatter, ep_gather
 from lightllm.common.basemodel.triton_kernel.redundancy_topk_ids_repair import redundancy_topk_ids_repair
-from lightllm.utils.envs_utils import get_triton_autotune_level
-from lightllm.common.triton_utils.autotuner import AutotuneLevel
 from lightllm.utils.log_utils import init_logger
+from lightllm.common.triton_utils.autotuner import Autotuner
+
 
 logger = init_logger(__name__)
 
@@ -356,7 +356,7 @@ def prefilled_group_gemm(
             ######################################## warning ##################################################
             # here is used to match autotune feature, make moe model run same triton kernel in different rank.
             # in some special case, one rank will recv 0 token, so add a token to make it run triton kernel.
-            if get_triton_autotune_level() in [AutotuneLevel.ADAPTIVE_AUTOTUNE, AutotuneLevel.FORCE_AUTOTUNE]:
+            if Autotuner.is_autotune_warmup():
                 _gemm_out_a = torch.zeros((1, N), device=device, dtype=hidden_dtype)
                 _silu_out = torch.zeros((1, N // 2), device=device, dtype=hidden_dtype)
                 silu_and_mul_fwd(_gemm_out_a.view(-1, N), _silu_out)
diff --git a/lightllm/common/fused_moe/grouped_fused_moe_ep.py b/lightllm/common/fused_moe/grouped_fused_moe_ep.py
@@ -14,8 +14,7 @@
 )
 from lightllm.common.fused_moe.deepep_scatter_gather import ep_scatter, ep_gather
 from lightllm.utils.envs_utils import get_deepep_num_max_dispatch_tokens_per_rank
-from lightllm.utils.envs_utils import get_triton_autotune_level
-from lightllm.common.triton_utils.autotuner import AutotuneLevel
+from lightllm.common.triton_utils.autotuner import Autotuner
 import numpy as np
 
 logger = init_logger(__name__)
@@ -190,7 +189,7 @@ def fused_experts_impl(
             ######################################## warning ##################################################
             # here is used to match autotune feature, make moe model run same triton kernel in different rank.
             # in some special case, one rank will recv 0 token, so add a token to make it run triton kernel.
-            if get_triton_autotune_level() in [AutotuneLevel.ADAPTIVE_AUTOTUNE, AutotuneLevel.FORCE_AUTOTUNE]:
+            if Autotuner.is_autotune_warmup():
                 _gemm_out_a = torch.zeros((1, N), device=hidden_states.device, dtype=hidden_states.dtype)
                 _silu_out = torch.zeros((1, N // 2), device=hidden_states.device, dtype=hidden_states.dtype)
                 silu_and_mul_fwd(_gemm_out_a.view(-1, N), _silu_out)
diff --git a/lightllm/common/fused_moe/topk_select.py b/lightllm/common/fused_moe/topk_select.py
@@ -23,8 +23,7 @@
 from lightllm.utils.light_utils import light_ops
 from typing import Callable, List, Optional, Tuple
 from lightllm.common.fused_moe.softmax_topk import softmax_topk
-from lightllm.common.triton_utils.autotuner import AutotuneLevel
-from lightllm.utils.envs_utils import get_triton_autotune_level
+from lightllm.common.triton_utils.autotuner import Autotuner
 
 use_cuda_grouped_topk = os.getenv("LIGHTLLM_CUDA_GROUPED_TOPK", "False").upper() in ["ON", "TRUE", "1"]
 
@@ -225,7 +224,7 @@ def select_experts(
 
     ######################################## warning ##################################################
     # here is used to match autotune feature, make topk_ids more random
-    if get_triton_autotune_level() in [AutotuneLevel.ADAPTIVE_AUTOTUNE, AutotuneLevel.FORCE_AUTOTUNE]:
+    if Autotuner.is_autotune_warmup():
         rand_gen = torch.Generator(device="cuda")
         rand_gen.manual_seed(router_logits.shape[0])
         router_logits = torch.randn(size=router_logits.shape, generator=rand_gen, dtype=torch.float32, device="cuda")
diff --git a/lightllm/common/triton_utils/autotuner.py b/lightllm/common/triton_utils/autotuner.py
@@ -77,6 +77,22 @@ def decorator(fn):
 
 
 class Autotuner:
+    _autotune_warmup: bool = False
+
+    @staticmethod
+    def start_autotune_warmup():
+        Autotuner._autotune_warmup = True
+        return
+
+    @staticmethod
+    def end_autotune_warmup():
+        Autotuner._autotune_warmup = False
+        return
+
+    @staticmethod
+    def is_autotune_warmup():
+        return Autotuner._autotune_warmup
+
     def __init__(
         self,
         fn,
@@ -104,6 +120,7 @@ def __init__(
         self.run_key_distance_func = run_key_distance_func
         self.cached_configs = {}
         self.fast_match_configs = collections.defaultdict(dict)
+        self.warmuped_configs_set = set()
         self.arg_names = [param.name for param in inspect.signature(self.fn).parameters.values()]
         self._argname_to_pos = {name: idx for idx, name in enumerate(self.arg_names)}
         self._pos_to_argname = {idx: name for idx, name in enumerate(self.arg_names)}
@@ -139,7 +156,13 @@ def __call__(self, *args, **kwargs):
         run_key = str(self._run_key(*args, **kwargs))
 
         # Lazy load the cached configs in lightllm/common/triton_utils/autotune_kernel_configs
-        self._try_load_cache(static_key)
+        if self._try_load_cache(static_key) or Autotuner.is_autotune_warmup():
+            all_configs = self.cached_configs.get(static_key, {})
+            for run_config in all_configs.values():
+                # warmup all configs
+                _copy_kwargs = kwargs.copy()
+                _copy_kwargs["run_config"] = run_config
+                self.kernel_warmup(static_key, *args, **_copy_kwargs)
 
         if static_key not in self.cached_configs and autotune_level == AutotuneLevel.USE_AUTOTUNE_HIS_CONFIG:
             if (dist.is_initialized() and get_current_rank_in_node() == 0) or not dist.is_initialized():
@@ -150,7 +173,10 @@ def __call__(self, *args, **kwargs):
                 )
             self.cached_configs[static_key] = {}
 
-        if autotune_level in [AutotuneLevel.ADAPTIVE_AUTOTUNE, AutotuneLevel.FORCE_AUTOTUNE]:
+        if (
+            autotune_level in [AutotuneLevel.ADAPTIVE_AUTOTUNE, AutotuneLevel.FORCE_AUTOTUNE]
+            and Autotuner.is_autotune_warmup()
+        ):
             need_tuning = (autotune_level == AutotuneLevel.FORCE_AUTOTUNE) or (
                 run_key not in self.cached_configs.get(static_key, {})
             )
@@ -185,13 +211,28 @@ def __call__(self, *args, **kwargs):
 
     def _try_load_cache(self, static_key):
         if static_key in self.cached_configs:
-            return
+            return False
 
         cache_file = os.path.join(self.cache_dir, KernelConfigs.get_config_file_name(static_key))
         if os.path.exists(cache_file):
             logger.info(f"Loading cached configs for {self.kernel_name} - {static_key}")
             with open(cache_file, "rb") as f:
                 self.cached_configs[static_key] = orjson.loads(f.read())
+        return True
+
+    def kernel_warmup(self, static_key, *args, **kwargs):
+        new_args, new_kwargs, origin_list, new_list = self._mutate_args_clone(args, kwargs)
+        run_config = kwargs.get("run_config", {})
+        hash_key = str(frozendict(run_config)) + str(static_key)
+        if hash_key in self.warmuped_configs_set:
+            return
+        try:
+            self.fn(*new_args, **new_kwargs)
+            self.warmuped_configs_set.add(hash_key)
+        except:
+            pass
+        finally:
+            self._recover_mutated_args(origin_list=origin_list, new_list=new_list)
         return
 
     def _bench(self, *args, n_repeat=3, n_retries=3, **kwargs):
diff --git a/lightllm/utils/envs_utils.py b/lightllm/utils/envs_utils.py
@@ -149,15 +149,11 @@ def get_kv_quant_calibration_inference_count():
     return int(os.getenv("LIGHTLLM_KV_QUANT_CALIBRARTION_INFERENCE_COUNT", 4000))
 
 
+@lru_cache(maxsize=None)
 def get_triton_autotune_level():
     return int(os.getenv("LIGHTLLM_TRITON_AUTOTUNE_LEVEL", 0))
 
 
-def set_triton_autotune_level(level: int):
-    os.environ["LIGHTLLM_TRITON_AUTOTUNE_LEVEL"] = str(level)
-    return
-
-
 g_model_init_done = False