
Commit f7f8a54
clean code
1 parent e4ce003

4 files changed: 37 additions, 26 deletions

lightllm/common/basemodel/basemodel.py

Lines changed: 4 additions & 4 deletions
@@ -27,7 +27,8 @@
 from lightllm.common.basemodel.batch_objs import ModelInput, ModelOutput
 from lightllm.common.triton_utils.autotuner import AutotuneLevel
 from lightllm.utils.custom_kernel_utis import pad2dim_tensor_to_new_batch
-from lightllm.utils.envs_utils import set_model_init_status, set_triton_autotune_level, set_triton_autotune_warmup
+from lightllm.utils.envs_utils import set_model_init_status, set_triton_autotune_level, get_triton_autotune_level
+from lightllm.common.triton_utils.autotuner import Autotuner
 from lightllm.utils.infer_utils import post_empty_cache

 logger = init_logger(__name__)
@@ -733,7 +734,7 @@ def autotune_layers(self):
     @torch.no_grad()
     @post_empty_cache
     def _autotune_warmup(self):
-        set_triton_autotune_warmup(1)
+        Autotuner.start_autotune_warmup()
         torch.distributed.barrier()

         warmup_lengths = [1, 8, 16, 32, 64, 100, 128, 256, 1024, 2048, 4096]
@@ -792,8 +793,7 @@ def _autotune_warmup(self):
         torch.cuda.empty_cache()
         self.layers_num = layer_num_bak
         torch.distributed.barrier()
-        set_triton_autotune_level(AutotuneLevel.USE_AUTOTUNE_HIS_CONFIG)
-        set_triton_autotune_warmup(0)
+        Autotuner.end_autotune_warmup()

     @final
     @torch.no_grad()

lightllm/common/fused_moe/grouped_fused_moe.py

Lines changed: 1 addition & 1 deletion
@@ -576,7 +576,7 @@ def grouped_matmul(
     if block_size_k != 0:
         # If block-wise quantization is used, the tile size must not exceed the block size
         BLOCK_SIZE_K = min(BLOCK_SIZE_K, block_size_k)
-        BLOCK_SIZE_K = triton.next_power_of_2(BLOCK_SIZE_K)
+        assert BLOCK_SIZE_K == triton.next_power_of_2(BLOCK_SIZE_K)

     if use_fp8_w8a8:
         # When weights use block-wise quantization, activations are also quantized per token with group size
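
Note on this hunk: the old line rounded BLOCK_SIZE_K up to the next power of two, while the new line only asserts that it already is one. That holds as long as both the candidate BLOCK_SIZE_K and the quantization block_size_k are powers of two, since min() of two powers of two is itself a power of two. A standalone sanity check of that reasoning, using illustrative values that are not part of the commit:

import triton

def is_pow2(x: int) -> bool:
    # True when x is a positive power of two.
    return x > 0 and (x & (x - 1)) == 0

for block_size_k in (64, 128):          # typical block-wise quantization group sizes (assumed)
    for BLOCK_SIZE_K in (32, 64, 128):  # candidate tile sizes (assumed)
        k = min(BLOCK_SIZE_K, block_size_k)
        assert is_pow2(k)
        assert k == triton.next_power_of_2(k)  # the condition the kernel now asserts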

lightllm/common/triton_utils/autotuner.py

Lines changed: 31 additions & 7 deletions
@@ -77,6 +77,22 @@ def decorator(fn):


 class Autotuner:
+    _autotune_warmup: bool = False
+
+    @staticmethod
+    def start_autotune_warmup():
+        Autotuner._autotune_warmup = True
+        return
+
+    @staticmethod
+    def end_autotune_warmup():
+        Autotuner._autotune_warmup = False
+        return
+
+    @staticmethod
+    def is_autotune_warmup():
+        return Autotuner._autotune_warmup
+
     def __init__(
         self,
         fn,
@@ -139,12 +155,12 @@ def __call__(self, *args, **kwargs):
         run_key = str(self._run_key(*args, **kwargs))

         # Lazy load the cached configs in lightllm/common/triton_utils/autotune_kernel_configs
-        if self._try_load_cache(static_key) or get_triton_autotune_warmup():
+        if self._try_load_cache(static_key) or Autotuner.is_autotune_warmup():
             all_configs = self.cached_configs.get(static_key, {})
             for run_config in all_configs.values():
-                # warmup
+                # warmup all configs
                 kwargs["run_config"] = run_config
-                self._bench(*args, n_repeat=1, n_retries=1, warmup=True, **kwargs)
+                self.kernel_warmup(*args, **kwargs)

         if static_key not in self.cached_configs and autotune_level == AutotuneLevel.USE_AUTOTUNE_HIS_CONFIG:
             if (dist.is_initialized() and get_current_rank_in_node() == 0) or not dist.is_initialized():
@@ -199,7 +215,18 @@ def _try_load_cache(self, static_key):
             self.cached_configs[static_key] = orjson.loads(f.read())
             return True

-    def _bench(self, *args, n_repeat=3, n_retries=3, warmup=False, **kwargs):
+    def kernel_warmup(self, *args, **kwargs):
+        new_args, new_kwargs, origin_list, new_list = self._mutate_args_clone(args, kwargs)
+
+        try:
+            self.fn(*new_args, **new_kwargs)
+        except:
+            pass
+        finally:
+            self._recover_mutated_args(origin_list=origin_list, new_list=new_list)
+        return
+
+    def _bench(self, *args, n_repeat=3, n_retries=3, **kwargs):
         from triton.compiler.errors import CompileTimeAssertionFailure
         from triton.runtime.errors import OutOfResources, PTXASError

@@ -209,16 +236,13 @@ def kernel_call():
             try:
                 self.fn(*new_args, **new_kwargs)
             except Exception as e:
-                print(f"error: {e}")
                 raise e
             finally:
                 self._recover_mutated_args(origin_list=origin_list, new_list=new_list)

         try:
             # warmup
             kernel_call()
-            if warmup:
-                return

             torch.cuda.current_stream().synchronize()
             g = torch.cuda.CUDAGraph()
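
Taken together, these hunks replace the LIGHTLLM_TRITON_AUTOTUNE_WARMUP environment variable with a process-local class flag: callers bracket their warmup passes with start_autotune_warmup()/end_autotune_warmup(), and while the flag is set, Autotuner.__call__ replays every cached run_config through kernel_warmup instead of benchmarking. A rough usage sketch; the warmup_model wrapper, model object, and warmup_batches are hypothetical, only the Autotuner class methods come from this commit:

from lightllm.common.triton_utils.autotuner import Autotuner

def warmup_model(model, warmup_batches):
    # Hypothetical wrapper: bracket warmup forwards with the new class-level flag.
    Autotuner.start_autotune_warmup()
    try:
        for batch in warmup_batches:
            # Each forward pass hits the autotuned Triton kernels; with the flag set,
            # Autotuner.__call__ warms every cached run_config via kernel_warmup().
            model.forward(batch)
    finally:
        # Restore normal behavior: kernels run with their cached best config.
        Autotuner.end_autotune_warmup()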

lightllm/utils/envs_utils.py

Lines changed: 1 addition & 14 deletions
@@ -149,24 +149,11 @@ def get_kv_quant_calibration_inference_count():
     return int(os.getenv("LIGHTLLM_KV_QUANT_CALIBRARTION_INFERENCE_COUNT", 4000))


+@lru_cache(maxsize=None)
 def get_triton_autotune_level():
     return int(os.getenv("LIGHTLLM_TRITON_AUTOTUNE_LEVEL", 0))


-def set_triton_autotune_level(level: int):
-    os.environ["LIGHTLLM_TRITON_AUTOTUNE_LEVEL"] = str(level)
-    return
-
-
-def set_triton_autotune_warmup(warmup: int):
-    os.environ["LIGHTLLM_TRITON_AUTOTUNE_WARMUP"] = str(warmup)
-    return
-
-
-def get_triton_autotune_warmup():
-    return os.getenv("LIGHTLLM_TRITON_AUTOTUNE_WARMUP", "0") == "1"
-
-
 g_model_init_done = False

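
With the runtime setters removed, LIGHTLLM_TRITON_AUTOTUNE_LEVEL is no longer mutated after startup, which is presumably why the getter can now be memoized with @lru_cache. A minimal standalone illustration of that caching behavior (not lightllm code):

import os
from functools import lru_cache

@lru_cache(maxsize=None)
def get_level() -> int:
    # Reads the env var once; later calls return the memoized value.
    return int(os.getenv("LIGHTLLM_TRITON_AUTOTUNE_LEVEL", 0))

os.environ["LIGHTLLM_TRITON_AUTOTUNE_LEVEL"] = "2"
assert get_level() == 2
os.environ["LIGHTLLM_TRITON_AUTOTUNE_LEVEL"] = "3"  # a later change is ignored
assert get_level() == 2                             # because the first result is cached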
