
Commit d6695ef

add AutotuneLevel for more detailed autotune
1 parent 237ae00

File tree

8 files changed: +88 additions, -40 deletions

lightllm/common/basemodel/basemodel.py
lightllm/common/fused_moe/grouped_fused_moe.py
lightllm/common/fused_moe/moe_silu_and_mul.py
lightllm/common/fused_moe/moe_sum_reduce.py
lightllm/common/quantization/triton_quant/fp8/fp8w8a8_block_gemm_kernel.py
lightllm/common/triton_utils/autotuner.py
lightllm/models/deepseek2/triton_kernel/rotary_emb.py
lightllm/utils/envs_utils.py

lightllm/common/basemodel/basemodel.py

Lines changed: 5 additions & 3 deletions
@@ -24,8 +24,9 @@
 from lightllm.utils.envs_utils import get_env_start_args
 from lightllm.distributed.communication_op import dist_group_manager
 from lightllm.common.basemodel.batch_objs import ModelInput, ModelOutput
+from lightllm.common.triton_utils.autotuner import AutotuneLevel
 from lightllm.utils.custom_kernel_utis import pad2dim_tensor_to_new_batch
-from lightllm.utils.envs_utils import set_model_init_status, is_triton_autotune_enabled, disable_triton_autotune
+from lightllm.utils.envs_utils import set_model_init_status, set_triton_autotune_level, get_triton_autotune_level
 from lightllm.utils.infer_utils import post_empty_cache

 logger = init_logger(__name__)
@@ -731,7 +732,7 @@ def autotune_layers(self):
     @torch.no_grad()
     @post_empty_cache
     def _autotune_warmup(self):
-        if not is_triton_autotune_enabled():
+        if get_triton_autotune_level() in [AutotuneLevel.NO_AUTOTUNE, AutotuneLevel.CLOSE_AUTOTUNE]:
             return

         torch.distributed.barrier()
@@ -794,7 +795,8 @@ def _autotune_warmup(self):
         torch.cuda.empty_cache()
         self.layers_num = layer_num_bak
         torch.distributed.barrier()
-        disable_triton_autotune()
+        if get_triton_autotune_level() not in [AutotuneLevel.AUTOTUNE_RUNTIME, AutotuneLevel.AUTOTUNE_RUNTIME_OVERWRITE]:
+            set_triton_autotune_level(AutotuneLevel.NO_AUTOTUNE)

     @final
     @torch.no_grad()

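Note on the warmup gating above: warmup-time tuning is skipped for NO_AUTOTUNE and CLOSE_AUTOTUNE, and once warmup finishes every non-runtime level is dropped to NO_AUTOTUNE so serving traffic only reads cached configs, while the two runtime levels stay live. A minimal sketch of that decision logic (the helper functions here are hypothetical, added only to restate the two checks; they are not part of the commit):

from lightllm.common.triton_utils.autotuner import AutotuneLevel


def should_run_warmup(level: int) -> bool:
    # Mirrors the early return added in _autotune_warmup.
    return level not in (AutotuneLevel.NO_AUTOTUNE, AutotuneLevel.CLOSE_AUTOTUNE)


def level_after_warmup(level: int) -> int:
    # Mirrors the tail of _autotune_warmup: the runtime levels keep tuning enabled,
    # every other level falls back to NO_AUTOTUNE (cache-only) after warmup.
    if level in (AutotuneLevel.AUTOTUNE_RUNTIME, AutotuneLevel.AUTOTUNE_RUNTIME_OVERWRITE):
        return level
    return AutotuneLevel.NO_AUTOTUNE
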
lightllm/common/fused_moe/grouped_fused_moe.py

Lines changed: 2 additions & 2 deletions
@@ -35,7 +35,7 @@
 from .moe_sum_reduce import moe_sum_reduce
 from lightllm.common.quantization.triton_quant.fp8.fp8act_quant_kernel import per_token_group_quant_fp8
 from lightllm.utils.torch_ops_utils import direct_register_custom_op
-from lightllm.common.triton_utils.autotuner import autotune
+from lightllm.common.triton_utils.autotuner import autotune, closest_pow_of_2

 FFN_MOE_CHUNK_SIZE = 32 * 1024

@@ -492,7 +492,7 @@ def _get_grouped_matmul_configs():
     kernel_name="grouped_matmul:v1",
     configs_gen_func=_get_grouped_matmul_configs,
     static_key_func=_get_grouped_matmul_static_key,
-    run_key_func=lambda token_inputs: token_inputs.shape[0],
+    run_key_func=lambda token_inputs: closest_pow_of_2(token_inputs.shape[0]),
     mutates_args=["out"],
 )
 def grouped_matmul(

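Note on the run_key change above (the kernels below receive the same change): instead of keying tuned configs on the exact token count, the run key is bucketed with closest_pow_of_2, which this commit adds in lightllm/common/triton_utils/autotuner.py, so nearby batch sizes share one cached entry rather than each token count producing its own lookup and tuning pass. A quick illustrative check of the bucketing, restating the formula from that file:

import triton


def closest_pow_of_2(x):
    # Same formula the commit adds to the autotuner: round x to the nearest
    # power of two, with the cutover at three quarters of the next power.
    return triton.next_power_of_2(x - triton.next_power_of_2(x) // 4)


# Expected buckets: 20 -> 16, 48 -> 32, 49 -> 64, 100 -> 128
for n in (20, 48, 49, 100):
    print(n, "->", closest_pow_of_2(n))
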
lightllm/common/fused_moe/moe_silu_and_mul.py

Lines changed: 2 additions & 2 deletions
@@ -3,7 +3,7 @@
 import triton
 import triton.language as tl
 from .moe_silu_and_mul_config import MoeSiluAndMulKernelConfig
-from lightllm.common.triton_utils.autotuner import autotune
+from lightllm.common.triton_utils.autotuner import autotune, closest_pow_of_2


 @triton.jit
@@ -81,7 +81,7 @@ def _get_silu_and_mul_static_key(input: torch.Tensor, output: torch.Tensor):
     kernel_name="silu_and_mul_fwd:v1",
     configs_gen_func=_get_silu_and_mul_configs,
     static_key_func=_get_silu_and_mul_static_key,
-    run_key_func=lambda input: input.shape[0],
+    run_key_func=lambda input: closest_pow_of_2(input.shape[0]),
     mutates_args=["output"],
 )
 def silu_and_mul_fwd(input: torch.Tensor, output: torch.Tensor, run_config=None):

lightllm/common/fused_moe/moe_sum_reduce.py

Lines changed: 2 additions & 2 deletions
@@ -4,7 +4,7 @@
 import triton.language as tl
 from .moe_sum_recude_config import MoeSumReduceKernelConfig
 from typing import Any, Callable, Dict, Optional, Tuple
-from lightllm.common.triton_utils.autotuner import autotune
+from lightllm.common.triton_utils.autotuner import autotune, closest_pow_of_2


 @triton.jit
@@ -66,7 +66,7 @@ def _get_moe_sum_reduce_configs():
     kernel_name="moe_sum_reduce:v1",
     configs_gen_func=_get_moe_sum_reduce_configs,
     static_key_func=_get_moe_sum_reduce_static_key,
-    run_key_func=lambda input: input.shape[0],
+    run_key_func=lambda input: closest_pow_of_2(input.shape[0]),
     mutates_args=["output"],
 )
 def moe_sum_reduce(input: torch.Tensor, output: torch.Tensor, run_config: Dict = None):

lightllm/common/quantization/triton_quant/fp8/fp8w8a8_block_gemm_kernel.py

Lines changed: 2 additions & 2 deletions
@@ -7,7 +7,7 @@
 from functools import lru_cache
 from typing import Any, Dict, List, Optional, Tuple
 from triton import Config
-from lightllm.common.triton_utils.autotuner import autotune
+from lightllm.common.triton_utils.autotuner import autotune, closest_pow_of_2


 class Fp8BlockMMKernelConfig(KernelConfigs):
@@ -180,7 +180,7 @@ def _get_static_key(A, B, block_size, dtype):
     kernel_name="w8a8_block_fp8_matmul:v1",
     configs_gen_func=get_test_configs,
     static_key_func=_get_static_key,
-    run_key_func=lambda A: A.shape[0],
+    run_key_func=lambda A: closest_pow_of_2(A.shape[0]),
     mutates_args=["C"],
 )
 def w8a8_block_fp8_matmul(

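Note on the bucketed run keys used above: when a bucketed key has no exact cached entry, the Autotuner (next file) falls back to run_key_distance_func and picks the nearest cached key. A small illustrative sketch of that fallback with made-up cache contents (the keys and config values here are hypothetical):

# Hypothetical cached entries for one static key; mirrors the
# min(..., key=run_key_distance_func) fallback in Autotuner.__call__.
cached = {"16": {"BLOCK_M": 64}, "128": {"BLOCK_M": 128}}
run_key = "48"  # bucketed token count with no exact match


def distance(run_key, config_key):
    # The decorator's default run_key_distance_func.
    return abs(int(run_key) - int(config_key))


closest_key, closest_config = min(cached.items(), key=lambda item: distance(run_key, item[0]))
print(closest_key, closest_config)  # prints: 16 {'BLOCK_M': 64} -- the nearest cached bucket wins
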
lightllm/common/triton_utils/autotuner.py

Lines changed: 68 additions & 20 deletions
@@ -12,14 +12,33 @@
 from lightllm.utils.device_utils import get_current_device_name
 from lightllm.utils.log_utils import init_logger
 from typing import Callable, Optional, Union, List
-from lightllm.utils.envs_utils import is_triton_autotune_enabled
+from lightllm.utils.envs_utils import get_triton_autotune_level
 from lightllm.common.kernel_config import KernelConfigs
 from lightllm.utils.dist_utils import get_global_world_size, get_global_rank, get_current_rank_in_node
-from lightllm.distributed.communication_op import dist_group_manager

 logger = init_logger(__name__)


+def _get_autotune_group():
+    from lightllm.distributed.communication_op import dist_group_manager
+    return dist_group_manager.get_default_group().autotune_group
+
+
+class AutotuneLevel:
+    # Do not autotune; only use configs from cached files.
+    NO_AUTOTUNE = 0
+    # Autotune if no config is cached.
+    AUTOTUNE = 1
+    # Always autotune and overwrite the cached config files.
+    AUTOTUNE_OVERWRITE = 2
+    # Autotune at runtime to keep searching for better configs.
+    AUTOTUNE_RUNTIME = 3
+    # Autotune at runtime to keep searching for better configs and overwrite the cached config files.
+    AUTOTUNE_RUNTIME_OVERWRITE = 4
+    # Disable autotune and do not use cached config files.
+    CLOSE_AUTOTUNE = 5
+
+
 def autotune(
     kernel_name: str,
     configs_gen_func: Callable[[], List],
@@ -28,6 +47,29 @@ def autotune(
     run_key_distance_func: Callable = lambda run_key, config_key: abs(int(run_key) - int(config_key)),
     mutates_args: List[str] = [],
 ):
+    """Decorator that constructs and returns an Autotuner wrapper for a Triton kernel.
+
+    This decorator configures an Autotuner with the provided configuration
+    generator and key functions, enabling on-demand benchmarking and caching
+    of kernel run configurations across runs and processes.
+
+    Args:
+        kernel_name (str): Human-readable kernel name used for logging and cache paths.
+        configs_gen_func (Callable[[], List]): Function that returns candidate run configurations.
+        static_key_func (Callable): Function that derives a static key (dict-like) from call arguments.
+            This key identifies the cache file that stores tuned configs.
+        run_key_func (Callable): Function that derives a run-time key from call arguments.
+            This key indexes tuned configs within a static key's cache.
+        run_key_distance_func (Callable, optional): Distance metric taking ``(run_key, config_key)`` and
+            returning a comparable value; used to pick the closest config when an exact match is absent.
+            Defaults to ``abs(int(run_key) - int(config_key))``.
+        mutates_args (List[str], optional): Names of arguments that can be mutated by the kernel.
+            During benchmarking, defensive clones are made to avoid side effects. Defaults to ``[]``.
+
+    Returns:
+        Callable: A callable object that wraps the original function and performs autotuning
+            as needed before invocation.
+    """
     def decorator(fn):
         return Autotuner(
             fn=fn,
@@ -53,8 +95,7 @@ def __init__(
         run_key_distance_func: Callable = lambda run_key, config_key: abs(int(run_key) - int(config_key)),
         mutates_args: List[str] = [],
     ):
-        # Whether to use this autotune decorator
-        self.disable_autotune = not is_triton_autotune_enabled()
+        self.autotune_level = get_triton_autotune_level()

         self.configs_gen_func = configs_gen_func
         self.kernel_name = kernel_name
@@ -65,7 +106,6 @@ def __init__(
             get_current_device_name(),
             self.kernel_name,
         )
-        os.makedirs(self.cache_dir, exist_ok=True)
         self.fn = fn
         self.static_key_func = static_key_func
         self.run_key_func = run_key_func
@@ -81,38 +121,42 @@ def __init__(
         ]
         self._run_key_func_param_names = [name for name, _ in inspect.signature(self.run_key_func).parameters.items()]
         self.mutates_args = mutates_args
+
+        assert self.autotune_level in [AutotuneLevel.NO_AUTOTUNE, AutotuneLevel.AUTOTUNE, AutotuneLevel.AUTOTUNE_OVERWRITE, AutotuneLevel.AUTOTUNE_RUNTIME, AutotuneLevel.AUTOTUNE_RUNTIME_OVERWRITE, AutotuneLevel.CLOSE_AUTOTUNE]
         return

     @torch.no_grad()
     def __call__(self, *args, **kwargs):
         if kwargs.get("run_config", None) is not None:
             return self.fn(*args, **kwargs)

-        if self.disable_autotune:
+        # If autotune_level is AutotuneLevel.CLOSE_AUTOTUNE, skip autotuning entirely.
+        if self.autotune_level == AutotuneLevel.CLOSE_AUTOTUNE:
             return self.fn(*args, **kwargs)

         rank_id = 0 if not dist.is_initialized() else get_global_rank()
         world_size = 1 if not dist.is_initialized() else get_global_world_size()

-        static_key = self._static_key(*args, **kwargs)
+        static_key = frozendict(self._static_key(*args, **kwargs))
         run_key = str(self._run_key(*args, **kwargs))

-        # Lazy load
-        self._try_load_cache(static_key)
+        # Lazily load the cached configs from lightllm/common/triton_utils/autotune_kernel_configs.
+        if self.autotune_level not in [AutotuneLevel.AUTOTUNE_OVERWRITE, AutotuneLevel.AUTOTUNE_RUNTIME_OVERWRITE]:
+            self._try_load_cache(static_key)

-        if static_key not in self.cached_configs:
+        if static_key not in self.cached_configs and self.autotune_level == AutotuneLevel.NO_AUTOTUNE:
             if (dist.is_initialized() and get_current_rank_in_node() == 0) or not dist.is_initialized():
                 logger.warning(
                     f"No kernel config for {self.kernel_name} in {KernelConfigs.get_config_file_name(static_key)}",
                 )
             self.cached_configs[static_key] = {}

-        if is_triton_autotune_enabled():
+        if self.autotune_level != AutotuneLevel.NO_AUTOTUNE:
             need_tunning = run_key not in self.cached_configs.get(static_key, {})
             if world_size > 1:
                 _need_tunnings = [None for _ in range(world_size)]
                 dist.all_gather_object(
-                    _need_tunnings, obj=need_tunning, group=dist_group_manager.get_default_group().autotune_group
+                    _need_tunnings, obj=need_tunning, group=_get_autotune_group()
                 )
                 need_tunning = any(_need_tunnings)
             if need_tunning:
@@ -125,12 +169,12 @@ def __call__(self, *args, **kwargs):
                     world_size=world_size,
                 )

-        if static_key in self.fast_match_configs and run_key in self.fast_match_configs[static_key]:
-            closest_config = self.fast_match_configs[static_key][run_key]
-            kwargs["run_config"] = closest_config
+        fast_for_key = self.fast_match_configs.get(static_key)
+        if fast_for_key is not None and run_key in fast_for_key:
+            kwargs["run_config"] = fast_for_key[run_key]
             return self.fn(*args, **kwargs)

-        all_configs = self.cached_configs.get(static_key)
+        all_configs = self.cached_configs.get(static_key, {})
         if len(all_configs) != 0:
             closest_config = min(
                 list(all_configs.items()), key=lambda item: self.run_key_distance_func(run_key, item[0])
@@ -146,6 +190,7 @@ def _try_load_cache(self, static_key):

         cache_file = os.path.join(self.cache_dir, KernelConfigs.get_config_file_name(static_key))
         if os.path.exists(cache_file):
+            logger.info(f"Loading cached configs for {self.kernel_name} - {static_key}")
             with open(cache_file, "rb") as f:
                 self.cached_configs[static_key] = orjson.loads(f.read())
             return
@@ -195,7 +240,7 @@ def _autotune(self, args, kwargs, static_key, run_key, rank_id, world_size):
         all_keys = [None for _ in range(world_size)]
         all_key_str = f"{run_key}_{static_key}"
         dist.all_gather_object(
-            all_keys, obj=all_key_str, group=dist_group_manager.get_default_group().autotune_group
+            all_keys, obj=all_key_str, group=_get_autotune_group()
         )
         is_key_all_same = all(all_keys[0] == k for k in all_keys)
         if not is_key_all_same:
@@ -237,7 +282,7 @@ def _autotune(self, args, kwargs, static_key, run_key, rank_id, world_size):
             dist.all_gather_object(
                 all_gather_configs,
                 obj=(best_time, run_key, dict(static_key), best_config),
-                group=dist_group_manager.get_default_group().autotune_group,
+                group=_get_autotune_group(),
             )
             all_gather_configs = sorted(all_gather_configs, key=lambda x: x[0])
             key_set = set()
@@ -318,8 +363,7 @@ def _select_args(self, param_names, args, kwargs):

     def _static_key(self, *args, **kwargs):
         params = self._select_args(self._static_key_func_param_names, args, kwargs)
-        key = self.static_key_func(*params)
-        return frozendict(key)
+        return self.static_key_func(*params)

     def _run_key(self, *args, **kwargs):
         params = self._select_args(self._run_key_func_param_names, args, kwargs)
@@ -347,3 +391,7 @@ def get_triton_version():
 def split_configs(configs, global_rank, global_world_size):
     random.Random(0).shuffle(configs)
     return configs[global_rank::global_world_size]
+
+
+def closest_pow_of_2(x):
+    return triton.next_power_of_2(x - triton.next_power_of_2(x) // 4)

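For orientation, here is how a kernel opts into the autotuner after this change, modelled on the call sites updated above; the kernel, config generator, and key functions below are hypothetical and not part of the commit:

from lightllm.common.triton_utils.autotuner import autotune, closest_pow_of_2


def _my_kernel_configs():
    # Candidate run configurations the autotuner will benchmark.
    return [{"BLOCK": 64, "num_warps": 4}, {"BLOCK": 128, "num_warps": 8}]


def _my_kernel_static_key(x):
    # Dict-like static key; it selects the cache file that stores tuned configs.
    return {"dtype": str(x.dtype)}


@autotune(
    kernel_name="my_kernel:v1",
    configs_gen_func=_my_kernel_configs,
    static_key_func=_my_kernel_static_key,
    run_key_func=lambda x: closest_pow_of_2(x.shape[0]),  # bucketed run key
    mutates_args=["out"],
)
def my_kernel(x, out, run_config=None):
    # The selected config arrives via run_config; passing run_config explicitly
    # bypasses the autotuner, exactly as in Autotuner.__call__ above.
    ...
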
lightllm/models/deepseek2/triton_kernel/rotary_emb.py

Lines changed: 2 additions & 2 deletions
@@ -3,7 +3,7 @@
 import triton
 import triton.language as tl
 import itertools
-from lightllm.common.triton_utils.autotuner import autotune
+from lightllm.common.triton_utils.autotuner import autotune, closest_pow_of_2


 @triton.jit
@@ -122,7 +122,7 @@ def get_static_key(q, k):
     kernel_name="rotary_emb_fwd:v1",
     configs_gen_func=get_test_configs,
    static_key_func=get_static_key,
-    run_key_func=lambda q: q.shape[0],
+    run_key_func=lambda q: closest_pow_of_2(q.shape[0]),
     mutates_args=["q", "k"],
 )
 @torch.no_grad()

lightllm/utils/envs_utils.py

Lines changed: 5 additions & 7 deletions
@@ -149,15 +149,13 @@ def get_kv_quant_calibration_inference_count():
     return int(os.getenv("LIGHTLLM_KV_QUANT_CALIBRARTION_INFERENCE_COUNT", 4000))


-def is_triton_autotune_enabled():
-    # Whether Triton autotune is enabled (read-only check)
-    mark = os.getenv("LIGHTLLM_TRITON_AUTOTUNE", "False").upper() in ["ON", "TRUE", "1"]
-    return mark
+def get_triton_autotune_level():
+    return int(os.getenv("LIGHTLLM_TRITON_AUTOTUNE_LEVEL", 0))


-def disable_triton_autotune():
-    # Disable Triton autotune (setter)
-    os.environ["LIGHTLLM_TRITON_AUTOTUNE"] = "False"
+def set_triton_autotune_level(level: int):
+    os.environ["LIGHTLLM_TRITON_AUTOTUNE_LEVEL"] = str(level)
+    return


 g_model_init_done = False

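The autotune level is selected through the LIGHTLLM_TRITON_AUTOTUNE_LEVEL environment variable read above (default 0, i.e. NO_AUTOTUNE). Typically it would be exported before launching the server; the illustrative sketch below only shows how the variable maps onto the new helpers:

import os

from lightllm.common.triton_utils.autotuner import AutotuneLevel
from lightllm.utils.envs_utils import get_triton_autotune_level, set_triton_autotune_level

# Tune at warmup whenever a config is missing; runtime levels (3 and 4) would
# additionally keep tuning newly seen shapes while serving.
os.environ["LIGHTLLM_TRITON_AUTOTUNE_LEVEL"] = str(AutotuneLevel.AUTOTUNE)
assert get_triton_autotune_level() == AutotuneLevel.AUTOTUNE

# After warmup, non-runtime levels are switched back to cache-only:
set_triton_autotune_level(AutotuneLevel.NO_AUTOTUNE)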