
Commit a6b2f08

fix
1 parent 3328341 commit a6b2f08

6 files changed: +39 −49 lines


lightllm/common/fused_moe/grouped_fused_moe.py

Lines changed: 0 additions & 8 deletions
@@ -489,14 +489,6 @@ def get_grouped_matmul_static_key(
         for bn in [16, 32, 64, 128]
         for bk in [16, 32, 64, 128]
     ],
-    default_config={
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 32,
-        "GROUP_SIZE_M": 8,
-        "num_warps": 4,
-        "num_stages": 1,
-    },
     static_key_func=get_grouped_matmul_static_key,
     run_key_func=lambda token_num_mul_topk_num: str(nearest_power_of_2(token_num_mul_topk_num)),
 )
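The same pattern repeats across every call site in this commit: default_config is dropped, so a kernel declares only its config search space and its key functions, and config selection is left entirely to the cache logic in autotuner.py. A minimal sketch of the resulting call-site shape, with a hypothetical kernel and block-size grid (names are illustrative, not from this repo):

@autotune(
    name="my_kernel:v1",  # hypothetical kernel name, for illustration only
    configs=[
        {"BLOCK_M": bm, "BLOCK_N": bn, "num_warps": 4, "num_stages": 1}
        for bm in [32, 64, 128]
        for bn in [32, 64, 128]
    ],
    static_key_func=lambda x: f"N={x.shape[-1]}",  # shape/dtype bucket, selects the cache file
    run_key_func=lambda x: str(nearest_power_of_2(x.shape[0])),  # per-call bucket within that file
)
def my_kernel(x, run_config=None):
    # The decorator injects the selected config through the run_config kwarg.
    ...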

lightllm/common/fused_moe/moe_silu_and_mul.py

Lines changed: 0 additions & 1 deletion
@@ -72,7 +72,6 @@ def _silu_and_mul_kernel_fast(
         for bm in [32, 64, 128, 256]
         for bn in [32, 64, 128, 256]
     ],
-    default_config={"BLOCK_M": 128, "BLOCK_N": 128, "num_warps": 4, "NUM_STAGES": 1},
     static_key_func=lambda input, output: f"N={input.shape[-1] // 2},out_dtype={output.dtype}",
     run_key_func=lambda input: str(nearest_power_of_2(input.shape[0])),
 )

lightllm/common/fused_moe/moe_sum_reduce.py

Lines changed: 0 additions & 1 deletion
@@ -61,7 +61,6 @@ def get_static_key(input, output):
         for bm in [1, 2, 4, 8, 16, 32]
         for bd in [64, 128, 256, 512, 1024]
     ],
-    default_config={"BLOCK_M": 1, "BLOCK_DIM": 128, "NUM_STAGE": 1, "num_warps": 2},
     static_key_func=get_static_key,
     run_key_func=lambda input: str(nearest_power_of_2(input.shape[0])),
 )

lightllm/common/quantization/triton_quant/fp8/fp8w8a8_block_gemm_kernel.py

Lines changed: 0 additions & 1 deletion
@@ -179,7 +179,6 @@ def get_static_key(A, B, block_size, dtype):
 @autotune(
     name="w8a8_block_fp8_matmul:v1",
     configs=get_test_configs(),
-    default_config={"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 128, "GROUP_M": 32, "num_stages": 3, "num_warps": 4},
     static_key_func=get_static_key,
     run_key_func=lambda M: str(nearest_power_of_2(M)),
 )

lightllm/common/triton_utils/autotuner.py

Lines changed: 39 additions & 37 deletions
@@ -19,7 +19,6 @@
 def autotune(
     name,
     configs,
-    default_config,
     static_key_func=None,
     run_key_func=None,
     reset_to_zero=None,
@@ -34,7 +33,6 @@ def decorator(fn):
         arg_names,
         name,
         configs,
-        default_config,
         static_key_func,
         run_key_func,
         reset_to_zero,
@@ -46,6 +44,16 @@ def decorator(fn):
     return decorator


+def is_triton_autotune_enabled():
+    # Whether Triton autotune is enabled (read-only check)
+    return os.environ.get("LIGHTLLM_TRITON_AUTOTUNE", "0") == "1"
+
+
+def disable_triton_autotune():
+    # Disable Triton autotune (setter)
+    os.environ["LIGHTLLM_TRITON_AUTOTUNE"] = "0"
+
+
 class Autotuner:
     @staticmethod
     def _get_param_names(func):
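The two new helpers centralize the LIGHTLLM_TRITON_AUTOTUNE check that was previously inlined in __call__. A minimal sketch of a warmup flow that drives them (the import path follows this file's location; the workload itself is assumed):

import os

from lightllm.common.triton_utils.autotuner import disable_triton_autotune, is_triton_autotune_enabled

os.environ["LIGHTLLM_TRITON_AUTOTUNE"] = "1"  # opt in: missing configs get benchmarked and cached
assert is_triton_autotune_enabled()

# ... run representative workloads so each (static_key, run_key) pair gets tuned ...

disable_triton_autotune()  # subsequent calls reuse the cached configs instead of re-benchmarking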
@@ -72,7 +80,6 @@ def __init__(
         arg_names,
         name,
         configs,
-        default_config,
         static_key_func,
         run_key_func,
         reset_to_zero,
@@ -82,11 +89,13 @@ def __init__(
         warmup=None,
         rep=None,
     ):
-        # 是否打印autotune信息
+        # Whether to print autotune logs
         self.print_autotune = os.environ.get("LIGHTLLM_TRITON_PRINT_AUTOTUNE", "0") == "1"
+        # Whether to use this autotune decorator
+        self.disable_autotune = os.environ.get("DISABLE_AUTOTUNE_DECORATOR", "0") == "1"
+
         self.all_configs = configs
         self.configs = None
-        self.default_config = default_config
         self.name = name
         self.cache_dir = os.path.join(
             Path(__file__).parent, "all_kernel_configs", get_triton_version(), get_current_device_name(), self.name
@@ -95,11 +104,6 @@ def __init__(
         self.static_key_func = static_key_func
         self.run_key_func = run_key_func

-        # 是否使用之前配置
-        self.can_be_none = os.environ.get("DISABLE_MANUAL_TUNE_CONFIG", "0") == "0"
-        # 是否使用autotune注解
-        self.disable_autotune = os.environ.get("DISABLE_AUTOTUNE_ANNOTATION", "0") == "1"
-
         self.cached_configs = {}
         self.arg_names = arg_names
         self._argname_to_pos = {name: idx for idx, name in enumerate(self.arg_names)}
@@ -165,7 +169,6 @@ def _ensure_cache_loaded(self, static_key: str):
             ]
             self.sorted_cached_configs[static_key].sort(key=lambda x: x[0])
         except Exception:
-            # 若缓存损坏,忽略并在之后覆盖
             self.cached_configs[static_key] = {}
         self._loaded_static_keys.add(static_key)

@@ -229,9 +232,8 @@ def __call__(self, *args, **kwargs):
         static_key = self._static_key(*args, **kwargs)
         run_key = self._run_key(*args, **kwargs)

-        # 懒加载
+        # Lazy load
         self._ensure_cache_loaded(static_key)
-        best_config = None
         self.nargs = dict(zip(self.arg_names, args))

         def _benchmark(_run_key):
@@ -300,30 +302,30 @@ def _benchmark(_run_key):
             full_nargs = {**self.nargs, **kwargs}
             self.pre_hook(full_nargs, reset_only=True)

-        best_config = self.cached_configs.get(static_key, {}).get(run_key)
+        if static_key not in self.cached_configs:
+            if not is_triton_autotune_enabled():
+                logger.warning(
+                    f"No kernel config for {self.name} in {self.cache_dir}/{static_key}, using default config",
+                )
+            self.cached_configs[static_key] = {}
+
+        all_configs = self.cached_configs.get(static_key)
+        best_config = all_configs.get(run_key)
+
+        if best_config is not None:
+            kwargs["run_config"] = best_config
+            return self.fn(*args, **kwargs)
+
+        if is_triton_autotune_enabled():
+            _benchmark(run_key)
+            kwargs["run_config"] = self.cached_configs.get(static_key, {}).get(run_key)
+            return self.fn(*args, **kwargs)
+
+        if all_configs != {}:
+            closest_config = min(all_configs, key=lambda x: abs(int(x[0]) - int(run_key)))[1]
+            self.cached_configs[static_key][run_key] = closest_config
+            kwargs["run_config"] = closest_config

-        if best_config is None:
-            if os.environ.get("LIGHTLLM_TRITON_AUTOTUNE", "0") == "1":
-                _benchmark(run_key)
-            elif not self.can_be_none:
-                cached_for_static = self.cached_configs.setdefault(static_key, {})
-                if static_key in self.sorted_cached_configs and self.sorted_cached_configs[static_key]:
-                    sorted_configs = self.sorted_cached_configs[static_key]
-                    try:
-                        target = int(run_key)
-                        cached_for_static[run_key] = min(sorted_configs, key=lambda x: abs(x[0] - target))[1]
-                    except Exception:
-                        cached_for_static[run_key] = self.default_config
-                else:
-                    if static_key not in self.sorted_cached_configs:
-                        logger.warning(
-                            f"No kernel config for {self.name} in {self.cache_dir}/{static_key}, using default config"
-                        )
-                    cached_for_static[run_key] = self.default_config
-
-        best_config = self.cached_configs[static_key][run_key]
-
-        kwargs["run_config"] = best_config
         return self.fn(*args, **kwargs)

     def _select_args(self, param_names, args, kwargs):
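The rewritten lookup in __call__ now has three tiers: an exact (static_key, run_key) cache hit is used directly; with autotune enabled, a missing entry is benchmarked and cached; otherwise the config tuned for the numerically closest run key is reused. A standalone sketch of the intent of that last tier, with made-up data and run keys assumed to be stringified power-of-two bucket sizes:

# Cached configs for one static key, keyed by power-of-two run keys (made-up values).
cached = {"64": {"BLOCK_M": 64, "num_warps": 4}, "512": {"BLOCK_M": 128, "num_warps": 8}}

def closest_config(cached_configs, run_key):
    # min over a dict iterates its keys; pick the numerically nearest bucket.
    nearest_key = min(cached_configs, key=lambda k: abs(int(k) - int(run_key)))
    return cached_configs[nearest_key]

print(closest_config(cached, "128"))  # {'BLOCK_M': 64, 'num_warps': 4}: 64 is nearer to 128 than 512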
@@ -412,7 +414,7 @@ def dict_to_filename(data):


 def nearest_power_of_2(x):
-    # 返回最接近 x 的 2 的幂次方
+    # Return the power of two closest to x
     if x <= 1:
         return 1
     return triton.next_power_of_2(x - triton.next_power_of_2(x) // 4)
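For reference, nearest_power_of_2 rounds to the closest power of two rather than always up: subtracting a quarter of next_power_of_2(x) before rounding up pushes values in the lower half of a bucket down to the previous power. A pure-Python mirror of the function above, runnable without Triton (next_power_of_2 here is a stand-in for triton.next_power_of_2):

def next_power_of_2(x: int) -> int:
    # Round up to the next power of two, mirroring triton.next_power_of_2.
    return 1 if x <= 1 else 1 << (x - 1).bit_length()

def nearest_power_of_2(x: int) -> int:
    if x <= 1:
        return 1
    return next_power_of_2(x - next_power_of_2(x) // 4)

print(nearest_power_of_2(70))   # 64  (a plain round-up would give 128)
print(nearest_power_of_2(100))  # 128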

lightllm/models/deepseek2/triton_kernel/rotary_emb.py

Lines changed: 0 additions & 1 deletion
@@ -121,7 +121,6 @@ def get_static_key(q, k):
 @autotune(
     name="rotary_emb_fwd:v1",
     configs=get_test_configs(),
-    default_config={"BLOCK_SEQ": 16, "NUM_STAGE": 1, "num_warps": 1, "num_stages": 1, "HEAD_PARALLEL_NUM": 1},
     static_key_func=get_static_key,
     run_key_func=lambda q: str(nearest_power_of_2(q.shape[0])),
 )
