Commit bc1a041

feat: Support loading autotuned results from json for cutlass fp4 moe backends (#1310)
This PR adds support for loading autotuned results from JSON files for the Cutlass FP4 MoE backends. The script `benchmarks/bench_cutlass_fused_moe.py` generates a JSON file at `configs/<flashinfer_version>/trtllm_fused_moe_<device_name>.json`, mapping input shapes to the optimal config/tactic for GEMMs used in `fused_moe.cutlass_fused_moe`. At runtime, setting the `FLASHINFER_AUTOTUNER_LOAD_FROM_FILE` environment variable enables loading from this file. If the variable is unset or a matching entry is not found, it falls back to the default config/tactic. Configs are organized by flashinfer version and GPU device. cc. @yzh119 @wenscarl @kushanam
1 parent 0ef9659 commit bc1a041
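
As a concrete illustration of the workflow described above (a minimal sketch, not part of the commit): the environment variable is read by the autotuner's cache lookup, and `get_config_path` (added in flashinfer/autotuner.py below) reports where the per-version, per-GPU configs live. The printed names are placeholders; the real ones depend on the installed flashinfer version and the local GPU.

import os

# Opt in to file-based autotuner configs; set this before the first MoE call.
os.environ["FLASHINFER_AUTOTUNER_LOAD_FROM_FILE"] = "1"

from flashinfer.autotuner import get_config_path

# Locations the autotuner consults (illustrative names).
print(get_config_path(is_module=False))  # e.g. .../flashinfer/tuning_configs/v<ver>_trtllm_fused_moe_<gpu>.py
print(get_config_path(is_module=True))   # e.g. flashinfer.tuning_configs.v<ver>_trtllm_fused_moe_<gpu>

If the variable is unset or no matching entry exists, the lookup falls back to the default config/tactic, as stated above.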

6 files changed: +226 additions, −54 deletions

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
@@ -39,6 +39,7 @@ repos:
     rev: 24.8.0
     hooks:
       - id: black
+        exclude: flashinfer/tuning_configs/.*\.py

   - repo: https://github.com/pycqa/isort
     rev: 5.13.2

benchmarks/bench_cutlass_fused_moe.py

Lines changed: 61 additions & 49 deletions
@@ -14,43 +14,20 @@
 limitations under the License.
 """

+import argparse
+import pprint
+
 import torch
 from torch.nn import functional as F
-from triton.testing import do_bench

-import flashinfer
 import flashinfer.fused_moe as fused_moe
 from flashinfer import fp4_quantize
+from flashinfer.autotuner import AutoTuner, autotune, get_config_path
+from flashinfer.testing.utils import bench_gpu_time_with_cudagraph

-BATCH_SIZES = [
-    1,
-    2,
-    4,
-    8,
-    16,
-    24,
-    32,
-    48,
-    64,
-    96,
-    128,
-    256,
-    512,
-    1024,
-    1536,
-    2048,
-    3072,
-    4096,
-]
-
-configs = []
-hidden_size = 7168
-num_experts = [32, 256]
-top_k = [8]
-intermediate_size = [256, 2048]
 FLOAT4_E2M1_MAX = 6.0
 FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
-FP8_DTYPE = torch.float8_e4m3fn
+

 test_configs = [
     {
@@ -96,6 +73,7 @@ def bench_cutlass_fused_moe(
     num_experts,
     top_k,
     intermediate_size,
+    skip_autotune,
 ):
     torch.manual_seed(42)
     quant_blocksize = 16
@@ -165,12 +143,24 @@ def bench_cutlass_fused_moe(
     ]
     hidden_states = x
     hidden_states, input_sf = fp4_quantize(x, a1_gs)
-    repeats = 3
-    from flashinfer.autotuner import AutoTuner, autotune

-    AutoTuner.get().clear_cache()
-    with torch.inference_mode(), autotune():
-        for _ in range(2):
+    # Warmup
+    for _ in range(3):
+        _ = fused_moe.cutlass_fused_moe(
+            hidden_states,
+            selected_experts.to(torch.int),
+            routing_weights,
+            w1_q.contiguous().view(torch.long),
+            w2_q.contiguous().view(torch.long),
+            otype,
+            quant_scales=quant_scales,
+            input_sf=input_sf,
+            output=flash_output,
+            tune_max_num_tokens=16384,
+        )
+
+    if not skip_autotune:
+        with torch.inference_mode(), autotune(True):
             _ = fused_moe.cutlass_fused_moe(
                 hidden_states,
                 selected_experts.to(torch.int),
@@ -181,8 +171,9 @@ def bench_cutlass_fused_moe(
                 quant_scales=quant_scales,
                 input_sf=input_sf,
                 output=flash_output,
+                tune_max_num_tokens=16384,
             )
-    ms = do_bench(
+    ms_list = bench_gpu_time_with_cudagraph(
         lambda: fused_moe.cutlass_fused_moe(
             hidden_states,
             selected_experts.to(torch.int),
@@ -195,23 +186,44 @@ def bench_cutlass_fused_moe(
             output=flash_output,
         )
     )
+    avg_ms = sum(ms_list) / len(ms_list)
+    print(f"{'input':<15} {'weight1':<20} {'weight2':<20} {'time(ms)'}")
     print(
-        f"batch_size={batch_size}, num_experts={num_experts}, top_k={top_k}, intermediate_size={intermediate_size}"
+        f"{str(tuple(hidden_states.shape)):<15} {str(tuple(w1.shape)):<20} {str(tuple(w2.shape)):<20} {avg_ms:.3f}"
     )
-    print(f"execution time: {ms}ms")


 if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--update-config",
+        action="store_true",
+        help="Update the config file with the new profiling results",
+    )
+    parser.add_argument(
+        "--num-tokens", type=int, default=32, help="Number of tokens to profile"
+    )
+    parser.add_argument("--skip-autotune", action="store_true", help="Skip autotuning")
+    args = parser.parse_args()
+    AutoTuner.get().clear_cache()
+
     for config in test_configs:
-        hidden_size = config["hidden_size"]
-        num_experts = config["num_experts"]
-        top_k = config["top_k"]
-        intermediate_size = config["intermediate_size"]
-        for batch_size in BATCH_SIZES:
-            bench_cutlass_fused_moe(
-                batch_size,
-                hidden_size,
-                num_experts,
-                top_k,
-                intermediate_size,
-            )
+        bench_cutlass_fused_moe(
+            args.num_tokens,
+            config["hidden_size"],
+            config["num_experts"],
+            config["top_k"],
+            config["intermediate_size"],
+            args.skip_autotune,
+        )
+
+    configs = AutoTuner.get().profiling_cache
+    if args.update_config and configs:
+        # The original key contains a runner's hash in k[2] which might be different across machines.
+        # So, we remove it for now. v[0] and v[1] are the runner id and the tactic.
+        converted = {str((k[0], k[1], k[3])): (v[0], v[1]) for k, v in configs.items()}
+        config_path = get_config_path(is_module=False)
+        with open(config_path, "w") as f:
+            f.write("best_configs = ")
+            pprint.pprint(converted, stream=f)
+        print(f"Saved the cache to {config_path}")

flashinfer/autotuner.py

Lines changed: 46 additions & 4 deletions
@@ -1,21 +1,39 @@
 import contextlib
 import copy
+import importlib
 import inspect
 import itertools
+import os
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
 from functools import lru_cache
 from typing import Any, Callable, Dict, List, Set, Tuple, Union

 import torch

+from flashinfer import __version__ as flashinfer_version
+
 # from tensorrt_llm.bindings.internal.runtime import delay_kernel
 # from tensorrt_llm.logger import logger
 from flashinfer.tllm_utils import delay_kernel

 from .jit.core import logger


+def get_config_path(is_module: bool):
+    dev_name = torch.cuda.get_device_name(0).replace(" ", "_")
+    fi_ver = flashinfer_version.replace(".", "_")
+    config_name = f"v{fi_ver}_trtllm_fused_moe_{dev_name}"
+    if is_module:
+        return f"flashinfer.tuning_configs.{config_name}"
+    else:
+        return os.path.join(
+            os.path.dirname(os.path.realpath(__file__)),
+            "tuning_configs",
+            config_name + ".py",
+        )
+
+
 @dataclass(slots=True, unsafe_hash=True)
 class DynamicTensorSpec:
     """
@@ -265,6 +283,25 @@ def __str__(self) -> str:
         return stats_str


+@lru_cache(maxsize=None)
+def load_from_file(key):
+    module_name = get_config_path(is_module=True)
+    try:
+        module = importlib.import_module(module_name)
+        best_configs = module.best_configs
+    except (ImportError, AttributeError):
+        best_configs = None
+    if best_configs is not None:
+        k = str((key[0], key[1], key[3]))
+        if k in best_configs:
+            logger.info(f"[Autotuner]: Loading configs for {k} from file.")
+            return True, best_configs[k][0], best_configs[k][1], None
+    logger.info(
+        f"[Autotuner]: Loading configs for {key} from file failed; Using default configs instead."
+    )
+    return False, 0, -1, None
+
+
 class AutoTuner:
     """AutoTuner for optimizing TensorRT-LLM operations.

@@ -316,11 +353,16 @@ def search_cache(
         [is_cache_hit, runner_id, tactic, stored_profile]
         """
         for r in runners:
+            cache_key = AutoTuner._get_cache_key(
+                custom_op, r, input_shapes, tuning_config
+            )
             if (
-                cache_key := AutoTuner._get_cache_key(
-                    custom_op, r, input_shapes, tuning_config
-                )
-            ) in self.profiling_cache:
+                os.environ.get("FLASHINFER_AUTOTUNER_LOAD_FROM_FILE", "0") == "1"
+                and not self.is_tuning_mode
+            ):
+                output = load_from_file(cache_key)
+                return output
+            elif cache_key in self.profiling_cache:
                 return True, *self.profiling_cache[cache_key]

         return False, 0, -1, None
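
A small sketch (my own addition, not code from this commit) for checking whether a tuning-config module exists for the local machine before enabling `FLASHINFER_AUTOTUNER_LOAD_FROM_FILE`:

import importlib.util

from flashinfer.autotuner import get_config_path

module_name = get_config_path(is_module=True)
try:
    available = importlib.util.find_spec(module_name) is not None
except ModuleNotFoundError:
    # flashinfer.tuning_configs itself may be missing on a fresh install
    available = False

if available:
    print(f"File-based configs found: {module_name}")
else:
    print(f"No configs for this GPU/version ({module_name}); default tactics will be used.")

When the module or the key is missing, `load_from_file` logs the miss and returns `(False, 0, -1, None)`, so the op falls back to the default tactic instead of failing.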

flashinfer/fused_moe.py

Lines changed: 1 addition & 1 deletion
@@ -765,7 +765,7 @@ def cutlass_fused_moe(
         use_w4a8_group_scaling=use_w4a8_group_scaling,
         use_mxfp8_act_scaling=use_mxfp8_act_scaling,
         min_latency_mode=min_latency_mode,
-        tune_max_num_tokens=8192,
+        tune_max_num_tokens=tune_max_num_tokens,
     )

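Usage note on the passthrough above: the caller's `tune_max_num_tokens` now reaches the underlying op instead of a fixed 8192. A minimal sketch mirroring the benchmark (tensor construction elided; all inputs are assumed to be prepared exactly as in `bench_cutlass_fused_moe.py`):

import torch

import flashinfer.fused_moe as fused_moe
from flashinfer.autotuner import autotune

# hidden_states, selected_experts, routing_weights, w1_q, w2_q, otype,
# quant_scales, input_sf, flash_output: prepared as in the benchmark script.
with torch.inference_mode(), autotune(True):
    _ = fused_moe.cutlass_fused_moe(
        hidden_states,
        selected_experts.to(torch.int),
        routing_weights,
        w1_q.contiguous().view(torch.long),
        w2_q.contiguous().view(torch.long),
        otype,
        quant_scales=quant_scales,
        input_sf=input_sf,
        output=flash_output,
        tune_max_num_tokens=16384,  # forwarded to the op instead of the fixed 8192
    )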