flashinfer-ai · yzh119 · Aug 19, 2025 · Aug 13, 2025 · Aug 13, 2025 · Aug 14, 2025
diff --git a/csrc/trtllm_fused_moe_kernel_launcher.cu b/csrc/trtllm_fused_moe_kernel_launcher.cu
@@ -881,11 +881,10 @@ std::vector<at::Tensor> trtllm_fp4_block_scale_moe_launcher(
     TORCH_CHECK(hidden_states_scale.value().scalar_type() == at::ScalarType::Float8_e4m3fn,
                 "hidden_states_scale must be fp8.");
 
-    TORCH_CHECK(hidden_states_scale.value().dim() == 1, "hidden_states_scale must be 1D.");
-    TORCH_CHECK(hidden_states_scale.value().sizes()[0] ==
-                    tensorrt_llm::computeFP4LinearLayoutSFSize(args.num_tokens,
-                                                               args.hidden_size / sf_vec_size),
-                "hidden_states_scale has incorrect size");
+    TORCH_CHECK(
+        hidden_states_scale.value().numel() == tensorrt_llm::computeFP4LinearLayoutSFSize(
+                                                   args.num_tokens, args.hidden_size / sf_vec_size),
+        "hidden_states_scale has incorrect size");
   }
 
   TORCH_CHECK(gemm1_weights.scalar_type() == torch_ext::FLOAT4_E2M1X2,
@@ -1059,7 +1058,8 @@ std::vector<at::Tensor> trtllm_fp4_block_scale_moe(
     std::optional<int64_t> n_group, std::optional<int64_t> topk_group, int64_t intermediate_size,
     int64_t local_expert_offset, int64_t local_num_experts,
     std::optional<double> routed_scaling_factor, int64_t tile_tokens_dim,
-    int64_t routing_method_type, bool do_finalize, bool enable_pdl, at::Tensor& output) {
+    int64_t routing_method_type, bool do_finalize, bool enable_pdl, at::Tensor& output,
+    int64_t config_index) {
   using RunnerType = tensorrt_llm::kernels::trtllmgen_moe::MoE::Runner;
 
   int const num_tokens = hidden_states.sizes()[0];
@@ -1112,8 +1112,10 @@ std::vector<at::Tensor> trtllm_fp4_block_scale_moe(
       mDtypeAct, mDtypeWeights, mUseDeepSeekFp8, (int32_t)tile_tokens_dim,
       tensorrt_llm::kernels::ActType::SwiGlu, /*useShuffledMatrixA*/ true);
 
-  auto const moeConfigIndex = mRunner->getDefaultValidConfigIndex(
-      top_k, hidden_size, intermediate_size, local_num_experts, num_tokens);
+  if (config_index == -1) {
+    config_index = mRunner->getDefaultValidConfigIndex(top_k, hidden_size, intermediate_size,
+                                                       local_num_experts, num_tokens);
+  }
 
   return trtllm_fp4_block_scale_moe_launcher(
       routing_logits, topk_ids, expert_weights, routing_bias, hidden_states, hidden_states_scale,
@@ -1122,7 +1124,84 @@ std::vector<at::Tensor> trtllm_fp4_block_scale_moe(
       output1_scales_gate_scalar, output2_scales_scalar, num_experts, top_k, n_group, topk_group,
       intermediate_size, local_expert_offset, local_num_experts, routed_scaling_factor,
       tile_tokens_dim, routing_method_type, do_finalize, *mRunner, mDtypeAct, mDtypeWeights,
-      moeConfigIndex, enable_pdl, output);
+      config_index, enable_pdl, output);
+}
+
+inline btg::Dtype get_dtype(int64_t const dtype) {
+  switch (dtype) {
+    case 0:
+      return btg::Dtype::Bfloat16;
+    case 1:
+      return btg::Dtype::Bool;
+    case 2:
+      return btg::Dtype::E2m1;
+    case 3:
+      return btg::Dtype::E2m3;
+    case 4:
+      return btg::Dtype::E3m2;
+    case 5:
+      return btg::Dtype::E4m3;
+    case 6:
+      return btg::Dtype::E5m2;
+    case 7:
+      return btg::Dtype::Fp16;
+    case 8:
+      return btg::Dtype::Fp32;
+    case 9:
+      return btg::Dtype::Int8;
+    case 10:
+      return btg::Dtype::Int32;
+    case 11:
+      return btg::Dtype::Int64;
+    case 12:
+      return btg::Dtype::MxE2m1;
+    case 13:
+      return btg::Dtype::MxE4m3;
+    case 14:
+      return btg::Dtype::UE8m0;
+    case 15:
+      return btg::Dtype::UInt8;
+    case 16:
+      return btg::Dtype::UInt16;
+    case 17:
+      return btg::Dtype::UInt32;
+    case 18:
+      return btg::Dtype::UInt64;
+    case 19:
+      return btg::Dtype::UInt128;
+    case 20:
+      return btg::Dtype::Void;
+    default:
+      TORCH_CHECK(false, "Invalid trtllm-gen dtype");
+  }
+  return btg::Dtype::E2m1;
+}
+
+int64_t trtllm_get_default_moe_configs(int64_t const tile_tokens_dim, int64_t const dtype_act_,
+                                       int64_t const dtype_weights_, bool const useDeepSeekFp8,
+                                       int64_t const top_k, int64_t const hidden_size,
+                                       int64_t const intermediate_size,
+                                       int64_t const num_local_experts, int64_t const num_tokens) {
+  auto dtype_act = get_dtype(dtype_act_);
+  auto dtype_weights = get_dtype(dtype_weights_);
+  tensorrt_llm::kernels::trtllmgen_moe::MoE::Runner moe_runner(
+      dtype_act, dtype_weights, useDeepSeekFp8, (int32_t)tile_tokens_dim,
+      tensorrt_llm::kernels::ActType::SwiGlu, /*useShuffledMatrixA*/ true);
+  return moe_runner.getDefaultValidConfigIndex(top_k, hidden_size, intermediate_size,
+                                               num_local_experts, num_tokens);
+}
+
+std::vector<int64_t> trtllm_get_valid_moe_configs(
+    int64_t const tile_tokens_dim, int64_t const dtype_act_, int64_t const dtype_weights_,
+    bool const useDeepSeekFp8, int64_t const top_k, int64_t const hidden_size,
+    int64_t const intermediate_size, int64_t const num_local_experts, int64_t const num_tokens) {
+  auto dtype_act = get_dtype(dtype_act_);
+  auto dtype_weights = get_dtype(dtype_weights_);
+  tensorrt_llm::kernels::trtllmgen_moe::MoE::Runner moe_runner(
+      dtype_act, dtype_weights, useDeepSeekFp8, (int32_t)tile_tokens_dim,
+      tensorrt_llm::kernels::ActType::SwiGlu, /*useShuffledMatrixA*/ true);
+  return moe_runner.getValidConfigIndices(top_k, hidden_size, intermediate_size, num_local_experts,
+                                          num_tokens);
 }
 
 namespace trtllm_cubin_loader {
@@ -1133,6 +1212,8 @@ TORCH_LIBRARY_FRAGMENT(TORCH_EXTENSION_NAME, m) {
   m.def("trtllm_fp8_per_tensor_scale_moe", trtllm_fp8_per_tensor_scale_moe);
   m.def("trtllm_fp8_block_scale_moe", trtllm_fp8_block_scale_moe);
   m.def("trtllm_fp4_block_scale_moe", trtllm_fp4_block_scale_moe);
+  m.def("trtllm_get_default_moe_configs", trtllm_get_default_moe_configs);
+  m.def("trtllm_get_valid_moe_configs", trtllm_get_valid_moe_configs);
 }
 
 }  // namespace flashinfer
diff --git a/flashinfer/autotuner.py b/flashinfer/autotuner.py
@@ -7,7 +7,7 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
 from functools import lru_cache
-from typing import Any, Callable, Dict, List, Set, Tuple, Union
+from typing import Any, Callable, Dict, List, Set, Tuple, Union, Optional
 
 import torch
 
@@ -37,21 +37,49 @@ def get_config_path(is_module: bool):
         )
 
 
-@dataclass(slots=True, unsafe_hash=True)
+@dataclass(slots=True)
 class DynamicTensorSpec:
     """
     A specification for a dynamic tensor dimension.
     Args:
-        input_idx: The index of the input tensor.
-        dim_idx: The index of the dimension to tune.
+        input_idx: A tuple of the indices of the input tensors.
+        dim_idx: A tuple of the indices of the dimensions to tune, one per entry of input_idx.
+            input_idx and dim_idx must have the same length.
+            The dimensions selected by each (input_idx[i], dim_idx[i]) pair must all have the same size.
         gen_tuning_buckets: A tuple of values to try or a function generating values.
         map_to_tuning_buckets: A function to map dimensions to valid values during inference.
+        tensor_initializers: A list of callables, one per input_idx entry, each called as f(shapes, dtype, device) to create that tensor (default: torch.randn).
     """
 
-    input_idx: int
-    dim_idx: int
-    gen_tuning_buckets: Union[Tuple[int], Callable]
+    input_idx: Tuple[int, ...]
+    dim_idx: Tuple[int, ...]
+    gen_tuning_buckets: Union[Tuple[int, ...], Callable]
     map_to_tuning_buckets: Callable
+    tensor_initializers: List[Callable] = field(default_factory=lambda: None)
+
+    def __post_init__(self):
+        # Set default tensor_initializers if not provided
+        if self.tensor_initializers is None:
+            self.tensor_initializers = [
+                lambda shapes, dtype, device: torch.randn(
+                    shapes, device=device, dtype=dtype
+                )
+                for _ in range(len(self.input_idx))
+            ]
+
+    def __hash__(self) -> int:
+        # FIXME: tensor_initializers is currently not included in the hash
+        return hash(
+            (
+                self.input_idx,
+                self.dim_idx,
+                # For gen_tuning_buckets, only hash if it's a tuple, otherwise hash its id
+                self.gen_tuning_buckets
+                if isinstance(self.gen_tuning_buckets, tuple)
+                else id(self.gen_tuning_buckets),
+                id(self.map_to_tuning_buckets),
+            )
+        )
 
 
 @dataclass(slots=True, unsafe_hash=True)
@@ -85,8 +113,8 @@ class TuningConfig:
                 >>> config = TuningConfig(
                 ...     dynamic_tensor_specs=(
                 ...         DynamicTensorSpec(
-                ...             input_idx=0,
-                ...             dim_idx=1,
+                ...             input_idx=(0,),
+                ...             dim_idx=(1,),
                 ...             gen_tuning_buckets=(32, 64, 128),
                 ...             map_to_tuning_buckets=lambda x: ((x + 31) // 32) * 32
                 ...         ),
@@ -141,6 +169,7 @@ class OptimizationProfile:
     """Ranges of all tensors, all dimension"""
 
     shapes: List[List[Dim]]
+    tensor_initializers: List[Optional[Callable]]
 
     def get_hash_key(self):
         return self.get_opt_shapes()
@@ -190,11 +219,10 @@ def __call__(self, inputs, **kwargs):
     @abstractmethod
     def forward(
         self,
-        /,  # tensors are position only
         inputs: List[torch.Tensor],
-        *,  # all others are keyword args only
         tactic: int = -1,
         do_preparation: bool = False,
+        **kwargs,  # all others are keyword args only
     ) -> Any:
         """Forward pass for tunable runners.
 
@@ -426,7 +454,7 @@ def choose_one(
             "All Given runners must be subclass of TunableRunner"
         )
 
-        profiles = self._optimization_profiles(tuning_config, inputs)
+        profiles = self._generate_optimization_profiles(tuning_config, inputs)
         # Record the total configs to try
         self.stats.tuned_op_total_configs[custom_op] = len(profiles)
 
@@ -532,7 +560,8 @@ def _profile_single_kernel(
         # Delay the profiled kernel launch to eliminate affects of host time overhead in profiling.
         # TODO: This is build time sensitive, O(tactic_num * impl_num * num_profile * tunable_ops)
         # Consider apply a preprofiling to estimate the kernel execution time, then decide the necessity.
-        delay_kernel(self.stream_delay_micro_secs)
+        if self.stream_delay_micro_secs > 0:
+            delay_kernel(self.stream_delay_micro_secs)
         start = torch.cuda.Event(enable_timing=True)
         end = torch.cuda.Event(enable_timing=True)
 
@@ -551,7 +580,7 @@ def _profile_single_kernel(
 
         return avg_time
 
-    def _optimization_profiles(
+    def _generate_optimization_profiles(
         self, tuning_config: TuningConfig, inputs: List[torch.Tensor]
     ) -> List[OptimizationProfile]:
         """Generate optimization profiles for autotuning.
@@ -579,7 +608,8 @@ def _optimization_profiles(
                     else [StaticDim(0)]
                 )
                 for t in inputs
-            ]
+            ],
+            [None] * len(inputs),
         )
 
         generated_profiles: List[OptimizationProfile] = []
@@ -592,9 +622,18 @@ def _optimization_profiles(
             ), (
                 "The given dynamic dimension must provide a opt value generation function or a list of opt values"
             )
+            assert len(spec.input_idx) == len(spec.dim_idx), (
+                f"The number of input indices and dimension indices must be the same, got {len(spec.input_idx)} and {len(spec.dim_idx)}"
+            )
+            assert len(spec.tensor_initializers) == len(spec.input_idx), (
+                f"The number of tensor initializers and input indices must be the same, got {len(spec.tensor_initializers)} and {len(spec.input_idx)}"
+            )
+            for i, idx in enumerate(spec.input_idx):
+                base_profile.tensor_initializers[idx] = spec.tensor_initializers[i]
+
             if inspect.isfunction(spec.gen_tuning_buckets):
                 opt_shapes = spec.gen_tuning_buckets(
-                    base_profile.shapes[spec.input_idx][spec.dim_idx]._opt()
+                    base_profile.shapes[spec.input_idx[0]][spec.dim_idx[0]]._opt()
                 )
             else:
                 opt_shapes = spec.gen_tuning_buckets
@@ -617,9 +656,10 @@ def _optimization_profiles(
                 # TODO: fix me, how to set the min and max?
                 min_value = opt_value
                 max_value = opt_shapes_max[opt_value]
-                p.shapes[input_idx][dim_idx] = DynamicDim(
-                    min_value, opt_value, max_value
-                )
+                for i in range(len(input_idx)):
+                    p.shapes[input_idx[i]][dim_idx[i]] = DynamicDim(
+                        min_value, opt_value, max_value
+                    )
 
             # Adjust the profile to satisfy the constraints
             for constraint_spec in tuning_config.constraint_specs:
@@ -653,14 +693,15 @@ def _find_nearest_profile(
         base_profile = list(list(shape) for shape in shapes)
 
         for spec in tuning_config.dynamic_tensor_specs:
-            base_profile[spec.input_idx][spec.dim_idx] = spec.map_to_tuning_buckets(
-                base_profile[spec.input_idx][spec.dim_idx]
+            base_profile[spec.input_idx[0]][spec.dim_idx[0]] = (
+                spec.map_to_tuning_buckets(
+                    base_profile[spec.input_idx[0]][spec.dim_idx[0]]
+                )
             )
 
         # associated dimensions dependent on other free dynamic dimensions, so assign -1 in the profile
         for constraint_spec in tuning_config.constraint_specs:
             base_profile[constraint_spec.input_idx][constraint_spec.dim_idx] = -1
-
         return tuple(tuple(shape) for shape in base_profile)
 
     @classmethod
@@ -679,7 +720,7 @@ def _get_cache_key(
         )
 
     def _create_tensor_like(
-        self, origin_tensor: torch.Tensor, dims: List[Dim]
+        self, origin_tensor: torch.Tensor, dims: List[Dim], initializer: Callable
     ) -> torch.Tensor:
         """Create a new tensor matching the properties of the original tensor.
 
@@ -704,18 +745,22 @@ def _create_tensor_like(
                 # TODO: how to make sure the created Tensor has the min/max info
                 assert isinstance(d, DynamicDim)
                 shapes.append(d.opt)
-        # TODO: FIXME, sometimes the content of the tensor can affect the performance, like MOE
-        # One solution is to manituplate the tensor content to make it more like the real data
-        # during the tuning process. This can by controlled in the preparation phase by the runner.
-        return torch.zeros(shapes, dtype=dtype, device=device)
+        return initializer(shapes, dtype, device)
 
     def _prepare_input_tensors(
         self, profile: OptimizationProfile, inputs: List[torch.Tensor]
     ) -> List[torch.Tensor]:
+        default_initializer = lambda shapes, dtype, device: torch.rand(
+            shapes, device=device
+        ).to(dtype)
         tensors = []
         for i, p in enumerate(profile.shapes):
             if any(isinstance(d, DynamicDim) for d in p):
-                tensor = self._create_tensor_like(inputs[i], p)
+                tensor = self._create_tensor_like(
+                    inputs[i],
+                    p,
+                    profile.tensor_initializers[i] or default_initializer,
+                )
             else:
                 tensor = inputs[i]
             tensors.append(tensor)