
Commit f590bf2

WIP
1 parent c6edf1a commit f590bf2

3 files changed: +125 -111 lines changed


csrc/trtllm_fused_moe_kernel_launcher.cu

Lines changed: 16 additions & 1 deletion
@@ -1079,7 +1079,7 @@ std::vector<at::Tensor> trtllm_fp4_block_scale_moe(
   auto mDtypeAct = btg::Dtype::Bfloat16;
   if (hidden_states.scalar_type() == torch_ext::FLOAT4_E2M1X2) {
     TORCH_CHECK(hidden_states_scale.has_value() &&
-                    hidden_states_scale.value().scalar_type() == at::ScalarType::Float8_e4m3fn,
+                    hidden_states_scale.value().scalar_type() == at::ScalarType::Byte,
                 "hidden_states_scale must be provided for fp4 activation.");
     if (hidden_states_scale_vec_size == 16) {
       mDtypeAct = btg::Dtype::E2m1;
@@ -1171,6 +1171,20 @@ inline btg::Dtype get_dtype(int64_t const dtype) {
   return btg::Dtype::E2m1;
 }
 
+int64_t trtllm_get_default_moe_configs(int64_t const tile_tokens_dim, int64_t const dtype_act_,
+                                       int64_t const dtype_weights_, bool const useDeepSeekFp8,
+                                       int64_t const top_k, int64_t const hidden_size,
+                                       int64_t const intermediate_size,
+                                       int64_t const num_local_experts, int64_t const num_tokens) {
+  auto dtype_act = get_dtype(dtype_act_);
+  auto dtype_weights = get_dtype(dtype_weights_);
+  tensorrt_llm::kernels::trtllmgen_moe::MoE::Runner moe_runner(
+      dtype_act, dtype_weights, useDeepSeekFp8, (int32_t)tile_tokens_dim,
+      tensorrt_llm::kernels::ActType::SwiGlu, /*useShuffledMatrixA*/ true);
+  return moe_runner.getDefaultValidConfigIndex(top_k, hidden_size, intermediate_size,
+                                               num_local_experts, num_tokens);
+}
+
 std::vector<int64_t> trtllm_get_valid_moe_configs(
     int64_t const tile_tokens_dim, int64_t const dtype_act_, int64_t const dtype_weights_,
     bool const useDeepSeekFp8, int64_t const top_k, int64_t const hidden_size,
@@ -1192,6 +1206,7 @@ TORCH_LIBRARY_FRAGMENT(TORCH_EXTENSION_NAME, m) {
   m.def("trtllm_fp8_per_tensor_scale_moe", trtllm_fp8_per_tensor_scale_moe);
   m.def("trtllm_fp8_block_scale_moe", trtllm_fp8_block_scale_moe);
   m.def("trtllm_fp4_block_scale_moe", trtllm_fp4_block_scale_moe);
+  m.def("trtllm_get_default_moe_configs", trtllm_get_default_moe_configs);
   m.def("trtllm_get_valid_moe_configs", trtllm_get_valid_moe_configs);
 }
 
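The new trtllm_get_default_moe_configs op mirrors trtllm_get_valid_moe_configs but returns a single default config index rather than the full list of valid indices. A minimal Python-side call sketch follows; the module handle moe_ops, the dtype enum codes, and all shape values are illustrative assumptions, not part of this commit.

# Hypothetical sketch: `moe_ops` stands for whatever object exposes the ops
# registered in TORCH_LIBRARY_FRAGMENT above; the dtype code and shape values
# below are placeholders, not values taken from this commit.
DTYPE_E2M1 = 0  # placeholder enum code decoded by get_dtype()

config_index = moe_ops.trtllm_get_default_moe_configs(
    8,           # tile_tokens_dim
    DTYPE_E2M1,  # dtype_act_
    DTYPE_E2M1,  # dtype_weights_
    False,       # useDeepSeekFp8
    4,           # top_k
    4096,        # hidden_size
    14336,       # intermediate_size
    32,          # num_local_experts
    256,         # num_tokens
)
# The returned int64 indexes into the runner's kernel-config list, analogous to
# the indices returned by trtllm_get_valid_moe_configs.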

flashinfer/autotuner.py

Lines changed: 37 additions & 31 deletions
@@ -7,9 +7,10 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
 from functools import lru_cache
-from typing import Any, Callable, Dict, List, Set, Tuple, Union
+from typing import Any, Callable, Dict, List, Set, Tuple, Union, Optional
 
 import torch
+from tqdm import tqdm
 
 # from tensorrt_llm.bindings.internal.runtime import delay_kernel
 # from tensorrt_llm.logger import logger
@@ -42,16 +43,20 @@ class DynamicTensorSpec:
     """
     A specification for a dynamic tensor dimension.
     Args:
-        input_idx: The index of the input tensor.
-        dim_idx: The index of the dimension to tune.
+        input_idx: A list of the indices of the input tensors.
+        dim_idx: A list of the indices of the dimensions to tune.
+            The length of input_idx and dim_idx must be the same.
+            For every tensor mapped to the input_idx, their dimension mapped to the dim_idx must be the same.
         gen_tuning_buckets: A tuple of values to try or a function generating values.
         map_to_tuning_buckets: A function to map dimensions to valid values during inference.
+        tensor_initializers: A list of functions to initialize the tensors.
     """
 
-    input_idx: int
-    dim_idx: int
+    input_idx: Tuple[int]
+    dim_idx: Tuple[int]
     gen_tuning_buckets: Union[Tuple[int], Callable]
     map_to_tuning_buckets: Callable
+    # tensor_initializers: Tuple[Callable] = field(default_factory=lambda: [lambda shapes, dtype, device: torch.randn(shapes, device=device, dtype=dtype)])
 
 
 @dataclass(slots=True, unsafe_hash=True)
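Because input_idx and dim_idx are now parallel sequences, a single spec can tie one tuned dimension across several inputs. A minimal sketch with assumed tensor indices and bucket values (not taken from this commit):

# Hypothetical spec: inputs 0 and 2 are assumed to share their token dimension
# (dim 0), so the same bucketed value is applied to both during profiling.
spec = DynamicTensorSpec(
    input_idx=(0, 2),
    dim_idx=(0, 0),
    gen_tuning_buckets=(32, 64, 128, 256),
    map_to_tuning_buckets=lambda x: ((x + 31) // 32) * 32,  # round up to the next multiple of 32
)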
@@ -85,8 +90,8 @@ class TuningConfig:
         >>> config = TuningConfig(
         ...     dynamic_tensor_specs=(
         ...         DynamicTensorSpec(
-        ...             input_idx=0,
-        ...             dim_idx=1,
+        ...             input_idx=[0],
+        ...             dim_idx=[1],
         ...             gen_tuning_buckets=(32, 64, 128),
         ...             map_to_tuning_buckets=lambda x: ((x + 31) // 32) * 32
         ...         ),
@@ -426,7 +431,7 @@ def choose_one(
                 "All Given runners must be subclass of TunableRunner"
             )
 
-            profiles = self._optimization_profiles(tuning_config, inputs)
+            profiles = self._generate_optimization_profiles(tuning_config, inputs)
             # Record the total configs to try
             self.stats.tuned_op_total_configs[custom_op] = len(profiles)
 
@@ -532,7 +537,8 @@ def _profile_single_kernel(
         # Delay the profiled kernel launch to eliminate affects of host time overhead in profiling.
         # TODO: This is build time sensitive, O(tactic_num * impl_num * num_profile * tunable_ops)
         # Consider apply a preprofiling to estimate the kernel execution time, then decide the necessity.
-        delay_kernel(self.stream_delay_micro_secs)
+        if self.stream_delay_micro_secs > 0:
+            delay_kernel(self.stream_delay_micro_secs)
         start = torch.cuda.Event(enable_timing=True)
         end = torch.cuda.Event(enable_timing=True)
 
@@ -551,7 +557,7 @@ def _profile_single_kernel(
 
         return avg_time
 
-    def _optimization_profiles(
+    def _generate_optimization_profiles(
         self, tuning_config: TuningConfig, inputs: List[torch.Tensor]
     ) -> List[OptimizationProfile]:
         """Generate optimization profiles for autotuning.
@@ -592,9 +598,12 @@ def _optimization_profiles(
             ), (
                 "The given dynamic dimension must provide a opt value generation function or a list of opt values"
             )
+            assert len(spec.input_idx) == len(spec.dim_idx), (
+                "The number of input indices and dimension indices must be the same"
+            )
             if inspect.isfunction(spec.gen_tuning_buckets):
                 opt_shapes = spec.gen_tuning_buckets(
-                    base_profile.shapes[spec.input_idx][spec.dim_idx]._opt()
+                    base_profile.shapes[spec.input_idx[0]][spec.dim_idx[0]]._opt()
                 )
             else:
                 opt_shapes = spec.gen_tuning_buckets
@@ -617,9 +626,10 @@
                 # TODO: fix me, how to set the min and max?
                 min_value = opt_value
                 max_value = opt_shapes_max[opt_value]
-                p.shapes[input_idx][dim_idx] = DynamicDim(
-                    min_value, opt_value, max_value
-                )
+                for i in range(len(input_idx)):
+                    p.shapes[input_idx[i]][dim_idx[i]] = DynamicDim(
+                        min_value, opt_value, max_value
+                    )
 
             # Adjust the profile to satisfy the constraints
             for constraint_spec in tuning_config.constraint_specs:
@@ -653,14 +663,15 @@ def _find_nearest_profile(
         base_profile = list(list(shape) for shape in shapes)
 
         for spec in tuning_config.dynamic_tensor_specs:
-            base_profile[spec.input_idx][spec.dim_idx] = spec.map_to_tuning_buckets(
-                base_profile[spec.input_idx][spec.dim_idx]
+            base_profile[spec.input_idx[0]][spec.dim_idx[0]] = (
+                spec.map_to_tuning_buckets(
+                    base_profile[spec.input_idx[0]][spec.dim_idx[0]]
+                )
             )
 
         # associated dimensions dependent on other free dynamic dimensions, so assign -1 in the profile
         for constraint_spec in tuning_config.constraint_specs:
             base_profile[constraint_spec.input_idx][constraint_spec.dim_idx] = -1
-
         return tuple(tuple(shape) for shape in base_profile)
 
     @classmethod
@@ -679,7 +690,7 @@ def _get_cache_key(
         )
 
     def _create_tensor_like(
-        self, origin_tensor: torch.Tensor, dims: List[Dim]
+        self, origin_tensor: torch.Tensor, dims: List[Dim], initializer: Callable
     ) -> torch.Tensor:
         """Create a new tensor matching the properties of the original tensor.
 
704715
# TODO: how to make sure the created Tensor has the min/max info
705716
assert isinstance(d, DynamicDim)
706717
shapes.append(d.opt)
707-
# TODO: FIXME, sometimes the content of the tensor can affect the performance, like MOE
708-
# One solution is to manituplate the tensor content to make it more like the real data
709-
# during the tuning process. This can by controlled in the preparation phase by the runner.
710-
# return torch.zeros(shapes, dtype=dtype, device=device)
711-
if dtype == torch.int8:
712-
return torch.randint(0, 127, shapes, dtype=dtype, device=device)
713-
elif dtype == torch.uint8:
714-
return torch.randint(0, 255, shapes, dtype=dtype, device=device)
715-
elif dtype == torch.int32:
716-
return torch.randint(0, 1000000, shapes, dtype=dtype, device=device)
717-
else:
718-
return torch.randn(shapes, dtype=dtype, device=device)
718+
return initializer(shapes, dtype, device)
719719

720720
def _prepare_input_tensors(
721721
self, profile: OptimizationProfile, inputs: List[torch.Tensor]
722722
) -> List[torch.Tensor]:
723723
tensors = []
724724
for i, p in enumerate(profile.shapes):
725725
if any(isinstance(d, DynamicDim) for d in p):
726-
tensor = self._create_tensor_like(inputs[i], p)
726+
tensor = self._create_tensor_like(
727+
inputs[i],
728+
p,
729+
lambda shapes, dtype, device: torch.rand(shapes, device=device).to(
730+
dtype
731+
),
732+
)
727733
else:
728734
tensor = inputs[i]
729735
tensors.append(tensor)
