@@ -372,8 +372,8 @@ class MoERunner(TunableRunner):
     tuning_config = TuningConfig(
         dynamic_tensor_specs=(
             DynamicTensorSpec(
-                0,
-                0,
+                (0,),
+                (0,),
                 get_last_power_of_2_num_tokens_buckets(8192),
                 lambda x: min(last_positive_power_of_2(x), 8192),
             ),
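For reference, a minimal sketch of the tuple-based spec, assuming DynamicTensorSpec now takes a tuple of dynamic input-tensor indices and a tuple of dynamic dimension indices instead of bare ints (the helper names are the ones already used in this module; the snippet mirrors only the hunk above, not the full class attribute):

# Sketch only; argument semantics are assumed from the change above.
spec = DynamicTensorSpec(
    (0,),                                              # indices of the dynamic input tensors
    (0,),                                              # matching dynamic dimension per tensor
    get_last_power_of_2_num_tokens_buckets(8192),      # candidate num_tokens buckets
    lambda x: min(last_positive_power_of_2(x), 8192),  # map a runtime size to a bucket
)
tuning_config = TuningConfig(dynamic_tensor_specs=(spec,))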
@@ -946,7 +946,7 @@ class MoERunner(TunableRunner):
                 (0, 0, 0, 0, 0, 0),
                 get_last_power_of_2_num_tokens_buckets(8192),
                 lambda x: min(last_positive_power_of_2(x), 8192),
-                # dynamic_tensor_initializers
+                dynamic_tensor_initializers,
             ),
         )
     )
@@ -957,7 +957,7 @@ class MoERunner(TunableRunner):
                 (0, 0, 0, 0, 0),
                 get_last_power_of_2_num_tokens_buckets(8192),
                 lambda x: min(last_positive_power_of_2(x), 8192),
-                # dynamic_tensor_initializers[:5]
+                dynamic_tensor_initializers[:5],
             ),
         ),
     )
@@ -1057,6 +1057,19 @@ def forward(
         intermediate_size: int,
         num_local_experts: int,
         num_tokens: int,
+        routing_bias: Optional[torch.Tensor] = None,
+        gemm1_bias: Optional[torch.Tensor] = None,
+        gemm1_alpha: Optional[torch.Tensor] = None,
+        gemm1_beta: Optional[torch.Tensor] = None,
+        gemm1_clamp_limit: Optional[torch.Tensor] = None,
+        gemm2_bias: Optional[torch.Tensor] = None,
+        output1_scale_scalar: Optional[torch.Tensor] = None,
+        output1_scale_gate_scalar: Optional[torch.Tensor] = None,
+        output2_scale_scalar: Optional[torch.Tensor] = None,
+        n_group: Optional[int] = None,
+        topk_group: Optional[int] = None,
+        local_expert_offset: int = 0,
+        routed_scaling_factor: Optional[float] = None,
         routing_method_type: int = 1,
         tactic: int = -1,
         do_preparation: bool = False,
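Every new parameter defaults to None (or 0 for local_expert_offset), so callers that only pass the original arguments keep the old behaviour. As a purely hypothetical invocation (sizes and values are placeholders, and the leading `inputs` positional is assumed to match the tuner call later in this diff):

# Hypothetical usage; dtype_act / dtype_weights come from deduce_trtllm_gen_tensor_dtype(...).
runner = MoERunner(
    top_k=8,
    num_experts=128,
    dtype_act=dtype_act,
    dtype_weights=dtype_weights,
    use_deepseek_fp8=False,
    tile_tokens_dim=8,
    tune_max_num_tokens=8192,
)
runner.forward(
    inputs,                      # assumed first positional: the tensor list built by the op
    hidden_size=7168,
    intermediate_size=2048,
    num_local_experts=128,
    num_tokens=256,
    n_group=8,                   # new optional routing-group arguments
    topk_group=4,
    local_expert_offset=0,       # new, defaults to 0
    routed_scaling_factor=None,  # new, still optional
    routing_method_type=1,
)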
@@ -1098,34 +1111,34 @@ def forward(
             routing_logits.to(torch.bfloat16),
             topk_ids,
             expert_weights,
-            None,  # routing_bias
+            routing_bias,
             hidden_states,
-            hidden_states_scale.reshape(-1),  # hidden_states_scale
+            hidden_states_scale,  # hidden_states_scale
             gemm1_weights,
             gemm1_weights_scale,
-            None,  # gemm1_bias
-            None,  # gemm1_alpha
-            None,  # gemm1_beta
-            None,  # gemm1_clamp_limit
+            gemm1_bias,
+            gemm1_alpha,
+            gemm1_beta,
+            gemm1_clamp_limit,
             gemm2_weights,
             gemm2_weights_scale,
-            None,  # gemm2_bias
-            None,  # output1_scale_scalar
-            None,  # output1_scale_gate_scalar
-            None,  # output2_scale_scalar
+            gemm2_bias,
+            output1_scale_scalar,
+            output1_scale_gate_scalar,
+            output2_scale_scalar,
             num_local_experts,
             self.top_k,
-            None,  # n_group
-            None,  # topk_group
+            n_group,
+            topk_group,
             intermediate_size,
-            0,  # local_expert_offset
+            local_expert_offset,
             num_local_experts,
-            None,  # routed_scaling_factor
-            tile_tokens_dim,  # tile_tokens_dim
-            routing_method_type,  # routing_method_type
+            routed_scaling_factor,
+            tile_tokens_dim,
+            routing_method_type,
             True,  # do_finalize
-            output,  # output
-            tactic,  # config_idx
+            output,
+            tactic,
         )

     @classmethod
@@ -1138,7 +1151,7 @@ def refine_tuning_config(cls, tune_max_num_tokens: int):
                     (0, 0, 0, 0, 0, 0),
                     get_last_power_of_2_num_tokens_buckets(tune_max_num_tokens),
                     lambda x: min(last_positive_power_of_2(x), tune_max_num_tokens),
-                    # cls.dynamic_tensor_initializers
+                    cls.dynamic_tensor_initializers,
                 ),
             )
         )
@@ -1149,7 +1162,7 @@ def refine_tuning_config(cls, tune_max_num_tokens: int):
                     (0, 0, 0, 0, 0),
                     get_last_power_of_2_num_tokens_buckets(tune_max_num_tokens),
                     lambda x: min(last_positive_power_of_2(x), tune_max_num_tokens),
-                    # cls.dynamic_tensor_initializers[:5]
+                    cls.dynamic_tensor_initializers[:5],
                 ),
             ),
         )
@@ -1378,69 +1391,64 @@ def trtllm_fp4_block_scale_moe_op(
     )

     tuner = AutoTuner.get()
-    if tuner.is_tuning_mode:
-        MoERunner.refine_tuning_config(tune_max_num_tokens)
-        dtype_act = deduce_trtllm_gen_tensor_dtype(
-            hidden_states, hidden_states_scale
-        )
-        dtype_weights = deduce_trtllm_gen_tensor_dtype(
-            gemm1_weights, gemm1_weights_scale
-        )
-        moe_runner = MoERunner(
-            top_k=top_k,
-            num_experts=num_experts,
-            dtype_act=dtype_act,
-            dtype_weights=dtype_weights,
-            use_deepseek_fp8=False,
-            tile_tokens_dim=tile_tokens_dim,
-            tune_max_num_tokens=tune_max_num_tokens,
-        )
-        tunning_config = (
-            MoERunner.tuning_config_no_hidden_states_scales
-            if hidden_states_scale is None
-            else MoERunner.tuning_config_with_hidden_states_scales
-        )
-        inputs = [
-            output,
-            routing_logits,
-            topk_ids,
-            expert_weights,
-            hidden_states,
-            gemm1_weights,
-            gemm2_weights,
-        ]
-        # hidden_states_scale should be in front of gemm1_weights_scale and gemm2_weights_scale
-        if hidden_states_scale is not None:
-            inputs.append(hidden_states_scale)
-        inputs.append(gemm1_weights_scale)
-        inputs.append(gemm2_weights_scale)
-
-        _, tactic = tuner.choose_one(
-            "flashinfer::trtllm_fp4_block_scale_moe",
-            [moe_runner],
-            tunning_config,
-            inputs,
-            hidden_size=hidden_size,
-            intermediate_size=intermediate_size,
-            num_local_experts=num_experts,
-            num_tokens=num_tokens,
-            routing_method_type=routing_method_type,
-        )
-        print(f"tactic: {tactic}")
-        default_tactic = moe_op.trtllm_get_default_moe_configs(
-            tile_tokens_dim,
-            dtype_act,
-            dtype_weights,
-            False,
-            top_k,
-            hidden_size,
-            intermediate_size,
-            num_experts,
-            num_tokens,
-        )
-        print(f"default_tactic: {default_tactic}")
-    else:
-        tactic = -1
+    MoERunner.refine_tuning_config(tune_max_num_tokens)
+    dtype_act = deduce_trtllm_gen_tensor_dtype(hidden_states, hidden_states_scale)
+    dtype_weights = deduce_trtllm_gen_tensor_dtype(
+        gemm1_weights, gemm1_weights_scale
+    )
+    moe_runner = MoERunner(
+        top_k=top_k,
+        num_experts=num_experts,
+        dtype_act=dtype_act,
+        dtype_weights=dtype_weights,
+        use_deepseek_fp8=False,
+        tile_tokens_dim=tile_tokens_dim,
+        tune_max_num_tokens=tune_max_num_tokens,
+    )
+    tunning_config = (
+        MoERunner.tuning_config_no_hidden_states_scales
+        if hidden_states_scale is None
+        else MoERunner.tuning_config_with_hidden_states_scales
+    )
+    inputs = [
+        output,
+        routing_logits,
+        topk_ids,
+        expert_weights,
+        hidden_states,
+        gemm1_weights,
+        gemm2_weights,
+    ]
+    # hidden_states_scale should be in front of gemm1_weights_scale and gemm2_weights_scale
+    if hidden_states_scale is not None:
+        inputs.append(hidden_states_scale)
+    inputs.append(gemm1_weights_scale)
+    inputs.append(gemm2_weights_scale)
+
+    _, tactic = tuner.choose_one(
+        "flashinfer::trtllm_fp4_block_scale_moe",
+        [moe_runner],
+        tunning_config,
+        inputs,
+        hidden_size=hidden_size,
+        intermediate_size=intermediate_size,
+        num_local_experts=num_experts,
+        num_tokens=num_tokens,
+        routing_bias=routing_bias,
+        gemm1_bias=gemm1_bias,
+        gemm1_alpha=gemm1_alpha,
+        gemm1_beta=gemm1_beta,
+        gemm1_clamp_limit=gemm1_clamp_limit,
+        gemm2_bias=gemm2_bias,
+        output1_scale_scalar=output1_scale_scalar,
+        output1_scale_gate_scalar=output1_scale_gate_scalar,
+        output2_scale_scalar=output2_scale_scalar,
+        n_group=n_group,
+        topk_group=topk_group,
+        local_expert_offset=local_expert_offset,
+        routed_scaling_factor=routed_scaling_factor,
+        routing_method_type=routing_method_type,
+    )

     # Call the C++ function for block scale MoE
     output = moe_op.trtllm_fp4_block_scale_moe(
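With the `if tuner.is_tuning_mode` guard and the debug prints removed, the tuner path always runs, and every optional kernel argument is forwarded to choose_one as a keyword. A loose sketch of why those kwargs matter, under the assumption that the tuner replays them into MoERunner.forward when timing each candidate tactic (names and structure below are illustrative, not the flashinfer AutoTuner implementation):

import time

def choose_one_sketch(runner, inputs, tactics, **kwargs):
    # Hedged sketch: time each candidate tactic with the same keyword arguments
    # the real forward receives, and keep the fastest one. Any optional kernel
    # input left out of the signature would silently fall back to its default
    # during profiling, which is why the diff threads them all through.
    best_tactic, best_time = -1, float("inf")
    for tactic in tactics:
        start = time.perf_counter()
        runner.forward(inputs, tactic=tactic, **kwargs)
        elapsed = time.perf_counter() - start
        if elapsed < best_time:
            best_tactic, best_time = tactic, elapsed
    return best_tactic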
@@ -1449,7 +1457,7 @@ def trtllm_fp4_block_scale_moe_op(
         expert_weights,
         routing_bias,
         hidden_states,
-        hidden_states_scale.reshape(-1),
+        hidden_states_scale,
         gemm1_weights,
         gemm1_weights_scale,
         gemm1_bias,