@@ -921,28 +921,28 @@ class MoERunner(TunableRunner):
    dynamic_tensor_initializers = [
        lambda shapes, dtype, device: torch.empty(
            shapes, device=device, dtype=dtype
-        ),  # output buffer
+        ),  # output buffer, [num_tokens, hidden_size]
        lambda shapes, dtype, device: torch.rand(
            shapes, device=device, dtype=dtype
-        ),  # routing_logits
+        ),  # routing_logits, [num_tokens, num_experts]
        lambda shapes, dtype, device: torch.empty(
            shapes, device=device, dtype=dtype
-        ),  # topk_ids buffer. empty since routing_logits is used
+        ),  # topk_ids buffer. empty since routing_logits is used. [num_tokens, topk]
        lambda shapes, dtype, device: torch.empty(
            shapes, device=device, dtype=dtype
-        ),  # expert_weights buffer. empty since routing_logits is used
+        ),  # expert_weights buffer. empty since routing_logits is used. [num_tokens, topk]
        lambda shapes, dtype, device: torch.randn(shapes, device=device).to(
            dtype
-        ),  # hidden_states
+        ),  # hidden_states, [num_tokens, hidden_size]
        lambda shapes, dtype, device: torch.ones(shapes, device=device).to(
            dtype
-        ),  # hidden_states_scale
+        ),  # hidden_states_scale, [num_tokens, hidden_size // sf_vec_size]
    ]
    # their first dimension is num_tokens which will be tuned
    tuning_config_with_hidden_states_scales = TuningConfig(
        dynamic_tensor_specs=(
            DynamicTensorSpec(
-                (0, 1, 2, 3, 4, 7),
+                (0, 1, 2, 3, 4, 5),
                (0, 0, 0, 0, 0, 0),
                get_last_power_of_2_num_tokens_buckets(8192),
                lambda x: min(last_positive_power_of_2(x), 8192),
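The renumbering above follows from the smaller dynamic inputs list: once gemm1_weights and gemm2_weights no longer travel through it, the six tensors whose leading dimension is num_tokens occupy indices 0 through 5, so hidden_states_scale moves from index 7 to index 5. A minimal sketch of the layout the new DynamicTensorSpec assumes (illustrative only, not part of the patch):

# Index-to-tensor mapping assumed by the new DynamicTensorSpec (illustrative sketch).
dynamic_input_layout = {
    0: "output",               # [num_tokens, hidden_size]
    1: "routing_logits",       # [num_tokens, num_experts]
    2: "topk_ids",             # [num_tokens, topk]
    3: "expert_weights",       # [num_tokens, topk]
    4: "hidden_states",        # [num_tokens, hidden_size]
    5: "hidden_states_scale",  # [num_tokens, hidden_size // sf_vec_size]
}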
@@ -972,6 +972,8 @@ def __init__(
        dtype_act: DtypeTrtllmGen,
        dtype_weights: DtypeTrtllmGen,
        use_deepseek_fp8: bool,
+        hidden_size: int,
+        intermediate_size: int,
        tile_tokens_dim: Optional[int] = None,
        tune_max_num_tokens: int = 8192,
    ):
@@ -981,6 +983,8 @@ def __init__(
        self.dtype_weights = dtype_weights
        self.use_deepseek_fp8 = use_deepseek_fp8
        self.top_k = top_k
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
        self.tile_tokens_dim = tile_tokens_dim

    def get_tile_tokens_dim(self, num_tokens: int, top_k: int):
@@ -1016,17 +1020,8 @@ def get_valid_tactics(
            topk_ids,
            expert_weights,
            hidden_states,
-            gemm1_weights,
-            gemm2_weights,
            *extra_inputs,
        ) = inputs
-        hidden_size = hidden_states.shape[1]
-        if (
-            self.dtype_act == DtypeTrtllmGen.E2m1
-            or self.dtype_act == DtypeTrtllmGen.MxE2m1
-        ):  # packed into uint8
-            hidden_size *= 2
-        intermediate_size = gemm1_weights.shape[1] // 2
        num_tokens = routing_logits.shape[0]
        tile_tokens_dim = (
            self.get_tile_tokens_dim(num_tokens, self.top_k)
@@ -1039,8 +1034,8 @@ def get_valid_tactics(
            self.dtype_weights,
            self.use_deepseek_fp8,
            self.top_k,
-            hidden_size,
-            intermediate_size,
+            self.hidden_size,
+            self.intermediate_size,
            self.num_experts,
            num_tokens,
        )
@@ -1053,24 +1048,25 @@ def get_valid_tactics(
    def forward(
        self,
        inputs: List[torch.Tensor],
-        hidden_size: int,
-        intermediate_size: int,
        num_local_experts: int,
-        num_tokens: int,
-        routing_bias: Optional[torch.Tensor] = None,
-        gemm1_bias: Optional[torch.Tensor] = None,
-        gemm1_alpha: Optional[torch.Tensor] = None,
-        gemm1_beta: Optional[torch.Tensor] = None,
-        gemm1_clamp_limit: Optional[torch.Tensor] = None,
-        gemm2_bias: Optional[torch.Tensor] = None,
-        output1_scale_scalar: Optional[torch.Tensor] = None,
-        output1_scale_gate_scalar: Optional[torch.Tensor] = None,
-        output2_scale_scalar: Optional[torch.Tensor] = None,
-        n_group: Optional[int] = None,
-        topk_group: Optional[int] = None,
-        local_expert_offset: int = 0,
-        routed_scaling_factor: Optional[float] = None,
-        routing_method_type: int = 1,
+        routing_bias: Optional[torch.Tensor],
+        gemm1_weights: torch.Tensor,
+        gemm1_weights_scale: Optional[torch.Tensor],
+        gemm1_bias: Optional[torch.Tensor],
+        gemm1_alpha: Optional[torch.Tensor],
+        gemm1_beta: Optional[torch.Tensor],
+        gemm1_clamp_limit: Optional[torch.Tensor],
+        gemm2_weights: torch.Tensor,
+        gemm2_weights_scale: Optional[torch.Tensor],
+        gemm2_bias: Optional[torch.Tensor],
+        output1_scale_scalar: Optional[torch.Tensor],
+        output1_scale_gate_scalar: Optional[torch.Tensor],
+        output2_scale_scalar: Optional[torch.Tensor],
+        n_group: Optional[int],
+        topk_group: Optional[int],
+        local_expert_offset: int,
+        routed_scaling_factor: Optional[float],
+        routing_method_type: int,
        tactic: int = -1,
        do_preparation: bool = False,
    ):
@@ -1080,10 +1076,9 @@ def forward(
            topk_ids,
            expert_weights,
            hidden_states,
-            gemm1_weights,
-            gemm2_weights,
            *extra_inputs,
        ) = inputs
+        num_tokens = routing_logits.shape[0]
        tile_tokens_dim = (
            self.get_tile_tokens_dim(num_tokens, self.top_k)
            if self.tile_tokens_dim is None
@@ -1092,19 +1087,27 @@ def forward(

        extra_input_idx = 0
        if trtllm_gen_dtype_has_scale(self.dtype_act):
-            hidden_states_scale = (
-                extra_inputs[extra_input_idx].view(torch.float8_e4m3fn).reshape(-1)
-            )
+            hidden_states_scale = extra_inputs[extra_input_idx]
            extra_input_idx += 1
        else:
            hidden_states_scale = None
-        if trtllm_gen_dtype_has_scale(self.dtype_weights):
-            gemm1_weights_scale = extra_inputs[extra_input_idx]
-            gemm2_weights_scale = extra_inputs[extra_input_idx + 1]
-            extra_input_idx += 2
-        else:
-            gemm1_weights_scale = None
-            gemm2_weights_scale = None
+        # sanity checks to ensure that dynamic tensors have the correct shapes
+        assert output.shape[0] == num_tokens, (
+            "output's first dimension must be batch size."
+        )
+        assert topk_ids.shape[0] == num_tokens, (
+            "topk_ids's first dimension must be batch size."
+        )
+        assert expert_weights.shape[0] == num_tokens, (
+            "expert_weights's first dimension must be batch size."
+        )
+        assert hidden_states.shape[0] == num_tokens, (
+            "hidden_states's first dimension must be batch size."
+        )
+        assert (
+            hidden_states_scale is None
+            or hidden_states_scale.shape[0] == num_tokens
+        ), "hidden_states_scale's first dimension must be batch size"

        # TODO(siyuan): support fp8
        moe_op.trtllm_fp4_block_scale_moe(
@@ -1126,11 +1129,11 @@ def forward(
            output1_scale_scalar,
            output1_scale_gate_scalar,
            output2_scale_scalar,
-            num_local_experts,
+            self.num_experts,
            self.top_k,
            n_group,
            topk_group,
-            intermediate_size,
+            self.intermediate_size,
            local_expert_offset,
            num_local_experts,
            routed_scaling_factor,
@@ -1147,7 +1150,7 @@ def refine_tuning_config(cls, tune_max_num_tokens: int):
        cls.tuning_config_with_hidden_states_scales = TuningConfig(
            dynamic_tensor_specs=(
                DynamicTensorSpec(
-                    (0, 1, 2, 3, 4, 7),
+                    (0, 1, 2, 3, 4, 5),
                    (0, 0, 0, 0, 0, 0),
                    get_last_power_of_2_num_tokens_buckets(tune_max_num_tokens),
                    lambda x: min(last_positive_power_of_2(x), tune_max_num_tokens),
@@ -1402,6 +1405,8 @@ def trtllm_fp4_block_scale_moe_op(
        dtype_act=dtype_act,
        dtype_weights=dtype_weights,
        use_deepseek_fp8=False,
+        hidden_size=hidden_size,
+        intermediate_size=intermediate_size,
        tile_tokens_dim=tile_tokens_dim,
        tune_max_num_tokens=tune_max_num_tokens,
    )
@@ -1416,29 +1421,25 @@ def trtllm_fp4_block_scale_moe_op(
        topk_ids,
        expert_weights,
        hidden_states,
-        gemm1_weights,
-        gemm2_weights,
    ]
-    # hidden_states_scale should be in front of gemm1_weights_scale and gemm2_weights_scale
    if hidden_states_scale is not None:
        inputs.append(hidden_states_scale)
-    inputs.append(gemm1_weights_scale)
-    inputs.append(gemm2_weights_scale)

    _, tactic = tuner.choose_one(
        "flashinfer::trtllm_fp4_block_scale_moe",
        [moe_runner],
        tunning_config,
        inputs,
-        hidden_size=hidden_size,
-        intermediate_size=intermediate_size,
        num_local_experts=num_experts,
-        num_tokens=num_tokens,
        routing_bias=routing_bias,
+        gemm1_weights=gemm1_weights,
+        gemm1_weights_scale=gemm1_weights_scale,
        gemm1_bias=gemm1_bias,
        gemm1_alpha=gemm1_alpha,
        gemm1_beta=gemm1_beta,
        gemm1_clamp_limit=gemm1_clamp_limit,
+        gemm2_weights=gemm2_weights,
+        gemm2_weights_scale=gemm2_weights_scale,
        gemm2_bias=gemm2_bias,
        output1_scale_scalar=output1_scale_scalar,
        output1_scale_gate_scalar=output1_scale_gate_scalar,
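Taken together with the runner changes, the call site now fixes hidden_size and intermediate_size when the runner is constructed and forwards the weight and scale tensors as keyword arguments instead of dynamic inputs. A condensed sketch of the call path assumed from the diff (names taken from the hunks above, not a verbatim excerpt; arguments not shown in the diff are elided):

# Condensed sketch of the updated call path in trtllm_fp4_block_scale_moe_op
# (assumed from the diff above, not a verbatim excerpt).
moe_runner = MoERunner(
    # num_experts / top_k / other constructor arguments are unchanged and elided here
    dtype_act=dtype_act,
    dtype_weights=dtype_weights,
    use_deepseek_fp8=False,
    hidden_size=hidden_size,              # new: fixed at construction time
    intermediate_size=intermediate_size,  # new: no longer derived from gemm1_weights.shape
    tile_tokens_dim=tile_tokens_dim,
    tune_max_num_tokens=tune_max_num_tokens,
)

# Only tensors whose first dimension is num_tokens remain in the dynamic inputs.
inputs = [output, routing_logits, topk_ids, expert_weights, hidden_states]
if hidden_states_scale is not None:
    inputs.append(hidden_states_scale)

# Static weights and their scales travel as keyword arguments into forward().
_, tactic = tuner.choose_one(
    "flashinfer::trtllm_fp4_block_scale_moe",
    [moe_runner],
    tunning_config,
    inputs,
    num_local_experts=num_experts,
    routing_bias=routing_bias,
    gemm1_weights=gemm1_weights,
    gemm1_weights_scale=gemm1_weights_scale,
    gemm2_weights=gemm2_weights,
    gemm2_weights_scale=gemm2_weights_scale,
    # remaining gemm bias/alpha/beta, output scale, and routing keyword
    # arguments continue exactly as shown in the diff
)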