@@ -631,6 +631,7 @@ def _fake_cutlass_fused_moe_sm100(
     use_mxfp8_act_scaling: bool = False,
     min_latency_mode: bool = False,
     tune_max_num_tokens: int = 8192,
+    enable_pdl: Optional[bool] = None,
 ):
     seq_len = input.shape[0]
     hidden_size = fc2_expert_weights.shape[1]
@@ -1081,6 +1082,8 @@ def forward(
         local_expert_offset: int,
         routed_scaling_factor: Optional[float],
         routing_method_type: int,
+        enable_pdl: bool,
+        do_finalize: bool,
         tactic: int = -1,
         do_preparation: bool = False,
     ):
@@ -1153,7 +1156,8 @@ def forward(
             routed_scaling_factor,
             tile_tokens_dim,
             routing_method_type,
-            True,  # do_finalize
+            enable_pdl,
+            do_finalize,
             output,
             tactic,
         )
@@ -1473,6 +1477,8 @@ def trtllm_fp4_block_scale_moe_op(
         local_expert_offset=local_expert_offset,
         routed_scaling_factor=routed_scaling_factor,
         routing_method_type=routing_method_type,
+        enable_pdl=enable_pdl,
+        do_finalize=do_finalize,
     )

     # Call the C++ function for block scale MoE
@@ -1544,6 +1550,9 @@ def _fake_trtllm_fp4_block_scale_moe(
     tile_tokens_dim: int,
     routing_method_type: int,
     do_finalize: bool,
+    enable_pdl: bool,
+    tune_max_num_tokens: int,
+    output: Optional[torch.Tensor],
 ):
     seq_len = hidden_states.shape[0]
     hidden_size = hidden_states.shape[1]
@@ -1730,6 +1739,7 @@ def trtllm_fp4_block_scale_moe(
     tile_tokens_dim: int = 8,
     routing_method_type: int = 0,
     do_finalize: bool = True,
+    enable_pdl: Optional[bool] = None,
     tune_max_num_tokens: int = 1024,
     output: Optional[torch.Tensor] = None,
 ) -> List[torch.Tensor]:
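This hunk adds an `enable_pdl: Optional[bool] = None` keyword to the public `trtllm_fp4_block_scale_moe` signature, but the diff does not show how a `None` value is turned into a concrete flag before it reaches the kernel. Below is a minimal, purely illustrative sketch of one way such a default could be resolved; the helper name `_resolve_enable_pdl` and the SM90+ capability check are assumptions, not taken from this codebase.

# Hypothetical sketch, not part of this diff: resolving enable_pdl=None
# to an explicit bool. The SM90+ check is an assumption for illustration.
from typing import Optional

import torch


def _resolve_enable_pdl(enable_pdl: Optional[bool], device: torch.device) -> bool:
    # An explicit True/False from the caller wins; None means "auto-detect".
    if enable_pdl is not None:
        return enable_pdl
    # Assumption: programmatic dependent launch (PDL) is only worthwhile on
    # newer architectures, so infer it from the device compute capability.
    major, _ = torch.cuda.get_device_capability(device)
    return major >= 9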
@@ -1782,6 +1792,7 @@ def trtllm_fp4_block_scale_moe(
         List[torch.Tensor]: List of output tensors. If do_finalize=True, returns the final MoE output.
             Otherwise, returns intermediate results (gemm2_output, expert_weights, expanded_idx_to_permuted_idx) that need further processing.
     """
+    print(f"in trtllm_fp4_block_scale_moe, tune_max_num_tokens={tune_max_num_tokens}")
     return get_trtllm_moe_sm100_module().trtllm_fp4_block_scale_moe(
         routing_logits,
         None,
@@ -1812,6 +1823,7 @@ def trtllm_fp4_block_scale_moe(
         tile_tokens_dim,
         routing_method_type,
         do_finalize,
+        enable_pdl,
         tune_max_num_tokens,
         output,
     )
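The docstring in the hunk at line 1782 states that with `do_finalize=True` the op returns the final MoE output, and with `do_finalize=False` it returns the intermediate tensors `(gemm2_output, expert_weights, expanded_idx_to_permuted_idx)`. The sketch below only illustrates unpacking under that documented contract; the helper `unpack_moe_outputs` is hypothetical and does not call the real op.

# Hedged sketch based solely on the docstring quoted above.
from typing import List, Tuple, Union

import torch


def unpack_moe_outputs(
    outputs: List[torch.Tensor], do_finalize: bool
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
    if do_finalize:
        # Finalized path: a single tensor holding the final MoE output.
        (final_output,) = outputs
        return final_output
    # Unfinalized path: intermediate results that still need post-processing.
    gemm2_output, expert_weights, expanded_idx_to_permuted_idx = outputs
    return gemm2_output, expert_weights, expanded_idx_to_permuted_idx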