Skip to content

Commit e4fb808

Browse files
committed
upd
1 parent c4270bb commit e4fb808

File tree

3 files changed

+16
-14
lines changed

3 files changed

+16
-14
lines changed

flashinfer/fused_moe/core.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -944,8 +944,8 @@ class MoERunner(TunableRunner):
944944
DynamicTensorSpec(
945945
(0, 1, 2, 3, 4, 5),
946946
(0, 0, 0, 0, 0, 0),
947-
get_last_power_of_2_num_tokens_buckets(8192),
948-
lambda x: min(last_positive_power_of_2(x), 8192),
947+
get_last_power_of_2_num_tokens_buckets(1024, 8),
948+
lambda x: min(last_positive_power_of_2(x), 1024),
949949
dynamic_tensor_initializers,
950950
),
951951
)
@@ -955,8 +955,8 @@ class MoERunner(TunableRunner):
955955
DynamicTensorSpec(
956956
(0, 1, 2, 3, 4),
957957
(0, 0, 0, 0, 0),
958-
get_last_power_of_2_num_tokens_buckets(8192),
959-
lambda x: min(last_positive_power_of_2(x), 8192),
958+
get_last_power_of_2_num_tokens_buckets(1024, 8),
959+
lambda x: min(last_positive_power_of_2(x), 1024),
960960
dynamic_tensor_initializers[:5],
961961
),
962962
),
@@ -975,7 +975,6 @@ def __init__(
975975
hidden_size: int,
976976
intermediate_size: int,
977977
tile_tokens_dim: Optional[int] = None,
978-
tune_max_num_tokens: int = 8192,
979978
):
980979
self.num_experts = num_experts
981980
self.top_k = top_k
@@ -1360,7 +1359,7 @@ def trtllm_fp4_block_scale_moe_op(
13601359
tile_tokens_dim: int,
13611360
routing_method_type: int,
13621361
do_finalize: bool,
1363-
tune_max_num_tokens: int = 8192,
1362+
tune_max_num_tokens: int = 1024,
13641363
output: Optional[torch.Tensor] = None,
13651364
) -> List[torch.Tensor]:
13661365
if routing_logits is None:
@@ -1408,7 +1407,6 @@ def trtllm_fp4_block_scale_moe_op(
14081407
hidden_size=hidden_size,
14091408
intermediate_size=intermediate_size,
14101409
tile_tokens_dim=tile_tokens_dim,
1411-
tune_max_num_tokens=tune_max_num_tokens,
14121410
)
14131411
tunning_config = (
14141412
MoERunner.tuning_config_no_hidden_states_scales
@@ -1701,7 +1699,7 @@ def trtllm_fp4_block_scale_moe(
17011699
tile_tokens_dim: int = 8,
17021700
routing_method_type: int = 0,
17031701
do_finalize: bool = True,
1704-
tune_max_num_tokens: int = 8192,
1702+
tune_max_num_tokens: int = 1024,
17051703
output: Optional[torch.Tensor] = None,
17061704
) -> List[torch.Tensor]:
17071705
"""FP4 block scale MoE operation.
@@ -1745,6 +1743,7 @@ def trtllm_fp4_block_scale_moe(
17451743
- 3: Llama4 (Top1 -> Sigmoid)
17461744
- 4: RenormalizeNaive (Softmax -> TopK -> Renormalize)
17471745
do_finalize (bool): Whether to finalize the output (default: True)
1746+
tune_max_num_tokens (int): Maximum number of tokens for tuning (default: 1024)
17481747
output (Optional[torch.Tensor]): shape [seq_len, hidden_size]
17491748
Optional inplace output tensor.
17501749
@@ -1815,7 +1814,7 @@ def trtllm_fp4_block_scale_routed_moe(
18151814
tile_tokens_dim: int = 8,
18161815
routing_method_type: int = 0,
18171816
do_finalize: bool = True,
1818-
tune_max_num_tokens: int = 8192,
1817+
tune_max_num_tokens: int = 1024,
18191818
output: Optional[torch.Tensor] = None,
18201819
) -> List[torch.Tensor]:
18211820
"""FP4 block scale MoE operation.
@@ -1861,6 +1860,7 @@ def trtllm_fp4_block_scale_routed_moe(
18611860
- 3: Llama4 (Top1 -> Sigmoid)
18621861
- 4: RenormalizeNaive (Softmax -> TopK -> Renormalize)
18631862
do_finalize (bool): Whether to finalize the output (default: True)
1863+
tune_max_num_tokens (int): Maximum number of tokens for tuning (default: 1024)
18641864
output (Optional[torch.Tensor]): shape [seq_len, hidden_size]
18651865
Optional inplace output tensor.
18661866

flashinfer/fused_moe/utils.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -203,11 +203,13 @@ def get_power_of_2_num_tokens_buckets(max_num_tokens) -> Tuple[int]:
203203
return tuple(num_token_buckets)
204204

205205

206-
def get_last_power_of_2_num_tokens_buckets(max_num_tokens) -> Tuple[int]:
206+
def get_last_power_of_2_num_tokens_buckets(
207+
max_num_tokens, min_num_tokens=1
208+
) -> Tuple[int]:
207209
max_num_tokens = last_positive_power_of_2(max_num_tokens)
208210
num_token_buckets = []
209211
m = max_num_tokens
210-
while m >= 1:
212+
while m >= min_num_tokens:
211213
num_token_buckets.append(m)
212214
m //= 2
213215
return tuple(num_token_buckets)

tests/test_trtllm_gen_fused_moe.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1733,9 +1733,9 @@ def cache_permute_indices():
17331733
@pytest.mark.parametrize(
17341734
"moe_impl",
17351735
[
1736-
pytest.param(FP4Moe(quant_mode=QuantMode.FP4_NVFP4_NVFP4), id="NvFP4 x NvFP4"),
1737-
pytest.param(FP4Moe(quant_mode=QuantMode.FP4_MXFP4_MXFP8), id="MxFP4 x MxFP8"),
1738-
pytest.param(FP4Moe(quant_mode=QuantMode.FP4_MXFP4_Bf16), id="MxFP4 x Bf16"),
1736+
pytest.param(FP4Moe(quant_mode=QuantMode.FP4_NVFP4_NVFP4), id="NvFP4xNvFP4"),
1737+
pytest.param(FP4Moe(quant_mode=QuantMode.FP4_MXFP4_MXFP8), id="MxFP4xMxFP8"),
1738+
pytest.param(FP4Moe(quant_mode=QuantMode.FP4_MXFP4_Bf16), id="MxFP4xBf16"),
17391739
pytest.param(FP8BlockScaleMoe(), id="FP8_Block"),
17401740
pytest.param(FP8PerTensorMoe(), id="FP8_Tensor"),
17411741
],

0 commit comments

Comments
 (0)