Commit f5f1c55
fix test_trtllm_gen_fused_moe.py

Signed-off-by: jiahanc <[email protected]>
Parent: cdd6389

3 files changed: +4 additions, −36 deletions

csrc/trtllm_batched_gemm_runner.cu

1 addition, 1 deletion

@@ -169,7 +169,7 @@ void TrtllmGenBatchedGemmRunner::run(
   auto const configs = bmm.getBatchedGemmConfigs();
 
   auto const& config = configs[configIndex];
-
+  // std::cout << "Running GEMM with config: " << config.mFunctionName << std::endl;
   FLASHINFER_CHECK(numBatches > 0, "Batched GEMM requires numBatches > 0");
   if (!mOptions.staticBatch) {
     FLASHINFER_CHECK(totalNumPaddedTokens,

flashinfer/fused_moe/core.py

2 additions, 1 deletion

@@ -1345,6 +1345,7 @@ def trtllm_fp4_block_scale_moe_op(
     if hidden_states_scale is not None:
         inputs.append(hidden_states_scale)
 
+    print(f"fp4 block scale moe tunning start")
     _, tactic = tuner.choose_one(
         "flashinfer::trtllm_fp4_block_scale_moe",
         [moe_runner],
@@ -1373,7 +1374,7 @@ def trtllm_fp4_block_scale_moe_op(
         do_finalize=do_finalize,
         gated_act_type=gated_act_type,
     )
-
+    print(f"fp4 block scale moe tunning end with tactic {tactic}")
     # Call the C++ function for block scale MoE
     output = moe_op.trtllm_fp4_block_scale_moe(
         routing_logits,
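
The two print calls added here bracket the autotuner call, so it is visible when tuning of the FP4 block-scale MoE op starts and which tactic choose_one settles on. Not part of this commit: a minimal sketch, using only the Python standard library, of how the same trace could go through logging and be switched on by an environment variable instead of printing unconditionally. The FLASHINFER_TUNING_DEBUG variable and the _log_tactic helper are hypothetical names used for illustration.

import logging
import os

# Hypothetical helper, not part of flashinfer: emit tuning traces only when
# FLASHINFER_TUNING_DEBUG=1 is set, instead of unconditional print() calls.
logger = logging.getLogger("flashinfer.fused_moe.tuning")
if os.environ.get("FLASHINFER_TUNING_DEBUG") == "1":
    logging.basicConfig(level=logging.DEBUG)


def _log_tactic(stage: str, tactic=None) -> None:
    # stage is e.g. "start" or "end"; tactic is whatever choose_one returned.
    if tactic is None:
        logger.debug("fp4 block scale moe tuning %s", stage)
    else:
        logger.debug("fp4 block scale moe tuning %s with tactic %s", stage, tactic)

# At the call site this would be _log_tactic("start") before choose_one and
# _log_tactic("end", tactic) after it.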

tests/moe/test_trtllm_gen_fused_moe.py

1 addition, 34 deletions

@@ -17,7 +17,6 @@
 from abc import ABC, abstractmethod
 from enum import IntEnum
 from typing import Dict
-
 import pytest
 import torch
 from cuda.bindings import runtime
@@ -1839,7 +1838,7 @@ def cache_permute_indices():
 
 @pytest.mark.parametrize("num_tokens", [1, 8, 1024])
 @pytest.mark.parametrize("hidden_size", [1024, 8192])
-@pytest.mark.parametrize("intermediate_size", [2048, 1024, 768, 384])
+@pytest.mark.parametrize("intermediate_size", [384, 768, 1024, 2048])
 @pytest.mark.parametrize(
     "moe_impl",
     [
@@ -2244,35 +2243,3 @@ def test_moe_quantization_classes(
         rtol=tolerances["rtol"],
         percent=tolerances["percent"],
     )
-
-
-if __name__ == "__main__":
-    # pytest.main([__file__, "-v"])
-    routing_config = {
-        "num_experts": 256,
-        "top_k": 8,
-        "padding": 8,
-        "n_groups": 8,
-        "top_k_groups": 4,
-        "routed_scaling": 2.5,
-        "has_routing_bias": True,
-        "routing_method_type": RoutingMethodType.DeepSeekV3,
-        "compatible_moe_impls": [
-            FP8BlockScaleMoe,
-        ],
-    }
-    weight_processing = {
-        "use_shuffled_weight": False,
-        "layout": WeightLayout.MajorK,
-        "compatible_moe_impls": [FP8BlockScaleMoe],
-    }
-    test_moe_quantization_classes(
-        num_tokens=4,
-        hidden_size=1024,
-        intermediate_size=1024,
-        moe_impl=FP8BlockScaleMoe(),
-        routing_config=routing_config,
-        weight_processing=weight_processing,
-        gated_act_type=GatedActType.SwiGlu,
-        cache_permute_indices=cache_permute_indices,
-    )
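
The block removed at the bottom of the file was an ad-hoc __main__ driver that built a DeepSeekV3 routing config by hand and called test_moe_quantization_classes directly. Not part of this commit: a minimal sketch of running the same test through pytest instead, which also honors fixtures and parametrization; the -k filter shown is an assumption based only on the test name visible above.

# Hypothetical stand-alone runner, not part of the commit: select the
# quantization-classes test via pytest instead of calling it by hand.
import sys

import pytest

if __name__ == "__main__":
    sys.exit(
        pytest.main(
            [
                "tests/moe/test_trtllm_gen_fused_moe.py",
                "-v",
                # -k filters by test name; the exact parametrized ID for a
                # specific FP8 block-scale case is an assumption and may differ.
                "-k", "test_moe_quantization_classes",
            ]
        )
    )

One reason the direct call was fragile is that cache_permute_indices appears to be a pytest fixture (see the cache_permute_indices definition referenced in the hunk header above), so passing the function object itself as an argument bypasses fixture resolution; letting pytest drive the test avoids that.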
