flashinfer-ai · aleozlx · Jul 21, 2025 · Jul 22, 2025 · Jul 23, 2025 · Jul 23, 2025
diff --git a/benchmarks/flashinfer_benchmark.py b/benchmarks/flashinfer_benchmark.py
@@ -4,6 +4,7 @@
 from routines.attention import parse_attention_args, run_attention_test
 from routines.flashinfer_benchmark_utils import full_output_columns, output_column_dict
 from routines.gemm import parse_gemm_args, run_gemm_test
+from routines.moe import parse_moe_args, run_moe_test
 
 
 def run_test(args):
@@ -26,6 +27,12 @@ def run_test(args):
         "group_gemm_fp8_nt_groupwise",
     ]:
         res = run_gemm_test(args)
+    elif args.routine in [
+        "trtllm_fp4_block_scale_moe",
+        "trtllm_fp8_block_scale_moe",
+        "trtllm_fp8_per_tensor_scale_moe",
+    ]:
+        res = run_moe_test(args)
     else:
         raise ValueError(f"Unsupported routine: {args.routine}")
 
@@ -69,6 +76,9 @@ def parse_args(line=sys.argv[1:]):
             "BatchPrefillWithRaggedKVCacheWrapper",
             "gemm_fp8_nt_groupwise",
             "group_gemm_fp8_nt_groupwise",
+            "trtllm_fp4_block_scale_moe",
+            "trtllm_fp8_block_scale_moe",
+            "trtllm_fp8_per_tensor_scale_moe",
         ],
     )
     args, _ = parser.parse_known_args(line[:])
@@ -133,6 +143,12 @@ def parse_args(line=sys.argv[1:]):
         "group_gemm_fp8_nt_groupwise",
     ]:
         args = parse_gemm_args(line, parser)
+    elif args.routine in [
+        "trtllm_fp4_block_scale_moe",
+        "trtllm_fp8_block_scale_moe",
+        "trtllm_fp8_per_tensor_scale_moe",
+    ]:
+        args = parse_moe_args(line, parser)
     else:
         raise ValueError(f"Unsupported routine: {args.routine}")
 

diff --git a/benchmarks/routines/flashinfer_benchmark_utils.py b/benchmarks/routines/flashinfer_benchmark_utils.py
@@ -33,6 +33,20 @@
         "out_dtype",
         "mma_sm",
     ],
+    "moe": [
+        "num_tokens",
+        "hidden_size",
+        "intermediate_size",
+        "num_experts",
+        "top_k",
+        "n_groups",
+        "top_k_groups",
+        "routing_method_type",
+        "routed_scaling_factor",
+        "tile_tokens_dim",
+        "use_shuffled_weight",
+        "weight_layout",
+    ],
     "general": [
         "refcheck",
         "no_cuda_graph",
@@ -45,5 +59,6 @@
     output_column_dict["perf"]
     + output_column_dict["attention"]
     + output_column_dict["gemm"]
+    + output_column_dict["moe"]
     + output_column_dict["general"]
 )