Merged
Changes from all commits
25 commits
46205b4
Using ActivationType instead of GatedActType, added compiled kernels,…
amitz-nv Jan 28, 2026
b8eac34
Add actType and eltwiseActType to 'no kernel found' message, move is_…
amitz-nv Jan 28, 2026
f771e0c
Update remaining GatedActType uses to ActivationType, remove GatedAct…
amitz-nv Jan 28, 2026
440c062
Use ActivationType in benchmarks, add missing activation_type argument
amitz-nv Jan 28, 2026
5725739
Minor fixes
amitz-nv Jan 28, 2026
c2c8531
Fix activation_type default value to Swiglu on trtllm_fp4_block_scale…
amitz-nv Jan 28, 2026
bb4e821
Minor improvement
amitz-nv Jan 28, 2026
c6ac4af
Support non-gated activation in NVFP4 block scale MoE
amitz-nv Jan 28, 2026
3bf918e
Rename useShuffledMatrixA to useShuffledMatrix (remove the 'A' suffix)
amitz-nv Jan 28, 2026
1193b02
Add FP4_NVFP4_NVFP4 parameterization to test_llama4_routing, update t…
amitz-nv Jan 28, 2026
b0e6d59
Increase supported topK and num experts in deepseek routing for nemotron
amitz-nv Jan 28, 2026
d4182ae
Commit more files for increase supported topK and num experts in deep…
amitz-nv Jan 28, 2026
8ee2193
Fix formatting
amitz-nv Jan 28, 2026
c899d16
Change TODO to comment
amitz-nv Jan 28, 2026
0f6f15c
Change default activation_type to Swiglu
amitz-nv Jan 28, 2026
cf6f76b
Restore intermediate size factor of 2 for gated activation in getWork…
amitz-nv Jan 28, 2026
e63e17d
Formatting fixes
amitz-nv Jan 28, 2026
8398e20
Treat SwigluBias as gated activation
amitz-nv Jan 28, 2026
ea67cef
Fix use of ActivationType enum in CLI
amitz-nv Jan 28, 2026
abefe22
Fix activation-type command line argument handling in benchmarks
amitz-nv Jan 29, 2026
da35764
Fix choices of activation-type command line argument handling in benc…
amitz-nv Jan 29, 2026
205989f
GEMM (non batched) still has mUseShuffledMatrixA member (with 'A' suf…
amitz-nv Jan 29, 2026
e467f1d
Update bench_trtllm_gen_fused_moe_autotuner.py to support more activa…
amitz-nv Jan 29, 2026
80d1b53
Revert activation_Type check in bench_trtllm_gen_fused_moe_autotuner.…
amitz-nv Jan 29, 2026
21e0e08
Include activation type in results in benchmarks/routings/moe.py
amitz-nv Jan 29, 2026
26 changes: 24 additions & 2 deletions benchmarks/bench_trtllm_gen_fused_moe_autotuner.py
@@ -4,7 +4,7 @@
import numpy as np
from flashinfer import (
RoutingMethodType,
GatedActType,
ActivationType,
fp4_quantize,
mxfp8_quantize,
)
@@ -17,6 +17,7 @@
from flashinfer.autotuner import autotune
from flashinfer.testing.utils import bench_gpu_time
from flashinfer.utils import device_support_pdl
from routines.flashinfer_benchmark_utils import enum_type

FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
FLOAT4_E2M1_MAX = 6.0
@@ -39,6 +40,7 @@ def bench_trtllm_gen_fused_moe_autotuner_fp8(
top_k: int,
warmups: int,
iterations: int,
activation_type: ActivationType,
):
device = torch.device("cuda:0")
enable_pdl = device_support_pdl(device)
@@ -97,6 +99,10 @@ def bench_trtllm_gen_fused_moe_autotuner_fp8(
)

if is_block_scale:
if activation_type != ActivationType.Swiglu:
raise ValueError(
"Only Swiglu activation is supported for FP8 block scale MoE."
)
fn = lambda: trtllm_fp8_block_scale_moe(
routing_logits,
routing_bias,
@@ -144,6 +150,7 @@ def bench_trtllm_gen_fused_moe_autotuner_fp8(
RoutingMethodType.TopK.value,
enable_pdl,
num_tokens if tune_max_num_tokens is None else tune_max_num_tokens,
activation_type.value,
)

def bench(do_autotune):
@@ -175,6 +182,7 @@ def bench_trtllm_gen_fused_moe_autotuner_fp4(
top_k: int,
warmups: int,
iterations: int,
activation_type: ActivationType,
):
device = torch.device("cuda:0")
enable_pdl = device_support_pdl(device)
@@ -234,6 +242,10 @@ def bench_trtllm_gen_fused_moe_autotuner_fp4(
w13_global_scale = 1.0 / 448.0 / 6.0
w2_global_scale = 1.0 / 448.0 / 6.0
else:
if activation_type == ActivationType.Relu2:
raise ValueError(
"Relu2 activation is supported for FP4 only with 'NvFP4xNvFP4' quant mode"
)
w13, w13_scale = fp4_quantize(
w13, torch.tensor([1.0], device=device), sf_vec_size=32, sf_use_ue8m0=True
)
@@ -288,7 +300,7 @@ def bench_trtllm_gen_fused_moe_autotuner_fp4(
RoutingMethodType.Renormalize.value,
True,
enable_pdl,
GatedActType.SwiGlu.value, # gated_act_type
activation_type.value, # act_type
None,
num_tokens if tune_max_num_tokens is None else tune_max_num_tokens,
)
@@ -348,6 +360,14 @@ def bench(do_autotune):
parser.add_argument(
"--iterations", type=int, default=100, help="Number of benchmark iterations"
)
parser.add_argument(
"--activation-type",
type=enum_type(ActivationType),
metavar=str([e.name for e in ActivationType]),
required=False,
default=ActivationType.Swiglu,
help=f"Type of activation function: {[e.name for e in ActivationType]}",
)
args = parser.parse_args()
if args.quant_mode in ["Fp8-Per-Tensor", "Fp8-Block"]:
bench_trtllm_gen_fused_moe_autotuner_fp8(
@@ -360,6 +380,7 @@ def bench(do_autotune):
args.top_k,
args.warmups,
args.iterations,
args.activation_type,
)
else:
bench_trtllm_gen_fused_moe_autotuner_fp4(
@@ -372,4 +393,5 @@ def bench(do_autotune):
args.top_k,
args.warmups,
args.iterations,
args.activation_type,
)
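
For reference, the new flag can be exercised directly from the command line. A hedged example: only --activation-type and --iterations are confirmed by this diff, every other option is left at its default, and member names are matched case-insensitively by the enum_type converter added below (so swiglu works as well as Swiglu):

python benchmarks/bench_trtllm_gen_fused_moe_autotuner.py --activation-type Swiglu --iterations 100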
16 changes: 16 additions & 0 deletions benchmarks/routines/flashinfer_benchmark_utils.py
@@ -1,3 +1,4 @@
import argparse
import torch

from flashinfer.testing.utils import set_seed
@@ -453,3 +454,18 @@ def filter_backends_by_compute_capability(backends, routine, device):
f"[WARNING] {backend} for routine {routine} is not supported on compute capability {compute_capability}. Skipping."
)
return backends


def enum_type(enum_class):
"""Generic factory for argparse enum types."""

def converter(value):
try:
lower_name_to_member = {m.name.lower(): m for m in enum_class}
return lower_name_to_member[value.lower()]
except KeyError as e:
raise argparse.ArgumentTypeError(
f"Invalid value '{value}'. Must be one of: {', '.join([m.name for m in enum_class])}"
) from e

return converter
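
To make the behavior of enum_type concrete, here is a minimal, self-contained sketch of how it plugs into argparse. ExampleActivation is a stand-in enum used purely for illustration; the benchmarks above pass the real flashinfer.ActivationType, and the import path mirrors the one used in the benchmark script:

import argparse
import enum

from routines.flashinfer_benchmark_utils import enum_type  # same import path as the benchmark script


class ExampleActivation(enum.Enum):
    # Stand-in members for illustration only; not the real flashinfer.ActivationType.
    Swiglu = 0
    Geglu = 1
    Relu2 = 2


parser = argparse.ArgumentParser()
parser.add_argument(
    "--activation-type",
    type=enum_type(ExampleActivation),
    metavar=str([e.name for e in ExampleActivation]),
    default=ExampleActivation.Swiglu,
)

# Name matching is case-insensitive: 'relu2', 'Relu2', and 'RELU2' all resolve to the same member.
args = parser.parse_args(["--activation-type", "relu2"])
assert args.activation_type is ExampleActivation.Relu2

# An unrecognized name raises argparse.ArgumentTypeError inside the converter,
# which argparse surfaces as a standard usage error.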
29 changes: 14 additions & 15 deletions benchmarks/routines/moe.py
@@ -5,6 +5,7 @@
import torch

import flashinfer
from flashinfer import ActivationType
from flashinfer.autotuner import autotune
from flashinfer.fused_moe import (
WeightLayout,
@@ -23,6 +24,7 @@

from .flashinfer_benchmark_utils import (
dtype_str_to_torch_dtype,
enum_type,
get_device,
print_perf_metrics,
filter_backends_by_compute_capability,
@@ -175,12 +177,12 @@ def parse_moe_args(line, parser):
help="Data type of the weights (before quantization).",
)
parser.add_argument(
"--gated_act",
type=str,
"--activation-type",
type=enum_type(ActivationType),
metavar=str([e.name for e in ActivationType]),
required=False,
default="swiglu",
choices=["swiglu", "geglu"],
help="Type of gated activation function: swiglu | geglu.",
default=ActivationType.Swiglu,
help=f"Type of activation function: {[e.name for e in ActivationType]}",
)
parser.add_argument(
"--autotune",
@@ -247,13 +249,6 @@ def parse_moe_args(line, parser):
}
args.routing_method_type = routing_method_name_to_type[args.routing_method]

# Normalize gated act type (map string to internal int expected by kernels)
gated_act_name_to_type = {
"swiglu": 0,
"geglu": 1,
}
args.gated_act_type = gated_act_name_to_type[args.gated_act]

if args.verbose >= 1:
print(f"[INFO] {args = }")
return args
@@ -630,7 +625,7 @@ def testTrtllmFp4BlockScaleMoe(args):
use_shuffled_weight = args.use_shuffled_weight
weight_layout = args.weight_layout
is_cuda_graph_compatible = not args.no_cuda_graph
gated_act_type = args.gated_act_type
activation_type = args.activation_type
res = []

backends = ["trtllm"]
@@ -795,7 +790,7 @@ def run_fp4_moe(
local_num_experts=local_num_experts,
routed_scaling_factor=routed_scaling_factor,
routing_method_type=routing_method_type,
gated_act_type=gated_act_type,
activation_type=activation_type.value,
do_finalize=True,
)

@@ -900,7 +895,7 @@ def run_fp4_moe(
cur_res["use_routing_scales_on_input"] = args.use_routing_scales_on_input
cur_res["input_dtype"] = input_dtype
cur_res["weight_dtype"] = weight_dtype
cur_res["gated_act"] = args.gated_act
cur_res["activation_type"] = args.activation_type.name
res.append(cur_res)

return res
@@ -1671,6 +1666,7 @@ def run_fp8_per_tensor_moe(
output1_scales_gate_scalar,
gemm2_weights_fp8,
output2_scales_scalar,
activation_type,
):
# Note: FP8 per-tensor MOE expects int64_t for n_group/topk_group, not Optional[int64_t]
# So we convert None to 0 to indicate "no groups" mode
@@ -1693,6 +1689,7 @@ def run_fp8_per_tensor_moe(
routed_scaling_factor=routed_scaling_factor,
use_routing_scales_on_input=use_routing_scales_on_input,
routing_method_type=routing_method_type,
activation_type=activation_type.value,
)

# Benchmark timing
@@ -1713,6 +1710,7 @@ def run_fp8_per_tensor_moe(
output1_scales_gate_scalar,
gemm2_weights_fp8,
output2_scales_scalar,
args.activation_type,
),
)

@@ -1764,6 +1762,7 @@ def run_fp8_per_tensor_moe(
cur_res["use_routing_scales_on_input"] = use_routing_scales_on_input
cur_res["input_dtype"] = input_dtype
cur_res["weight_dtype"] = weight_dtype
cur_res["activation_type"] = args.activation_type.name
res.append(cur_res)

return res
10 changes: 8 additions & 2 deletions csrc/trtllm_batched_gemm_runner.cu
@@ -101,14 +101,16 @@ TrtllmGenBatchedGemmRunner::TrtllmGenBatchedGemmRunner(
options.mTransposeMmaOutput == mOptions.transposeMmaOutput &&
(!doesRouteImplUseNoRoute(options.mRouteImpl)) == mOptions.routeAct &&
options.mFusedAct == mOptions.fusedAct && options.mIsStaticBatch == mOptions.staticBatch &&
tileSize == mOptions.tileSize &&
options.mUseShuffledMatrix == mOptions.useShuffledMatrixA &&
tileSize == mOptions.tileSize && options.mUseShuffledMatrix == mOptions.useShuffledMatrix &&
options.mLayoutA == mOptions.weightLayout) {
if (options.mFusedAct) {
if (options.mActType != static_cast<batchedGemm::gemmGatedAct::ActType>(mOptions.actType)) {
continue;
}
}
if ((int64_t)options.mEltwiseActType != (int64_t)mOptions.eltwiseActType) {
continue;
}

if (mOptions.transposeMmaOutput && options.mEpilogueTileM == mOptions.epilogueTileM) {
mPassingConfigIndices.push_back(i);
@@ -122,6 +124,8 @@
<< ", mDtypeB: " << tg::dtypeToString(mOptions.dtypeB)
<< ", mDtypeC: " << tg::dtypeToString(mOptions.dtypeC)
<< ", mUseDeepSeekFp8: " << mOptions.deepSeekFp8
<< ", mActType: " << (int64_t)mOptions.actType
<< ", mEltwiseActType: " << (int64_t)mOptions.eltwiseActType
<< ", mTransposeMmaOutput: " << mOptions.transposeMmaOutput
<< ", mRouteAct: " << mOptions.routeAct << ", mFusedAct: " << mOptions.fusedAct
<< ", mIsStaticBatch: " << mOptions.staticBatch << ", mTileSize: " << mOptions.tileSize;
@@ -219,6 +223,8 @@ void TrtllmGenBatchedGemmRunner::run(
gemmData.mInputBuffers.mPtrSfB = mOptions.transposeMmaOutput ? sfA : sfB;
gemmData.mInputBuffers.mPtrScaleC = scaleC;
gemmData.mInputBuffers.mPtrScaleGate = scaleGateC;
// For simplicity, set scaleAct to scaleGateC
gemmData.mInputBuffers.mPtrScaleAct = scaleGateC;
Contributor Author @amitz-nv commented on Jan 7, 2026:
Decide whether it's OK or fix in the future?

gemmData.mInputBuffers.mPtrPerTokenSfA =
mOptions.transposeMmaOutput ? perTokensSfB : perTokensSfA;
gemmData.mInputBuffers.mPtrPerTokenSfB =