
Commit 9cae727

yumin066 and rosenrodt authored
[https://nvbugs/5726962][feat] Apply fusion for W4AFP8_AWQ MoE (#9838)
Signed-off-by: Min Yu <171526537+yumin066@users.noreply.github.com>
Signed-off-by: Anthony Chang <27950904+rosenrodt@users.noreply.github.com>
Co-authored-by: Anthony Chang <27950904+rosenrodt@users.noreply.github.com>
1 parent 6b8ae6f commit 9cae727

9 files changed: +792 -336 lines changed


cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h

Lines changed: 23 additions & 8 deletions
@@ -315,7 +315,8 @@ struct QuantParams
     {
         struct GroupwiseGemmInputs
         {
-            void const* act_scales = nullptr;
+            bool use_per_expert_act_scale = false;
+            void const* act_scales = nullptr; // (1 or num_experts_per_node, hidden_size or intermediate_size)
             void const* weight_scales = nullptr;
             void const* weight_zeros = nullptr;
             float const* alpha = nullptr;
@@ -401,12 +402,15 @@ struct QuantParams
     static QuantParams GroupWise(int group_size, void const* fc1_weight_scales, void const* fc2_weight_scales,
         void const* fc1_activation_scales = nullptr, void const* fc2_activation_scales = nullptr,
         void const* fc1_weight_zeros = nullptr, void const* fc2_weight_zeros = nullptr,
-        float const* fc1_alpha = nullptr, float const* fc2_alpha = nullptr)
+        float const* fc1_alpha = nullptr, float const* fc2_alpha = nullptr, bool fc1_use_per_expert_act_scale = false,
+        bool fc2_use_per_expert_act_scale = false)
     {
         QuantParams qp;
         qp.groupwise.group_size = group_size;
-        qp.groupwise.fc1 = {fc1_activation_scales, fc1_weight_scales, fc1_weight_zeros, fc1_alpha};
-        qp.groupwise.fc2 = {fc2_activation_scales, fc2_weight_scales, fc2_weight_zeros, fc2_alpha};
+        qp.groupwise.fc1
+            = {fc1_use_per_expert_act_scale, fc1_activation_scales, fc1_weight_scales, fc1_weight_zeros, fc1_alpha};
+        qp.groupwise.fc2
+            = {fc2_use_per_expert_act_scale, fc2_activation_scales, fc2_weight_scales, fc2_weight_zeros, fc2_alpha};
         return qp;
     }

@@ -646,7 +650,7 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface
         int64_t const hidden_size, int64_t const inter_size, int const num_experts_per_node,
         ActivationParams fc1_activation_type, float const** alpha_scale_ptr_array, bool bias_is_broadcast,
         cudaStream_t stream, cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode,
-        int* num_active_experts_per, int* active_expert_global_ids);
+        int* num_active_experts_per, int* active_expert_global_ids, void const* fc2_prequant_scale = nullptr);

     static void gemm2(MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>& gemm_runner,
         DeepSeekBlockScaleGemmRunner* fp8_blockscale_gemm_runner, T const* const input, void* const gemm_output,
@@ -803,6 +807,16 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface
         bool min_latency_mode, bool use_awq);

 private:
+    static bool useAwq(cutlass_kernels::QuantParams const& quant_params)
+    {
+        return quant_params.groupwise.fc1.act_scales && quant_params.groupwise.fc2.act_scales && !use_wfp4a16;
+    }
+
+    static bool usePrequantScaleKernel(cutlass_kernels::QuantParams const& quant_params)
+    {
+        return useAwq(quant_params) && !std::is_same_v<T, WeightType>;
+    }
+
     bool mayHaveDifferentGEMMOutputType() const
     {
         // We just check if its supported because we need to know when calculating workspace size
@@ -813,13 +827,13 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface
     bool mayHaveFinalizeFused() const
     {
         return moe_gemm_runner_.supportsTmaWarpSpecialized() && moe_gemm_runner_.getSM() >= 90 && use_fused_finalize_
-            && !use_w4_groupwise;
+            && !use_wfp4a16;
     }

     static bool mayHaveFinalizeFused(int sm)
     {
         using RunnerType = decltype(moe_gemm_runner_);
-        return RunnerType::supportsTmaWarpSpecialized(sm) && sm >= 90 && !use_w4_groupwise;
+        return RunnerType::supportsTmaWarpSpecialized(sm) && sm >= 90 && !use_wfp4a16;
     }

     // TODO: This should eventually take the quant params to give more flexibility
@@ -866,7 +880,8 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface

     T const* applyPrequantScale(void* smoothed_act, void const* permuted_data, void const* prequant_scales,
         int64_t const* num_valid_tokens_ptr, int64_t const expanded_num_rows, int64_t const seq_len, bool const use_awq,
-        cudaStream_t stream, int64_t* expert_first_token_offset = nullptr, int const num_experts_per_node = 0);
+        cudaStream_t stream, QuantParams const& quant_params, int64_t* expert_first_token_offset = nullptr,
+        int const num_experts_per_node = 0);

     MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType> moe_gemm_runner_;
     std::unique_ptr<DeepSeekBlockScaleGemmRunner> blockscale_gemm_runner_;
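As a side note on the GroupWise() change above, here is a minimal, self-contained sketch of the same pattern. The types are hypothetical stand-ins, not the TensorRT-LLM structs: the new per-expert activation-scale flags are appended as defaulted parameters and stored as the first field of each FC input block, so existing call sites keep compiling while new callers can opt in per GEMM.

```cpp
#include <cstdio>

// Stand-ins for the real structs; the field order mirrors the diff above
// (the bool comes first so the aggregate brace lists match the new code).
struct GroupwiseGemmInputsSketch
{
    bool use_per_expert_act_scale = false;
    void const* act_scales = nullptr; // (1 or num_experts_per_node, hidden or intermediate size)
    void const* weight_scales = nullptr;
};

struct QuantParamsSketch
{
    int group_size = 0;
    GroupwiseGemmInputsSketch fc1;
    GroupwiseGemmInputsSketch fc2;

    // New flags are appended with defaults, so older call sites still compile.
    static QuantParamsSketch GroupWise(int group_size, void const* fc1_weight_scales, void const* fc2_weight_scales,
        void const* fc1_act_scales = nullptr, void const* fc2_act_scales = nullptr,
        bool fc1_use_per_expert_act_scale = false, bool fc2_use_per_expert_act_scale = false)
    {
        QuantParamsSketch qp;
        qp.group_size = group_size;
        qp.fc1 = {fc1_use_per_expert_act_scale, fc1_act_scales, fc1_weight_scales};
        qp.fc2 = {fc2_use_per_expert_act_scale, fc2_act_scales, fc2_weight_scales};
        return qp;
    }
};

int main()
{
    float fc1_ws[8]{}, fc2_ws[8]{}, fc1_as[8]{}, fc2_as[8]{};
    // Legacy-style call: the per-expert flags default to false.
    auto legacy = QuantParamsSketch::GroupWise(128, fc1_ws, fc2_ws, fc1_as, fc2_as);
    // Opt-in call: one activation scale per expert for both FC1 and FC2.
    auto per_expert = QuantParamsSketch::GroupWise(128, fc1_ws, fc2_ws, fc1_as, fc2_as, true, true);
    std::printf("legacy=%d per_expert=%d\n", int(legacy.fc1.use_per_expert_act_scale),
        int(per_expert.fc1.use_per_expert_act_scale));
    return 0;
}
```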

cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.h

Lines changed: 3 additions & 2 deletions
@@ -28,8 +28,9 @@ namespace cutlass_kernels_oss
 {
 using tensorrt_llm::kernels::cutlass_kernels::GroupedGemmInput;
 using tensorrt_llm::kernels::cutlass_kernels::TmaWarpSpecializedGroupedGemmInput;
-template <typename T, typename WeightType, typename GemmOutputType, typename EpilogueTag, typename CTAShape,
-    typename ClusterShape, typename MainloopScheduleType, typename EpilogueScheduleType,
+using EpilogueFusion = TmaWarpSpecializedGroupedGemmInput::EpilogueFusion;
+template <typename T, typename WeightType, typename GemmOutputType, typename EpilogueTag, EpilogueFusion FUSION,
+    typename CTAShape, typename ClusterShape, typename MainloopScheduleType, typename EpilogueScheduleType,
     cutlass::WeightOnlyQuantOp QuantOp>
 void sm90_generic_mixed_moe_gemm_kernelLauncher(
     tensorrt_llm::kernels::cutlass_kernels::GroupedGemmInput<T, WeightType, GemmOutputType, GemmOutputType> inputs,

cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl

Lines changed: 71 additions & 16 deletions
@@ -45,6 +45,7 @@
 #include "cutlass/util/tensor_view_io.h"

 #include "cutlass_extensions/compute_occupancy.h"
+#include "cutlass_extensions/epilogue/fusion/sm90_visitor_scatter.hpp"
 #include "cutlass_extensions/epilogue_helpers.h"
 #include "cutlass_extensions/gemm/collective/collective_builder_mixed_input.hpp"
 #include "cutlass_extensions/gemm_configs.h"
@@ -71,11 +72,12 @@ namespace cutlass_kernels_oss
 using namespace tensorrt_llm::kernels::cutlass_kernels;
 namespace tk = tensorrt_llm::common;
 namespace tkc = tensorrt_llm::cutlass_extensions;
+using EpilogueFusion = TmaWarpSpecializedGroupedGemmInput::EpilogueFusion;

 using namespace cute;

-template <typename T, typename WeightType, typename GemmOutputType, typename EpilogueTag, typename CTAShape,
-    typename ClusterShape, typename MainloopScheduleType, typename EpilogueScheduleType,
+template <typename T, typename WeightType, typename GemmOutputType, typename EpilogueTag, EpilogueFusion FUSION,
+    typename CTAShape, typename ClusterShape, typename MainloopScheduleType, typename EpilogueScheduleType,
     cutlass::WeightOnlyQuantOp QuantOp>
 void sm90_generic_mixed_moe_gemm_kernelLauncher(GroupedGemmInput<T, WeightType, GemmOutputType, GemmOutputType> inputs,
     TmaWarpSpecializedGroupedGemmInput hopper_inputs, int sm_count_, size_t* workspace_size)
@@ -85,6 +87,9 @@ void sm90_generic_mixed_moe_gemm_kernelLauncher(GroupedGemmInput<T, WeightType,
     /////////////////////////////////////////////////////////////////////////////////////////////////
     /// GEMM kernel configurations
     /////////////////////////////////////////////////////////////////////////////////////////////////
+    static_assert(FUSION == EpilogueFusion::NONE || FUSION == EpilogueFusion::FINALIZE,
+        "Unimplemented fusion provided to TMA WS Mixed MoE gemm launcher");
+    constexpr static bool IsFinalizeFusion = FUSION == EpilogueFusion::FINALIZE;

     // A matrix configuration
     using ElementA = typename TllmToCutlassTypeAdapter<T>::type;
@@ -129,13 +134,21 @@ void sm90_generic_mixed_moe_gemm_kernelLauncher(GroupedGemmInput<T, WeightType,
     using ElementD = ElementC;
     using LayoutD = LayoutC;
     constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
+    using ElementFinalOutput = typename TllmToCutlassTypeAdapter<GemmOutputType>::type;
+    using ElementBias = ElementFinalOutput;
+    using ElementRouterScales = float;

     // Core kernel configurations
     using ElementAccumulator = float; // Element type for internal accumulation
     using ArchTag = cutlass::arch::Sm90; // Tag indicating the minimum SM that supports the intended feature
     using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag
     using TileShape = CTAShape; // Threadblock-level tile size
     using StageCountType = cutlass::gemm::collective::StageCountAuto; // Stage count maximized based on the tile size
+
+    using EpilogueFusionOp = cutlass::epilogue::fusion::ScaledAccPerRowBiasPerColScaleScatter<
+        typename cutlass::layout::LayoutTranspose<LayoutD>::type, ElementFinalOutput, ElementAccumulator, ElementBias,
+        ElementRouterScales>;
+
     using KernelSchedule
         = std::conditional_t<std::is_same_v<MainloopScheduleType, cutlass::gemm::KernelTmaWarpSpecializedPingpong>,
             cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpong,
@@ -145,12 +158,21 @@ void sm90_generic_mixed_moe_gemm_kernelLauncher(GroupedGemmInput<T, WeightType,
             cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong,
             cutlass::epilogue::PtrArrayTmaWarpSpecializedCooperative>; // Epilogue to launch

-    using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<cutlass::arch::Sm90,
+    using CollectiveEpilogueFinalize = typename cutlass::epilogue::collective::CollectiveBuilder<cutlass::arch::Sm90,
+        cutlass::arch::OpClassTensorOp, TileShape, ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto,
+        ElementAccumulator, ElementAccumulator, ElementC, typename cutlass::layout::LayoutTranspose<LayoutC>::type*,
+        AlignmentC, void, typename cutlass::layout::LayoutTranspose<LayoutD>::type*, AlignmentD, EpilogueSchedule,
+        EpilogueFusionOp>::CollectiveOp;
+
+    using CollectiveEpilogueDefault = typename cutlass::epilogue::collective::CollectiveBuilder<cutlass::arch::Sm90,
         cutlass::arch::OpClassTensorOp, TileShape, ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto,
         ElementAccumulator, ElementAccumulator, ElementC, typename cutlass::layout::LayoutTranspose<LayoutC>::type*,
         AlignmentC, ElementD, typename cutlass::layout::LayoutTranspose<LayoutD>::type*, AlignmentD,
         EpilogueSchedule>::CollectiveOp;

+    using CollectiveEpilogue
+        = std::conditional_t<IsFinalizeFusion, CollectiveEpilogueFinalize, CollectiveEpilogueDefault>;
+
     // =========================================================== MIXED INPUT WITH SCALES
     // =========================================================================== The Scale information must get paired
     // with the operand that will be scaled. In this example, B is scaled so we make a tuple of B's information and the
@@ -175,20 +197,56 @@ void sm90_generic_mixed_moe_gemm_kernelLauncher(GroupedGemmInput<T, WeightType,
     Args arguments;

     decltype(arguments.epilogue.thread) fusion_args;
-    fusion_args.alpha = use_wfp4a16 ? 1 : 0;
-    fusion_args.beta = 0;
-    fusion_args.alpha_ptr = nullptr;
-    fusion_args.beta_ptr = nullptr;
-    fusion_args.alpha_ptr_array = use_wfp4a16 ? nullptr : inputs.alpha_scales;
-    fusion_args.beta_ptr_array = nullptr;
-    // One alpha and beta per each group
-    fusion_args.dAlpha = {cute::_0{}, cute::_0{}, use_wfp4a16 ? 0 : 1};
-    fusion_args.dBeta = {cute::_0{}, cute::_0{}, use_wfp4a16 ? 0 : 1};

     cutlass::KernelHardwareInfo hw_info;
     hw_info.device_id = 0;
     hw_info.sm_count = sm_count_;

+    using EpilogueArguments = typename CollectiveEpilogue::Arguments;
+    using EpilogueScalars = decltype(EpilogueArguments{}.thread);
+    EpilogueScalars epilogue_scalars = [&]
+    {
+        if constexpr (IsFinalizeFusion)
+        {
+            auto epi_params = hopper_inputs.fused_finalize_epilogue;
+            return EpilogueScalars{ElementAccumulator(1), nullptr, hopper_inputs.alpha_scale_ptr_array,
+                Stride<_0, _0, int64_t>{cute::_0{}, cute::_0{}, 1}, /* alpha */
+                reinterpret_cast<ElementBias const* const*>(epi_params.ptr_bias), Stride<_1, _0, int64_t>{}, /* bias */
+                epi_params.ptr_router_scales, Stride<_0, _1, int64_t>{}, /* scale */
+                reinterpret_cast<ElementFinalOutput*>(epi_params.ptr_final_output),
+                epi_params.stride_final_output_transposed, epi_params.ptr_source_token_index,
+                epi_params.num_rows_in_final_output, epi_params.shape_override, epi_params.use_reduction};
+        }
+        else
+        {
+            return EpilogueScalars{};
+        }
+    }();
+
+    EpilogueArguments epilogue_args = [&]
+    {
+        if constexpr (IsFinalizeFusion)
+        {
+            return EpilogueArguments{epilogue_scalars, nullptr, nullptr, nullptr, nullptr};
+        }
+        else
+        {
+            fusion_args.alpha = use_wfp4a16 ? 1 : 0;
+            fusion_args.beta = 0;
+            fusion_args.alpha_ptr = nullptr;
+            fusion_args.beta_ptr = nullptr;
+            fusion_args.alpha_ptr_array = use_wfp4a16 ? nullptr : inputs.alpha_scales;
+            fusion_args.beta_ptr_array = nullptr;
+            // One alpha and beta per each group
+            fusion_args.dAlpha = {cute::_0{}, cute::_0{}, use_wfp4a16 ? 0 : 1};
+            fusion_args.dBeta = {cute::_0{}, cute::_0{}, use_wfp4a16 ? 0 : 1};
+
+            return EpilogueArguments{fusion_args, reinterpret_cast<ElementC const**>(hopper_inputs.ptr_c),
+                reinterpret_cast<StrideC*>(hopper_inputs.stride_c), reinterpret_cast<ElementD**>(hopper_inputs.ptr_d),
+                reinterpret_cast<StrideD*>(hopper_inputs.stride_d)};
+        }
+    }();
+
     arguments = Args{cutlass::gemm::GemmUniversalMode::kGrouped,
         {inputs.num_experts, hopper_inputs.int4_groupwise_params.shape.problem_shapes, nullptr},
         {reinterpret_cast<ElementB const**>(hopper_inputs.ptr_weight),
@@ -197,10 +255,7 @@ void sm90_generic_mixed_moe_gemm_kernelLauncher(GroupedGemmInput<T, WeightType,
         reinterpret_cast<StrideA*>(hopper_inputs.stride_act),
         reinterpret_cast<ElementScalePacked const**>(hopper_inputs.int4_groupwise_params.ptr_s_a),
         reinterpret_cast<StrideS*>(hopper_inputs.int4_groupwise_params.stride_s_a), group_size},
-        {fusion_args, reinterpret_cast<ElementC const**>(hopper_inputs.ptr_c),
-            reinterpret_cast<StrideC*>(hopper_inputs.stride_c), reinterpret_cast<ElementD**>(hopper_inputs.ptr_d),
-            reinterpret_cast<StrideD*>(hopper_inputs.stride_d)},
-        hw_info};
+        epilogue_args, hw_info};

     assert(group_size == int(inputs.groupwise_quant_group_size));
     if (workspace_size != nullptr)
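To make the shape of the new launcher code above easier to follow, here is a compile-time-dispatch sketch in the same style; the type and field names are illustrative placeholders, not the CUTLASS or TensorRT-LLM ones. The idea mirrored from the diff: the fused and default epilogues take different argument structs, so the launcher picks the struct type with std::conditional_t and fills it inside an immediately-invoked lambda guarded by if constexpr, so only the branch matching the fusion mode is ever instantiated.

```cpp
#include <cstdio>
#include <type_traits>

enum class EpilogueFusion { NONE, FINALIZE };

// Placeholder argument structs with deliberately different layouts,
// standing in for the default and finalize-fusion epilogue arguments.
struct DefaultEpilogueArgs { float alpha; float beta; };
struct FinalizeEpilogueArgs { float const* router_scales; float* final_output; };

template <EpilogueFusion FUSION>
void launch_sketch(float const* router_scales, float* final_output)
{
    static_assert(FUSION == EpilogueFusion::NONE || FUSION == EpilogueFusion::FINALIZE,
        "Unimplemented fusion provided to the sketch launcher");
    constexpr bool IsFinalizeFusion = FUSION == EpilogueFusion::FINALIZE;

    // Compile-time selection of the argument type, as in the real launcher.
    using EpilogueArguments = std::conditional_t<IsFinalizeFusion, FinalizeEpilogueArgs, DefaultEpilogueArgs>;

    // Immediately-invoked lambda: only the branch matching FUSION is instantiated,
    // so each argument type sees only the fields it actually has.
    EpilogueArguments epilogue_args = [&]
    {
        if constexpr (IsFinalizeFusion)
        {
            return EpilogueArguments{router_scales, final_output};
        }
        else
        {
            return EpilogueArguments{1.0f, 0.0f};
        }
    }();
    (void) epilogue_args;

    std::printf("launched with finalize fusion: %d\n", int(IsFinalizeFusion));
}

int main()
{
    float scales[4]{}, out[4]{};
    launch_sketch<EpilogueFusion::NONE>(scales, out);
    launch_sketch<EpilogueFusion::FINALIZE>(scales, out);
    return 0;
}
```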

cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h

Lines changed: 20 additions & 7 deletions
@@ -792,25 +792,37 @@ void MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>::dispatchToArch(
         TLLM_CHECK_WITH_INFO(
             inputs.gemm_config.is_tma_warp_specialized, "w4afp8 is only supported for TMA warp specialization");
         // EpilogueTag is ignored
+#define SM90_DISPATCH_MOE_MIXED_GEMM_TO_CUTLASS_SELECT_FINALIZE(SCALE_FACTOR)                                         \
+    if (hopper_inputs.fusion == TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::FINALIZE)                         \
+    {                                                                                                                 \
+        cutlass_kernels_oss::sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass<T, WeightType, ScaleBiasType,             \
+            cutlass_extensions::EpilogueOpDefault, TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::FINALIZE,     \
+            SCALE_FACTOR>(inputs, hopper_inputs, multi_processor_count_, nullptr);                                    \
+    }                                                                                                                 \
+    else                                                                                                              \
+    {                                                                                                                 \
+        cutlass_kernels_oss::sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass<T, WeightType, ScaleBiasType,             \
+            cutlass_extensions::EpilogueOpDefault, TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::NONE,         \
+            SCALE_FACTOR>(inputs, hopper_inputs, multi_processor_count_, nullptr);                                    \
+    }

         if (inputs.k % 512 == 0)
         {
-            cutlass_kernels_oss::sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass<T, WeightType, ScaleBiasType,
-                cutlass_extensions::EpilogueOpDefault, 4>(inputs, hopper_inputs, multi_processor_count_, nullptr);
+            SM90_DISPATCH_MOE_MIXED_GEMM_TO_CUTLASS_SELECT_FINALIZE(4)
         }
         else if (inputs.k % 256 == 0)
         {
-            cutlass_kernels_oss::sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass<T, WeightType, ScaleBiasType,
-                cutlass_extensions::EpilogueOpDefault, 2>(inputs, hopper_inputs, multi_processor_count_, nullptr);
+            SM90_DISPATCH_MOE_MIXED_GEMM_TO_CUTLASS_SELECT_FINALIZE(2)
         }
         else if (inputs.k % 128 == 0)
         {
-            cutlass_kernels_oss::sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass<T, WeightType, ScaleBiasType,
-                cutlass_extensions::EpilogueOpDefault, 1>(inputs, hopper_inputs, multi_processor_count_, nullptr);
+            SM90_DISPATCH_MOE_MIXED_GEMM_TO_CUTLASS_SELECT_FINALIZE(1)
         }
         else
        {
             TLLM_THROW("Invalid GEMM K size %d", (int) inputs.k);
         }
+#undef SM90_DISPATCH_MOE_MIXED_GEMM_TO_CUTLASS_SELECT_FINALIZE
         return;
     }

@@ -820,7 +832,8 @@
             inputs.gemm_config.is_tma_warp_specialized, "wfp4a16 is only supported for TMA warp specialization");
         // EpilogueTag is ignored
         cutlass_kernels_oss::sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass<T, WeightType, ScaleBiasType,
-            cutlass_extensions::EpilogueOpDefault, 1>(inputs, hopper_inputs, multi_processor_count_, nullptr);
+            cutlass_extensions::EpilogueOpDefault, TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::NONE, 1>(
+            inputs, hopper_inputs, multi_processor_count_, nullptr);
         return;
     }
 #endif
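For readers unfamiliar with the macro introduced above, here is a minimal sketch of the same runtime-to-compile-time dispatch; all names are placeholders, not the TensorRT-LLM ones. The fusion mode is only known at runtime, but the launcher needs it as a template argument, so each K-alignment branch expands a small if/else that instantiates the launcher once per fusion value.

```cpp
#include <cstdio>

enum class EpilogueFusion { NONE, FINALIZE };

// Placeholder launcher: the fusion mode and scale packing factor are
// compile-time template parameters, as in the real dispatch.
template <EpilogueFusion FUSION, int ScalePackFactor>
void dispatch_sketch(int k)
{
    std::printf("k=%d pack=%d finalize=%d\n", k, ScalePackFactor, int(FUSION == EpilogueFusion::FINALIZE));
}

// Maps the runtime fusion flag to a compile-time template argument,
// mirroring SM90_DISPATCH_MOE_MIXED_GEMM_TO_CUTLASS_SELECT_FINALIZE.
#define DISPATCH_SELECT_FINALIZE(SCALE_FACTOR)                                                                        \
    if (fusion == EpilogueFusion::FINALIZE)                                                                           \
    {                                                                                                                 \
        dispatch_sketch<EpilogueFusion::FINALIZE, SCALE_FACTOR>(k);                                                   \
    }                                                                                                                 \
    else                                                                                                              \
    {                                                                                                                 \
        dispatch_sketch<EpilogueFusion::NONE, SCALE_FACTOR>(k);                                                       \
    }

void dispatch_by_k(int k, EpilogueFusion fusion)
{
    // The K-alignment branches pick the scale packing factor, as in the diff.
    if (k % 512 == 0) { DISPATCH_SELECT_FINALIZE(4) }
    else if (k % 256 == 0) { DISPATCH_SELECT_FINALIZE(2) }
    else if (k % 128 == 0) { DISPATCH_SELECT_FINALIZE(1) }
    else { std::printf("invalid K %d\n", k); }
}

#undef DISPATCH_SELECT_FINALIZE

int main()
{
    dispatch_by_k(4096, EpilogueFusion::FINALIZE); // k % 512 == 0 -> pack factor 4, finalize path
    dispatch_by_k(384, EpilogueFusion::NONE);      // k % 128 == 0 -> pack factor 1, default path
    return 0;
}
```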
