From 5a9f69724a38123a65464c95a116e695d83f3ce4 Mon Sep 17 00:00:00 2001
From: Duncan Moss <djm.moss@gmail.com>
Date: Tue, 5 Aug 2025 15:24:30 -0700
Subject: [PATCH 01/12] add mxfp8 x mxfp4 cutlass fused moe

---
 .../cutlass_fused_moe_instantiation.cu        |   2 +
 .../cutlass_fused_moe_kernels.cuh             | 326 ++++++---
 .../flashinfer_cutlass_fused_moe_sm100_ops.cu | 174 +++--
 csrc/nv_internal/cpp/kernels/quantization.cu  | 212 ++----
 .../kernels/cutlass_kernels/include/common.h  |   2 +-
 .../include/moe_gemm_kernels.h                |  20 +-
 .../cutlass_kernels/include/moe_kernels.h     |  90 ++-
 .../moe_gemm_tma_ws_mixed_input_launcher.inl  |  30 +-
 .../moe_gemm/moe_gemm_kernels_bf16_fp4.cu     |  24 +
 .../moe_gemm/moe_gemm_kernels_fp16_fp4.cu     |  22 +
 .../moe_gemm/moe_gemm_template_dispatch.h     | 103 ++-
 ...emm_template_dispatch_tma_ws_mixed_dtype.h |  14 +-
 .../moe_tma_warp_specialized_traits.h         |   5 +-
 .../tensorrt_llm/kernels/quantization.cuh     | 653 +++++++-----------
 .../tensorrt_llm/kernels/quantization.h       |  42 +-
 csrc/nv_internal/tensorrt_llm/thop/fp4Op.cpp  |  45 +-
 .../tensorrt_llm/thop/fp4Quantize.cpp         |  44 +-
 .../tensorrt_llm/thop/fp4Quantize.h           |   3 +-
 .../tensorrt_llm/thop/fp8Quantize.cpp         |  77 ++-
 .../tensorrt_llm/thop/fp8Quantize.h           |   9 +-
 csrc/trtllm_allreduce_fusion.cu               |   4 +-
 csrc/trtllm_fused_moe_kernel_launcher.cu      |   4 +-
 csrc/trtllm_moe_allreduce_fusion.cu           |   4 +-
 flashinfer/__init__.py                        |   4 +-
 flashinfer/comm/__init__.py                   |   2 +-
 flashinfer/comm/trtllm_ar.py                  |  10 +-
 flashinfer/fp4_quantization.py                |  35 +-
 flashinfer/fp8_quantization.py                |   5 +
 flashinfer/fused_moe/core.py                  |  23 +-
 flashinfer/fused_moe/utils.py                 |   4 +-
 .../comm/trtllm_allreduce_fusion.cuh          |  10 +-
 .../comm/trtllm_moe_allreduce_fusion.cuh      |  10 +-
 tests/test_fp4_quantize.py                    |   8 +-
 tests/test_trtllm_allreduce_fusion.py         |   6 +-
 tests/test_trtllm_cutlass_fused_moe.py        | 125 +++-
 tests/test_trtllm_moe_allreduce_fusion.py     |   6 +-
 36 files changed, 1225 insertions(+), 932 deletions(-)
 create mode 100644 csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_fp4.cu
 create mode 100644 csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_fp4.cu

diff --git a/csrc/fused_moe/cutlass_backend/cutlass_fused_moe_instantiation.cu b/csrc/fused_moe/cutlass_backend/cutlass_fused_moe_instantiation.cu
index a81691cf9..f20729f16 100644
--- a/csrc/fused_moe/cutlass_backend/cutlass_fused_moe_instantiation.cu
+++ b/csrc/fused_moe/cutlass_backend/cutlass_fused_moe_instantiation.cu
@@ -45,11 +45,13 @@ template class CutlassMoeFCRunner<__nv_fp4_e2m1, __nv_fp4_e2m1, half>;
 template class CutlassMoeFCRunner<__nv_fp4_e2m1, __nv_fp4_e2m1, half, half>;
 template class CutlassMoeFCRunner<__nv_fp8_e4m3, __nv_fp4_e2m1, half>;
 template class CutlassMoeFCRunner<__nv_fp8_e4m3, __nv_fp4_e2m1, half, half>;
+template class CutlassMoeFCRunner<half, __nv_fp4_e2m1>;
 #ifdef ENABLE_BF16
 template class CutlassMoeFCRunner<__nv_fp4_e2m1, __nv_fp4_e2m1, __nv_bfloat16>;
 template class CutlassMoeFCRunner<__nv_fp4_e2m1, __nv_fp4_e2m1, __nv_bfloat16, __nv_bfloat16>;
 template class CutlassMoeFCRunner<__nv_fp8_e4m3, __nv_fp4_e2m1, __nv_bfloat16>;
 template class CutlassMoeFCRunner<__nv_fp8_e4m3, __nv_fp4_e2m1, __nv_bfloat16, __nv_bfloat16>;
+template class CutlassMoeFCRunner<__nv_bfloat16, __nv_fp4_e2m1>;
 #endif
 #endif
 };  // namespace tensorrt_llm::kernels::cutlass_kernels
diff --git a/csrc/fused_moe/cutlass_backend/cutlass_fused_moe_kernels.cuh b/csrc/fused_moe/cutlass_backend/cutlass_fused_moe_kernels.cuh
index 6c8789e9a..f9151bff4 100644
--- a/csrc/fused_moe/cutlass_backend/cutlass_fused_moe_kernels.cuh
+++ b/csrc/fused_moe/cutlass_backend/cutlass_fused_moe_kernels.cuh
@@ -286,7 +286,6 @@ void buildMinLatencyActiveExpertMaps(int* num_active_experts_per_node,
                      num_tokens, experts_per_token, start_expert, end_expert, num_experts_per_node,
                      smart_routing, cluster_rank, cluster_size, num_experts_smem);
 }
-
 template <int BLOCK_SIZE, int EXPERTS_PER_TOKEN, int LOG2_NUM_EXPERTS>
 __global__ void fusedBuildExpertMapsSortFirstTokenKernel(
     int const* const token_selected_experts, int* const permuted_row_to_unpermuted_row,
@@ -963,13 +962,13 @@ __device__ auto quantizePackedFPXValue(
     TmaWarpSpecializedGroupedGemmInput::ElementSF* act_sf_flat,
     TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType scaling_type) {
   constexpr bool is_fp8 = std::is_same_v<QuantizedType, __nv_fp8_e4m3>;
-  static constexpr int NumThreadsPerSF = VecSize / CVT_FP4_ELTS_PER_THREAD;
+      static constexpr int NumThreadsPerSF = VecSize / CVT_ELTS_PER_THREAD;
   // Quantize the input to FP4
   static_assert(std::is_same_v<GemmOutputType, __nv_bfloat16> ||
                 std::is_same_v<GemmOutputType, half>);
-  static_assert(ComputeElem::kElements == CVT_FP4_ELTS_PER_THREAD);
+  static_assert(ComputeElem::kElements == CVT_ELTS_PER_THREAD);
   PackedVec<GemmOutputType> packed_vec{};
-  for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
+  for (int i = 0; i < CVT_ELTS_PER_THREAD / 2; i++) {
     packed_vec.elts[i].x = static_cast<GemmOutputType>(post_act_val[i * 2 + 0]);
     packed_vec.elts[i].y = static_cast<GemmOutputType>(post_act_val[i * 2 + 1]);
   }
@@ -980,10 +979,9 @@ __device__ auto quantizePackedFPXValue(
 
   // Use `token - num_tokens_before_expert` because we want this to be relative to the start of this
   // expert
-  auto sf_out = cvt_quant_to_fp4_get_sf_out_offset<TmaWarpSpecializedGroupedGemmInput::ElementSF,
-                                                   NumThreadsPerSF, VecSize>(
+  auto sf_out = cvt_quant_get_sf_out_offset<TmaWarpSpecializedGroupedGemmInput::ElementSF, NumThreadsPerSF>(
       std::nullopt /* batchIdx */, token_id - num_tokens_before_expert, elem_idx,
-      std::nullopt /* numRows */, num_cols, act_sf_expert, FP4QuantizationSFLayout::SWIZZLED_128x4);
+      std::nullopt /* numRows */, num_cols / VecSize, act_sf_expert, QuantizationSFLayout::SWIZZLED);
 
   // Do the conversion and set the output and scaling factor
   auto func = [&]() {
@@ -1020,18 +1018,16 @@ __device__ void writeSF(int64_t num_tokens_before_expert, int64_t expert_id,
 
   // Use `token - num_tokens_before_expert` because we want this to be relative to the start of this
   // expert
-  auto sf_out = cvt_quant_to_fp4_get_sf_out_offset<TmaWarpSpecializedGroupedGemmInput::ElementSF,
-                                                   NumThreadsPerSF, VecSize>(
+  auto sf_out = cvt_quant_get_sf_out_offset<TmaWarpSpecializedGroupedGemmInput::ElementSF, NumThreadsPerSF>(
       std::nullopt /* batchIdx */, token_id - num_tokens_before_expert, elem_idx,
-      std::nullopt /* numRows */, num_cols, act_sf_expert, FP4QuantizationSFLayout::SWIZZLED_128x4);
+      std::nullopt /* numRows */, num_cols / VecSize, act_sf_expert, QuantizationSFLayout::SWIZZLED);
   if (sf_out) {
     if (input_sf) {
       auto const sf_in =
-          cvt_quant_to_fp4_get_sf_out_offset<TmaWarpSpecializedGroupedGemmInput::ElementSF,
-                                             NumThreadsPerSF, VecSize>(
+          cvt_quant_get_sf_out_offset<TmaWarpSpecializedGroupedGemmInput::ElementSF, NumThreadsPerSF>(
               std::nullopt /* batchIdx */, source_token_id, elem_idx, std::nullopt /* numRows */,
-              num_cols, const_cast<TmaWarpSpecializedGroupedGemmInput::ElementSF*>(input_sf),
-              FP4QuantizationSFLayout::SWIZZLED_128x4);
+              num_cols / VecSize, const_cast<TmaWarpSpecializedGroupedGemmInput::ElementSF*>(input_sf),
+              QuantizationSFLayout::SWIZZLED);
       *sf_out = *sf_in;
     } else {
       *sf_out = 0x00;
@@ -1127,7 +1123,12 @@ __device__ void computeTmaWarpSpecializedInputStrides(
   if (layout_info.int4_groupwise_params.enabled) {
     layout_info.int4_groupwise_params.stride_s_a[out_idx] = cutlass::make_cute_packed_stride(
         TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::StrideSFA{},
-        cute::make_shape(gemm_n, gemm_k / 128, 1));
+        cute::make_shape(gemm_n,
+          gemm_k
+              / (layout_info.int4_groupwise_params.use_wfp4a16
+                      ? TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::wfp4a16_group_size
+                      : TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::int4_group_size),
+          1));
   }
 }
 
@@ -1150,8 +1151,14 @@ __device__ void computeTmaWarpSpecializedInputPointers(
         safe_inc_ptr(output, num_tokens_before_expert * gemm_n);
   }
   if (layout_info.int4_groupwise_params.enabled) {
-    layout_info.int4_groupwise_params.ptr_s_a[out_idx] =
-        safe_inc_ptr(w4a8_weight_scale, expert * (gemm_n * gemm_k / 128));
+        // The group size of wfp4a16 is multiplied by 2 because each scale uses 1 byte instead of 2 bytes
+        layout_info.int4_groupwise_params.ptr_s_a[out_idx] = safe_inc_ptr(w4a8_weight_scale,
+          expert
+              * (gemm_n * gemm_k
+                  / (layout_info.int4_groupwise_params.use_wfp4a16
+                          ? TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::wfp4a16_group_size * 2
+                          : TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::int4_group_size)));
+
   }
 }
 
@@ -1453,7 +1460,7 @@ __global__ void expandInputRowsKernel(
                                    : TmaWarpSpecializedGroupedGemmInput::MXFPXBlockScaleVectorSize;
 
   constexpr int64_t ELEM_PER_THREAD = (is_nvfp4 || is_mxfp8)
-                                          ? CVT_FP4_ELTS_PER_THREAD
+                                          ? CVT_ELTS_PER_THREAD
                                           : (128 / sizeof_bits<InputActivationsType>::value);
 
   // This should be VecSize * 4 elements
@@ -1977,16 +1984,67 @@ void finalizeMoeRoutingKernelLauncher(
 // INSTANTIATE_FINALIZE_MOE_ROUTING(__nv_bfloat16, __nv_bfloat16, __nv_bfloat16);
 // #endif
 
+// ============================== Activation Adaptors =================================
+// Wraps a unary activation functor `ActFn<T>` so it can be used as a non-gated (IS_GLU == false)
+// activation. alpha/beta/limit are carried as members but are not read by operator().
+template <template <class> class ActFn>
+struct IdentityAdaptor {
+  static constexpr bool IS_GLU = false;
+  float alpha = 1.0f;
+  float beta = 0.0f;
+  float limit = std::numeric_limits<float>::infinity();
+
+  // Applies a default-constructed ActFn<T> to `x` and returns the result.
+  template <class T>
+  __device__ T operator()(T const& x) const {
+    return ActFn<T>{}(x);
+  }
+};
+
+// Wraps a unary activation functor `ActFn<T>` as a gated (IS_GLU == true) activation that
+// returns ActFn(gate) * linear. alpha/beta/limit are members that operator() does not read.
+template <template <class> class ActFn>
+struct GLUAdaptor {
+  static constexpr bool IS_GLU = true;
+  float alpha = 1.0f;
+  float beta = 0.0f;
+  float limit = std::numeric_limits<float>::infinity();
+
+  template <class T>
+  __device__ T operator()(T const& gate, T const& linear) const {
+    auto const activated_gate = ActFn<T>{}(gate);
+    return activated_gate * linear;
+  }
+};
+
+// Gated activation: with g = minimum(gate, limit) and l = maximum(minimum(linear, limit), -limit),
+// returns g * Sigmoid(g * alpha) * (l + beta). alpha/beta/limit are read from the members.
+struct SwigluBiasAdaptor {
+  static constexpr bool IS_GLU = true;
+  float alpha = 1.0f;
+  float beta = 0.0f;
+  float limit = std::numeric_limits<float>::infinity();
+
+  template <class T>
+  __device__ T operator()(T const& gate, T const& linear) const {
+    T gate_capped = cutlass::minimum<T>{}(gate, limit);  // gate is only bounded from above
+    T linear_capped = cutlass::minimum<T>{}(linear, limit);
+    T linear_bounded = cutlass::maximum<T>{}(linear_capped, -limit);
+    return gate_capped * cutlass::epilogue::thread::Sigmoid<T>{}(gate_capped * alpha) * (linear_bounded + beta);
+  }
+};
+
+
 // ============================== Gated Activation =================================
 constexpr static int ACTIVATION_THREADS_PER_BLOCK = 256;
 
-template <class ActivationOutputType, class GemmOutputType, template <class> class ActFn>
+template <class ActivationOutputType, class GemmOutputType, class ActFn>
 __global__ void doGatedActivationKernel(ActivationOutputType* output,
                                         GemmOutputType const* gemm_result,
-                                        int64_t const* num_valid_tokens_ptr, int64_t inter_size) {
+                                        int64_t const* expert_first_token_offset, int64_t inter_size, int64_t num_experts_per_node, ActivationParams activation_type) {
   int64_t const tid = threadIdx.x;
   int64_t const token = blockIdx.x;
-  if (num_valid_tokens_ptr && token >= *num_valid_tokens_ptr) {
+  if (token >= expert_first_token_offset[num_experts_per_node]) {
     return;
   }
 
@@ -2006,42 +2064,64 @@ __global__ void doGatedActivationKernel(ActivationOutputType* output,
   int64_t const num_elems_in_col = inter_size / ACTIVATION_ELEM_PER_THREAD;
   int64_t const inter_size_vec = inter_size / ACTIVATION_ELEM_PER_THREAD;
 
-  ActFn<ComputeElem> fn{};
-  for (int64_t elem_index = start_offset; elem_index < num_elems_in_col; elem_index += stride) {
-    auto fc1_value = arrayConvert<GemmResultElem, ComputeElem>(gemm_result_vec[elem_index]);
-    // BF16 isn't supported, use FP32 for activation function
-    auto gate_value =
-        arrayConvert<GemmResultElem, ComputeElem>(gemm_result_vec[elem_index + inter_size_vec]);
-    auto gate_act = fn(gate_value);
-    output_vec[elem_index] = arrayConvert<ComputeElem, OutputElem>(fc1_value * gate_act);
+  float gate_alpha = 1.0f;
+  float gate_bias = 0.0f;
+  float gate_limit = std::numeric_limits<float>::infinity();
+  if (activation_type.swiglu_alpha || activation_type.swiglu_beta || activation_type.swiglu_limit)
+  {
+      int expert
+          = findTotalEltsLessThanTarget(expert_first_token_offset, num_experts_per_node, (int64_t) token + 1) - 1;
+      gate_alpha = activation_type.swiglu_alpha ? activation_type.swiglu_alpha[expert] : 1.0f;
+      gate_bias = activation_type.swiglu_beta ? activation_type.swiglu_beta[expert] : 0.0f;
+      gate_limit = activation_type.swiglu_limit ? activation_type.swiglu_limit[expert]
+                                                : std::numeric_limits<float>::infinity();
+  }
+
+
+  ActFn fn{};
+  fn.alpha = gate_alpha;
+  fn.beta = gate_bias;
+  fn.limit = gate_limit;
+  for (int64_t elem_index = start_offset; elem_index < num_elems_in_col; elem_index += stride)
+  {
+      auto linear_value = arrayConvert<GemmResultElem, ComputeElem>(gemm_result_vec[elem_index]);
+      // BF16 isn't supported, use FP32 for activation function
+      auto gate_value = arrayConvert<GemmResultElem, ComputeElem>(gemm_result_vec[elem_index + inter_size_vec]);
+      auto gate_act = fn(gate_value, linear_value);
+      output_vec[elem_index] = arrayConvert<ComputeElem, OutputElem>(gate_act);
   }
 }
 
 template <typename ActivationOutputType, typename GemmOutputType>
 void doGatedActivation(ActivationOutputType* output, GemmOutputType const* gemm_result,
-                       int64_t const* num_valid_tokens_ptr, int64_t inter_size, int64_t num_tokens,
-                       ActivationType activation_type, cudaStream_t stream) {
+                       int64_t const* expert_first_token_offset, int64_t inter_size, int64_t num_tokens, int64_t num_experts_per_node,
+                       ActivationParams activation_type, cudaStream_t stream) {
   int64_t const blocks = num_tokens;
   int64_t const threads = ACTIVATION_THREADS_PER_BLOCK;
 
-  auto* fn = activation_type == ActivationType::Swiglu
-                 ? &doGatedActivationKernel<ActivationOutputType, GemmOutputType,
-                                            cutlass::epilogue::thread::SiLu>
-                 : &doGatedActivationKernel<ActivationOutputType, GemmOutputType,
-                                            cutlass::epilogue::thread::GELU>;
-  fn<<<blocks, threads, 0, stream>>>(output, gemm_result, num_valid_tokens_ptr, inter_size);
+  decltype(&doGatedActivationKernel<ActivationOutputType, GemmOutputType, SwigluBiasAdaptor>) fn = nullptr;
+  if (activation_type == ActivationType::Swiglu) {
+    fn = &doGatedActivationKernel<ActivationOutputType, GemmOutputType, GLUAdaptor<cutlass::epilogue::thread::SiLu>>;
+  } else if (activation_type == ActivationType::Geglu) {
+    fn = &doGatedActivationKernel<ActivationOutputType, GemmOutputType, GLUAdaptor<cutlass::epilogue::thread::GELU>>;
+  } else if (activation_type == ActivationType::SwigluBias) {
+    fn = &doGatedActivationKernel<ActivationOutputType, GemmOutputType, SwigluBiasAdaptor>;
+  }  // any other activation type leaves fn null and is rejected by the check below
+  TLLM_CHECK_WITH_INFO(fn != nullptr, "Invalid activation type");
+  fn<<<blocks, threads, 0, stream>>>(output, gemm_result, expert_first_token_offset, inter_size,
+                                     num_experts_per_node, activation_type);
 }
 
 // ============================== Activation =================================
 
-template <class T, class GemmOutputType, class ScaleBiasType, template <class> class ActFn,
+template <class T, class GemmOutputType, class ScaleBiasType, class ActFn,
           TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType BlockScalingType>
 __global__ void doActivationKernel(T* output, GemmOutputType const* gemm_result,
                                    float const* fp8_quant, ScaleBiasType const* bias_ptr,
                                    bool bias_is_broadcast, int64_t const* expert_first_token_offset,
-                                   int num_experts_per_node, int64_t inter_size, bool gated,
-                                   float const* fc2_act_global_scale, bool use_per_expert_act_scale,
-                                   TmaWarpSpecializedGroupedGemmInput::ElementSF* fc2_act_sf_flat) {
+                                   int num_experts_per_node, int64_t inter_size, float const* fc2_act_global_scale, bool use_per_expert_act_scale,
+                                   TmaWarpSpecializedGroupedGemmInput::ElementSF* fc2_act_sf_flat, ActivationParams activation_params)
+                                {
 #ifdef ENABLE_FP4
   constexpr bool IsNVFP4 =
       std::is_same_v<T, __nv_fp4_e2m1> &&
@@ -2055,12 +2135,10 @@ __global__ void doActivationKernel(T* output, GemmOutputType const* gemm_result,
 #endif
 
   int64_t const tid = threadIdx.x;
-  size_t const gated_size_mul = gated ? 2 : 1;
-  size_t const gated_off = gated ? inter_size : 0;
+  constexpr bool IsGated = ActFn::IS_GLU;
+  size_t gated_size_mul = IsGated ? 2 : 1;
+  size_t gated_off = IsGated ? inter_size : 0;
 
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");
-#endif
 
   constexpr int64_t VecSize = IsNVFP4
                                   ? TmaWarpSpecializedGroupedGemmInput::NVFP4BlockScaleVectorSize
@@ -2068,7 +2146,7 @@ __global__ void doActivationKernel(T* output, GemmOutputType const* gemm_result,
   // Load 128-bits per thread, according to the smallest data type we read/write
   constexpr int64_t ACTIVATION_ELEM_PER_THREAD =
       (IsNVFP4 || IsMXFP8)
-          ? CVT_FP4_ELTS_PER_THREAD
+          ? CVT_ELTS_PER_THREAD
           : (128 / std::min(sizeof_bits<T>::value, sizeof_bits<GemmOutputType>::value));
 
   // This should be VecSize * 4 elements
@@ -2080,16 +2158,30 @@ __global__ void doActivationKernel(T* output, GemmOutputType const* gemm_result,
 
   int64_t const num_valid_tokens = expert_first_token_offset[num_experts_per_node];
 
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+  asm volatile("griddepcontrol.wait;");
+#endif
+
   for (int64_t token = blockIdx.x; token < num_valid_tokens; token += gridDim.x) {
     size_t gemm_result_offset = token * inter_size * gated_size_mul;
     size_t output_offset = token * inter_size;
 
     int64_t expert = 0;
-    if (bias_ptr || IsNVFP4 || IsMXFP8 || use_per_expert_act_scale) {
+    float gate_alpha = 1.0f;
+    float gate_beta = 0.0f;
+    float gate_limit = std::numeric_limits<float>::infinity();
+    if (bias_ptr || IsNVFP4 || IsMXFP8 || use_per_expert_act_scale || activation_params.swiglu_alpha
+        || activation_params.swiglu_beta || activation_params.swiglu_limit)
+{
       // TODO this is almost certainly faster as a linear scan
       expert =
           findTotalEltsLessThanTarget(expert_first_token_offset, num_experts_per_node, token + 1) -
           1;
+          gate_alpha = activation_params.swiglu_alpha ? activation_params.swiglu_alpha[expert] : 1.0f;
+          gate_beta = activation_params.swiglu_beta ? activation_params.swiglu_beta[expert] : 0.0f;
+          gate_limit = activation_params.swiglu_limit ? activation_params.swiglu_limit[expert]
+                                                      : std::numeric_limits<float>::infinity();
+    
     }
 
     size_t act_scale_idx = use_per_expert_act_scale ? expert : 0;
@@ -2122,7 +2214,10 @@ __global__ void doActivationKernel(T* output, GemmOutputType const* gemm_result,
     assert(gated_off % ACTIVATION_ELEM_PER_THREAD == 0);
     int64_t const gated_off_vec = gated_off / ACTIVATION_ELEM_PER_THREAD;
 
-    ActFn<ComputeElem> fn{};
+    ActFn fn{};
+    fn.alpha = gate_alpha;
+    fn.beta = gate_beta;
+    fn.limit = gate_limit;
     for (int64_t elem_index = start_offset; elem_index < num_elems_in_col; elem_index += stride) {
       auto fc1_value =
           arrayConvert<GemmResultElem, ComputeElem>(gemm_result_vec[elem_index + gated_off_vec]);
@@ -2131,15 +2226,22 @@ __global__ void doActivationKernel(T* output, GemmOutputType const* gemm_result,
                     arrayConvert<BiasElem, ComputeElem>(bias_ptr_vec[elem_index + gated_off_vec]);
       }
 
-      auto gate_act = fn(fc1_value);
+      auto gate_act = [&]() {
+        if constexpr (IsGated) {
+          auto linear_value = arrayConvert<GemmResultElem, ComputeElem>(gemm_result_vec[elem_index]);
+          if (bias_ptr_vec)
+          {
+              linear_value = linear_value + arrayConvert<BiasElem, ComputeElem>(bias_ptr_vec[elem_index]);
+          }
+          return fn(fc1_value, linear_value);
 
-      if (gated) {
-        auto gate_mul = arrayConvert<GemmResultElem, ComputeElem>(gemm_result_vec[elem_index]);
-        if (bias_ptr_vec) {
-          gate_mul = gate_mul + arrayConvert<BiasElem, ComputeElem>(bias_ptr_vec[elem_index]);
         }
-        gate_act = gate_act * gate_mul;
-      }
+        else
+        {
+            return fn(fc1_value);
+        }
+    }();
+
 
       auto post_act_val = gate_act * quant_scale;
 
@@ -2228,7 +2330,7 @@ template <class T, class GemmOutputType, class ScaleBiasType>
 void doActivation(T* output, GemmOutputType const* gemm_result, float const* fp8_quant,
                   ScaleBiasType const* bias, bool bias_is_broadcast,
                   int64_t const* expert_first_token_offset, int num_experts_per_node,
-                  int64_t inter_size, int64_t expanded_num_tokens, ActivationType activation_type,
+                  int64_t inter_size, int64_t expanded_num_tokens, ActivationParams activation_type,
                   QuantParams const& quant_params, bool use_per_expert_act_scale,
                   TmaWarpSpecializedGroupedGemmInput::ElementSF* fc2_act_sf_flat,
                   cudaStream_t stream) {
@@ -2249,20 +2351,24 @@ void doActivation(T* output, GemmOutputType const* gemm_result, float const* fp8
   auto fn = [&]() {
     auto fn = [&](auto block_scaling_type) {
       auto fn_list = std::array{
-          &doActivationKernel<T, GemmOutputType, ScaleBiasType, cutlass::epilogue::thread::GELU,
+          &doActivationKernel<T, GemmOutputType, ScaleBiasType, IdentityAdaptor<cutlass::epilogue::thread::GELU>,
                               decltype(block_scaling_type)::value>,  // Gelu
-          &doActivationKernel<T, GemmOutputType, ScaleBiasType, cutlass::epilogue::thread::ReLu,
+          &doActivationKernel<T, GemmOutputType, ScaleBiasType, IdentityAdaptor<cutlass::epilogue::thread::ReLu>,
                               decltype(block_scaling_type)::value>,  // Relu
-          &doActivationKernel<T, GemmOutputType, ScaleBiasType, cutlass::epilogue::thread::SiLu,
+          &doActivationKernel<T, GemmOutputType, ScaleBiasType, IdentityAdaptor<cutlass::epilogue::thread::SiLu>,
                               decltype(block_scaling_type)::value>,  // Silu
-          &doActivationKernel<T, GemmOutputType, ScaleBiasType, cutlass::epilogue::thread::SiLu,
+          &doActivationKernel<T, GemmOutputType, ScaleBiasType, GLUAdaptor<cutlass::epilogue::thread::SiLu>,
                               decltype(block_scaling_type)::value>,  // Swiglu
-          &doActivationKernel<T, GemmOutputType, ScaleBiasType, cutlass::epilogue::thread::GELU,
+          &doActivationKernel<T, GemmOutputType, ScaleBiasType, GLUAdaptor<cutlass::epilogue::thread::GELU>,
                               decltype(block_scaling_type)::value>,  // Geglu
-          &doActivationKernel<T, GemmOutputType, ScaleBiasType, cutlass::epilogue::thread::Identity,
-                              decltype(block_scaling_type)::value>  // Identity
+                              &doActivationKernel<T, GemmOutputType, ScaleBiasType, SwigluBiasAdaptor,
+                              decltype(block_scaling_type)::value>, // SwigluBias
+                          &doActivationKernel<T, GemmOutputType, ScaleBiasType,
+                              IdentityAdaptor<cutlass::epilogue::thread::Identity>,
+                              decltype(block_scaling_type)::value> // Identity
+          
       };
-      return fn_list[static_cast<int>(activation_type)];
+      return fn_list[static_cast<int>(activation_type.activation_type)];
     };
     auto NVFP4 = tensorrt_llm::common::ConstExprWrapper<
         TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType,
@@ -2298,9 +2404,9 @@ void doActivation(T* output, GemmOutputType const* gemm_result, float const* fp8
   config.numAttrs = 1;
   config.attrs = attrs;
   cudaLaunchKernelEx(&config, fn, output, gemm_result, fp8_quant, bias, bias_is_broadcast,
-                     expert_first_token_offset, num_experts_per_node, inter_size,
-                     isGatedActivation(activation_type), quant_params.fp4.fc2.act_global_scale,
-                     use_per_expert_act_scale, fc2_act_sf_flat);
+                     expert_first_token_offset, num_experts_per_node, inter_size, quant_params.fp4.fc2.act_global_scale, use_per_expert_act_scale,
+                     fc2_act_sf_flat, activation_type);
+             
 }
 
 // ============================== Lora Add Bias =================================
@@ -2719,7 +2825,7 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, InputType, BackBoneType,
 
   bool const is_gated_activation = isGatedActivation(activation_type);
   bool const gemm1_using_fused_moe = moe_gemm_runner_.isFusedGatedActivation(
-      *gemm1_config_, is_gated_activation, inter_size, hidden_size);
+      *gemm1_config_, activation_type, inter_size, hidden_size);
   bool const gemm1_using_tma_ws = moe_gemm_runner_.isTmaWarpSpecialized(*gemm1_config_);
   bool const tma_ws_has_glu =
       gemm1_using_tma_ws && (mayHaveDifferentGEMMOutputType() || is_gated_activation);
@@ -2824,7 +2930,7 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, InputType, ScaleBiasType, Ena
     WeightType const* const fc1_expert_weights, ScaleBiasType const* const fc1_expert_biases,
     float const* const fc2_fp8_quant, int64_t const num_rows, int64_t const expanded_num_rows,
     int64_t const hidden_size, int64_t const inter_size, int const num_experts_per_node,
-    ActivationType fc1_activation_type, QuantParams& quant_params, cudaStream_t stream) {
+    ActivationParams fc1_activation_type, QuantParams& quant_params, cudaStream_t stream) {
   bool const is_gated_activation = isGatedActivation(fc1_activation_type);
 
   int shape_n = is_gated_activation ? inter_size * 2 : inter_size;
@@ -2854,7 +2960,6 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, InputType, ScaleBiasType, Ena
     OutputType* const final_output, int64_t const* const expert_first_token_offset,
     WeightType const* const fc2_expert_weights, ScaleBiasType const* const fc2_expert_biases,
     float const* const unpermuted_final_scales, int const* const unpermuted_row_to_permuted_row,
-
     int const* const permuted_row_to_unpermuted_row, int const* const token_selected_experts,
     int64_t const* const num_valid_tokens_ptr, int64_t const num_rows,
     int64_t const expanded_num_rows, int64_t const hidden_size, int64_t const inter_size,
@@ -2917,7 +3022,7 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, InputType, BackBoneType, Enab
     TmaWarpSpecializedGroupedGemmInput::ElementSF const* fc1_fp4_act_flat,
     TmaWarpSpecializedGroupedGemmInput::ElementSF* fc2_fp4_act_flat, QuantParams quant_params,
     int64_t const num_rows, int64_t const expanded_num_rows, int64_t const hidden_size,
-    int64_t const inter_size, int const num_experts_per_node, ActivationType fc1_activation_type,
+    int64_t const inter_size, int const num_experts_per_node, ActivationParams fc1_activation_type,
     float const** alpha_scale_ptr_array, bool bias_is_broadcast, cudaStream_t stream,
     cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode,
     int* num_active_experts_per, int* active_expert_global_ids) {
@@ -2933,7 +3038,7 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, InputType, BackBoneType, Enab
   bool const using_tma_ws_gemm1 = gemm_runner.isTmaWarpSpecialized(config);
   bool const is_gated_activation = isGatedActivation(fc1_activation_type);
   bool const use_ampere_activation_fusion =
-      gemm_runner.isFusedGatedActivation(config, is_gated_activation, inter_size, hidden_size);
+      gemm_runner.isFusedGatedActivation(config, fc1_activation_type.activation_type, inter_size, hidden_size);
   size_t const fc1_out_size =
       ((!use_ampere_activation_fusion) && is_gated_activation) ? inter_size * 2 : inter_size;
 
@@ -3125,8 +3230,8 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, InputType, BackBoneType, Enab
       using GatedActOutputType = std::conditional_t<use_w4afp8, BackBoneType, T>;
       doGatedActivation<GatedActOutputType, UnfusedGemmOutputType>(
           reinterpret_cast<GatedActOutputType*>(output),
-          static_cast<UnfusedGemmOutputType const*>(intermediate_result), num_valid_tokens_ptr,
-          inter_size, expanded_num_rows, fc1_activation_type, stream);
+          static_cast<UnfusedGemmOutputType const*>(intermediate_result), expert_first_token_offset,
+          inter_size, expanded_num_rows, num_experts_per_node, fc1_activation_type, stream);
 
       sync_check_cuda_error(stream);
     }
@@ -3233,7 +3338,7 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, InputType, BackBoneType, Enab
                       static_cast<UnfusedGemmOutputType const*>(gemm_output), nullptr,
                       static_cast<ScaleBiasType const*>(fc2_lora), false, expert_first_token_offset,
                       num_experts_per_node, hidden_size, expanded_num_rows,
-                      ActivationType::Identity, {}, false, nullptr, stream);
+                      ActivationParams(ActivationType::Identity), {}, false, nullptr, stream);
     sync_check_cuda_error(stream);
   }
 
@@ -3478,7 +3583,7 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, InputType, BackBoneType, Enab
     void const* input_activations_void, void const* input_sf_void,
     int const* token_selected_experts, float const* token_final_scales,
     void const* fc1_expert_weights_void, void const* fc1_expert_biases_void,
-    ActivationType fc1_activation_type, void const* fc2_expert_weights_void,
+    ActivationParams fc1_activation_type, void const* fc2_expert_weights_void,
     void const* fc2_expert_biases_void, QuantParams quant_params, int64_t const num_rows,
     int64_t const hidden_size, int64_t const inter_size, int const full_num_experts,
     int const experts_per_token, char* workspace_ptr, void* final_output_void,
@@ -3487,7 +3592,7 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, InputType, BackBoneType, Enab
     bool use_deepseek_fp8_block_scale, bool min_latency_mode,
     MoeMinLatencyParams& min_latency_params, cudaStream_t stream) {
   static constexpr bool int_scales_required = std::is_same<WeightType, uint8_t>::value ||
-                                              std::is_same<WeightType, cutlass::uint4b_t>::value;
+                                              std::is_same<WeightType, cutlass::uint4b_t>::value || use_wfp4a16;
   static constexpr bool fp8_scales_required = std::is_same<WeightType, __nv_fp8_e4m3>::value ||
                                               std::is_same<WeightType, __nv_fp8_e5m2>::value;
 
@@ -3598,7 +3703,7 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, InputType, BackBoneType, Enab
                          "Scales are ignored for fp32/fp16/bf16 but received quant scale for FC2");
   }
 
-  bool use_awq = quant_params.groupwise.fc1.act_scales && quant_params.groupwise.fc2.act_scales;
+  bool use_awq = quant_params.groupwise.fc1.act_scales && quant_params.groupwise.fc2.act_scales && !use_wfp4a16;
   int const num_experts_per_node = full_num_experts / parallelism_config.ep_size;
 
   configureWsPtrs(workspace_ptr, num_rows, hidden_size, inter_size, num_experts_per_node,
@@ -3662,7 +3767,7 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, InputType, BackBoneType, Enab
     sync_check_cuda_error(stream);
   } else {
     bool fused_prologue_result = false;
-    if (!use_w4afp8) {
+    if (!use_w4_groupwise) {
       // WAR: fusedBuildExpertMapsSortFirstToken kernel will lead to illegal memory access for
       // W4AFP8
       fused_prologue_result = fusedBuildExpertMapsSortFirstToken(
@@ -3809,8 +3914,11 @@ CutlassMoeFCRunner<T, WeightType, OutputType, InputType, BackBoneType, Enable>::
     layout_info2.alpha_scale_ptr_array = nullptr;
   }
 
-  layout_info1.int4_groupwise_params.enabled = use_w4afp8;
-  layout_info2.int4_groupwise_params.enabled = use_w4afp8;
+  layout_info1.int4_groupwise_params.enabled = use_w4_groupwise;
+  layout_info2.int4_groupwise_params.enabled = use_w4_groupwise;
+  layout_info1.int4_groupwise_params.use_wfp4a16 = use_wfp4a16;
+  layout_info2.int4_groupwise_params.use_wfp4a16 = use_wfp4a16;
+
 
   layout_info1.fpX_block_scaling_type = getScalingType();
   layout_info2.fpX_block_scaling_type = getScalingType();
@@ -3856,7 +3964,7 @@ CutlassMoeFCRunner<T, WeightType, OutputType, InputType, BackBoneType, Enable>::
         UnfusedGemmOutputType* output1, UnfusedGemmOutputType* output2,
         int const* num_active_experts_per, int const* active_expert_global_ids, int start_expert,
         cudaStream_t stream) {
-  TLLM_CHECK_WITH_INFO(!use_w4afp8, "W4AFP8 is not supported in low latency mode");
+          TLLM_CHECK_WITH_INFO(!use_w4_groupwise, "W4AFP8 and WFP4A16 are not supported in low latency mode");
 
   // Always nullptr
   layout_info1.ptr_c = nullptr;
@@ -3879,6 +3987,9 @@ CutlassMoeFCRunner<T, WeightType, OutputType, InputType, BackBoneType, Enable>::
 
   layout_info1.int4_groupwise_params.enabled = false;
   layout_info2.int4_groupwise_params.enabled = false;
+  layout_info1.int4_groupwise_params.use_wfp4a16 = false;
+  layout_info2.int4_groupwise_params.use_wfp4a16 = false;
+
 
   int const threads = std::min(1024, num_experts);
   int const blocks = (num_experts + threads - 1) / threads;
@@ -3909,7 +4020,7 @@ template <class T, class WeightType, class OutputType, class InputType, class Ba
 std::pair<TmaWarpSpecializedGroupedGemmInput, TmaWarpSpecializedGroupedGemmInput>
 CutlassMoeFCRunner<T, WeightType, OutputType, InputType, BackBoneType, Enable>::
     setupTmaWarpSpecializedInputs(int64_t num_rows, int64_t expanded_num_rows,
-                                  ActivationType fc1_activation_type, int64_t hidden_size,
+      ActivationParams  fc1_activation_type, int64_t hidden_size,
                                   int64_t inter_size, int64_t num_experts_per_node,
                                   void const* input_activations_void,
                                   TmaWarpSpecializedGroupedGemmInput::ElementSF const* input_sf,
@@ -3927,7 +4038,7 @@ CutlassMoeFCRunner<T, WeightType, OutputType, InputType, BackBoneType, Enable>::
     return std::make_pair(gemm1_tma_ws_input, gemm2_tma_ws_input);
   }
 
-  bool use_awq = quant_params.groupwise.fc1.act_scales && quant_params.groupwise.fc2.act_scales;
+  bool use_awq = quant_params.groupwise.fc1.act_scales && quant_params.groupwise.fc2.act_scales && !use_wfp4a16;
 
   bool is_gated_activation = isGatedActivation(fc1_activation_type);
   int64_t const fc1_out_size = is_gated_activation ? inter_size * 2 : inter_size;
@@ -3963,7 +4074,7 @@ CutlassMoeFCRunner<T, WeightType, OutputType, InputType, BackBoneType, Enable>::
 
     bool apply_bias = parallelism_config.tp_rank == 0;
     bool using_hopper_fused_finalize = !use_deterministic_hopper_reduce_ &&
-                                       gemm2_config_->sm_version == 90 && !use_w4afp8 && !use_lora;
+                                       gemm2_config_->sm_version == 90 && !use_w4_groupwise && !use_lora;
     if (using_hopper_fused_finalize) {
       assert(min_latency_mode == false);
       gemm2_tma_ws_input.fusion = TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::FINALIZE;
@@ -4227,6 +4338,8 @@ std::map<std::string, std::pair<size_t, size_t>> GemmProfilerBackend::getProfile
   bool is_fp4_w_quant = mWType == nvinfer1::DataType::kFP4 || mWType == nvinfer1::DataType::kINT64;
   bool is_w4afp8_quant = is_int_groupwise_w_quant && is_fp8_act_quant;
   // bool is_wfp4afp8_quant = is_fp4_w_quant && is_fp8_act_quant;
+  bool is_wfp4a16_quant = (mDType == nvinfer1::DataType::kHALF || mDType == nvinfer1::DataType::kBF16) && mWType == nvinfer1::DataType::kUINT8;
+
 
   // Int sizes
   size_t quant_1_size = is_int_w_quant ? fc1_out_size * num_experts_per_node * dtype_bytes : 0;
@@ -4234,7 +4347,7 @@ std::map<std::string, std::pair<size_t, size_t>> GemmProfilerBackend::getProfile
   if (is_int_w_quant) {
     quant_1_size = fc1_out_size * num_experts_per_node * dtype_bytes;
     quant_2_size = hidden_size * num_experts_per_node * dtype_bytes;
-  } else if (is_int_groupwise_w_quant) {
+  } else if (is_int_groupwise_w_quant  || is_wfp4a16_quant) {
     quant_1_size = fc1_out_size * num_experts_per_node * dtype_bytes * hidden_size / mGroupSize;
     quant_2_size = hidden_size * num_experts_per_node * dtype_bytes * inter_size / mGroupSize;
   }
@@ -4269,7 +4382,7 @@ std::map<std::string, std::pair<size_t, size_t>> GemmProfilerBackend::getProfile
         TmaWarpSpecializedGroupedGemmInput::workspaceSize(num_experts_per_node, mScalingType) *
         (NUM_ROUTING_SAMPLES + 1);
 
-    if (is_w4afp8_quant) {
+    if (is_w4afp8_quant  || is_wfp4a16_quant) {
       quant_3_size = 0;
       quant_4_size = 0;
     }
@@ -4287,7 +4400,7 @@ std::map<std::string, std::pair<size_t, size_t>> GemmProfilerBackend::getProfile
       sizeof(TmaWarpSpecializedGroupedGemmInput::ElementSF);
   size_t const fp4_act_scale_flat_size = std::max(fc1_fp4_act_scale_size, fc2_fp4_act_scale_size);
 
-  size_t w4a8_alpha_size = is_w4afp8_quant ? num_experts_per_node * sizeof(float) : 0;
+  size_t w4a8_alpha_size = (is_w4afp8_quant || is_wfp4a16_quant) ? num_experts_per_node * sizeof(float) : 0;
   size_t alpha_scale_ptr_array_size = num_experts_per_node * sizeof(float**);
   size_t gemm_workspace_size = mInterface->getGemmWorkspaceSize(num_experts_per_node);
 
@@ -4316,6 +4429,11 @@ std::map<std::string, std::pair<size_t, size_t>> GemmProfilerBackend::getProfile
   size_t active_expert_global_ids_size =
       mMinLatencyMode ? mNumExpertsPerNode * sizeof(int) * NUM_ROUTING_SAMPLES : 0;
 
+  bool is_swiglu_bias = mActivationType == ActivationType::SwigluBias && mGemmToProfile == GemmToProfile::GEMM_1;
+  size_t swiglu_alpha_size = is_swiglu_bias ? num_experts_per_node * sizeof(float) : 0;
+  size_t swiglu_beta_size = is_swiglu_bias ? num_experts_per_node * sizeof(float) : 0;
+  size_t swiglu_limit_size = is_swiglu_bias ? num_experts_per_node * sizeof(float) : 0;
+
   size_t map_offset = 0;
   std::map<std::string, std::pair<size_t, size_t>> out_map;
 
@@ -4354,6 +4472,9 @@ std::map<std::string, std::pair<size_t, size_t>> GemmProfilerBackend::getProfile
   ADD(alpha_scale_ptr_array);
   ADD(fp4_act_scale_flat);
   ADD(gemm_workspace);
+  ADD(swiglu_alpha);
+  ADD(swiglu_beta);
+  ADD(swiglu_limit);
 
 #undef ADD_NAME
 #undef ADD
@@ -4441,13 +4562,18 @@ void GemmProfilerBackend::prepareQuantParams(int num_tokens, char* workspace_ptr
   GET_WS_PTR(float const*, w4a8_alpha);
 #undef GET_WS_PTR
 
-  if ((mWType == nvinfer1::DataType::kINT8 || mWType == nvinfer1::DataType::kINT4) &&
-      mGroupSize < 0) {
+if ((mWType == nvinfer1::DataType::kINT8 || mWType == nvinfer1::DataType::kINT4
+  || mWType == nvinfer1::DataType::kUINT8)
+&& mGroupSize < 0)
+{
     TLLM_CHECK(quant_1 && quant_2);
     mQuantParams = QuantParams::Int(quant_1, quant_2);
-  } else if (mWType == nvinfer1::DataType::kINT4) {
+  } else if (mWType == nvinfer1::DataType::kINT4 || mWType == nvinfer1::DataType::kUINT8) {
     TLLM_CHECK(quant_1 && quant_2);
-    if (mDType == nvinfer1::DataType::kFP8) {
+    if (mDType == nvinfer1::DataType::kFP8
+      || (mWType == nvinfer1::DataType::kUINT8
+          && (mDType == nvinfer1::DataType::kHALF || mDType == nvinfer1::DataType::kBF16)))
+{
       TLLM_CHECK(w4a8_alpha);
       mQuantParams = QuantParams::GroupWise(mGroupSize, quant_1, quant_2, nullptr, nullptr, quant_3,
                                             quant_4, w4a8_alpha, w4a8_alpha);
@@ -4546,8 +4672,12 @@ void GemmProfilerBackend::prepareTmaWsInputs(int num_tokens, char* workspace_ptr
 
       bool apply_bias = true;
       bool use_w4afp8 = (mDType == nvinfer1::DataType::kFP8 && mWType == nvinfer1::DataType::kINT4);
+      bool use_wfp4a16 = ((mDType == nvinfer1::DataType::kHALF || mDType == nvinfer1::DataType::kBF16)
+      && mWType == nvinfer1::DataType::kUINT8);
+  bool use_w4_groupwise = use_w4afp8 || use_wfp4a16;
+
       bool using_fused_finalize = !mInterface->use_deterministic_hopper_reduce_ && mSM == 90 &&
-                                  !mMinLatencyMode && !use_w4afp8;
+                                  !mMinLatencyMode && !use_w4_groupwise;
       if (using_fused_finalize) {
         assert(!mMinLatencyMode);
         gemm2_tma_ws_input.fusion = TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::FINALIZE;
@@ -4651,6 +4781,11 @@ void GemmProfilerBackend::runProfiler(int original_num_tokens, Config const& tac
   GET_WS_PTR(TmaWarpSpecializedGroupedGemmInput::ElementSF*, fp4_act_scale_flat);
   GET_WS_PTR(void*, gemm_workspace);
 
+  GET_WS_PTR(float*, swiglu_alpha);
+  GET_WS_PTR(float*, swiglu_beta);
+  GET_WS_PTR(float*, swiglu_limit);
+
+
 #undef GET_WS_PTR_OFFSET
 #undef GET_WS_PTR
 
@@ -4682,7 +4817,7 @@ void GemmProfilerBackend::runProfiler(int original_num_tokens, Config const& tac
                       mExpertHiddenSize,                       //
                       mExpertInterSize,                        //
                       num_experts_per_node,                    //
-                      mActivationType,                         //
+                      ActivationParams(mActivationType, swiglu_alpha, swiglu_beta, swiglu_limit),                         //
                       alpha_scale_ptr_array,                   //
                       !mUseLora,                               //
                       /*use_deepseek_fp8_block_scale=*/false,  //
@@ -4734,3 +4869,4 @@ void GemmProfilerBackend::runProfiler(int original_num_tokens, Config const& tac
 }
 
 }  // namespace tensorrt_llm::kernels::cutlass_kernels
+
diff --git a/csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_ops.cu b/csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_ops.cu
index 8480e09ed..3ed70d356 100644
--- a/csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_ops.cu
+++ b/csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_ops.cu
@@ -44,10 +44,10 @@ namespace torch_ext {
 
 namespace common = tensorrt_llm::common;
 namespace kernels = CUTLASS_MOE_GEMM_KERNELS_NAMESPACE;
+using ActivationParams = CUTLASS_MOE_GEMM_NAMESPACE::ActivationParams;
 using ActivationType = CUTLASS_MOE_GEMM_NAMESPACE::ActivationType;
 // Always use public header as it is just utility functions and types
-using TmaWarpSpecializedGroupedGemmInput =
-    tensorrt_llm::kernels::cutlass_kernels::TmaWarpSpecializedGroupedGemmInput;
+using TmaWarpSpecializedGroupedGemmInput = tensorrt_llm::kernels::cutlass_kernels::TmaWarpSpecializedGroupedGemmInput;
 using profiler_backend = CUTLASS_MOE_GEMM_KERNELS_NAMESPACE::GemmProfilerBackend;
 
 class FusedMoeRunner : public torch::CustomClassHolder {
@@ -60,15 +60,16 @@ class FusedMoeRunner : public torch::CustomClassHolder {
       case c10::ScalarType::Float8_e4m3fn:
         // TODO We need an atomic FP8 reduction for the finalize fusions
         C10_THROW_ERROR_FORMATTED(NotImplementedError,
-                                  "Outputting " << torch::toString(output_type)
-                                                << " directly is not currently supported");
+            "Outputting " << torch::toString(output_type) << " directly is not currently supported");
         // return std::make_unique<kernels::CutlassMoeFCRunner<Type, Type>>();
       case c10::ScalarType::Half:
-        if constexpr (NeedQuant) {
-          return std::make_unique<kernels::CutlassMoeFCRunner<TypeAct, TypeWeight, half, half>>();
-        } else {
-          return std::make_unique<
-              kernels::CutlassMoeFCRunner<TypeAct, TypeWeight, half, TypeAct>>();
+        if constexpr (NeedQuant)
+        {
+            return std::make_unique<kernels::CutlassMoeFCRunner<TypeAct, TypeWeight, half, half>>();
+        }
+        else
+        {
+            return std::make_unique<kernels::CutlassMoeFCRunner<TypeAct, TypeWeight, half, TypeAct>>();
         }
 #ifdef ENABLE_BF16
       case c10::ScalarType::BFloat16:
@@ -89,12 +90,12 @@ class FusedMoeRunner : public torch::CustomClassHolder {
 
   FusedMoeRunner(c10::ScalarType activation_dtype, c10::ScalarType weight_dtype,
                  c10::ScalarType output_dtype, bool use_deepseek_fp8_block_scale,
-                 bool use_w4a8_group_scaling, bool use_mxfp8_act_scaling) {
+                 bool use_w4_group_scaling, bool use_mxfp8_act_scaling) {
     mActivationDtype = activation_dtype;
     mWeightDtype = weight_dtype;
     mOutputDtype = output_dtype;
     mUseDeepSeekFP8BlockScaling = use_deepseek_fp8_block_scale;
-    mUseW4A8GroupScaling = use_w4a8_group_scaling;
+    mUseW4GroupScaling = use_w4_group_scaling;
     mUseMxfp8ActScaling = use_mxfp8_act_scaling;
     mInnerDimMultiplier = 1;
 
@@ -139,12 +140,29 @@ class FusedMoeRunner : public torch::CustomClassHolder {
           mKernelRunner = switch_output_type<__nv_fp4_e2m1, __nv_fp4_e2m1, false>(mOutputDtype);
       }
     }
+
+    if (isWFP4A16Quant())
+    {
+        mInnerDimMultiplier = 2;
+        if (mActivationDtype == c10::ScalarType::Half)
+        {
+            mKernelRunner = std::make_shared<kernels::CutlassMoeFCRunner<half, __nv_fp4_e2m1>>();
+        }
+#ifdef ENABLE_BF16
+        else if (mActivationDtype == c10::ScalarType::BFloat16)
+        {
+            mKernelRunner = std::make_shared<kernels::CutlassMoeFCRunner<__nv_bfloat16, __nv_fp4_e2m1>>();
+        }
+#endif
+    }
+
+
 #endif
     if (isInt4Quant()) {
       mInnerDimMultiplier = 2;
       if (mActivationDtype == c10::ScalarType::Half) {
 #ifdef ENABLE_FP8
-        if (mUseW4A8GroupScaling) {
+        if (mUseW4GroupScaling) {
           mKernelRunner = std::make_unique<
               kernels::CutlassMoeFCRunner<__nv_fp8_e4m3, cutlass::uint4b_t, half, half>>();
         } else {
@@ -157,7 +175,7 @@ class FusedMoeRunner : public torch::CustomClassHolder {
 #ifdef ENABLE_BF16
       else if (mActivationDtype == c10::ScalarType::BFloat16) {
 #ifdef ENABLE_FP8
-        if (mUseW4A8GroupScaling) {
+        if (mUseW4GroupScaling) {
           mKernelRunner =
               std::make_unique<kernels::CutlassMoeFCRunner<__nv_fp8_e4m3, cutlass::uint4b_t,
                                                            __nv_bfloat16, __nv_bfloat16>>();
@@ -196,15 +214,17 @@ class FusedMoeRunner : public torch::CustomClassHolder {
   void operator=(FusedMoeRunner const&) = delete;
 
   at::Tensor runMoe(
-      at::Tensor& output, at::Tensor const& input, at::Tensor const& token_selected_experts,
-      torch::optional<at::Tensor> const& token_final_scales, at::Tensor const& fc1_expert_weights,
-      torch::optional<at::Tensor> const& fc1_expert_biases, at::Tensor const& fc2_expert_weights,
-      torch::optional<at::Tensor> const& fc2_expert_biases,
-      torch::optional<c10::ArrayRef<at::Tensor>> const& quant_scales,
-      torch::optional<at::Tensor> const& input_sf, int64_t const tp_size, int64_t const tp_rank,
-      int64_t const ep_size, int64_t const ep_rank, int64_t const cluster_size,
-      int64_t const cluster_rank, bool const enable_alltoall, bool min_latency_mode,
-      torch::optional<c10::ArrayRef<int64_t>> const& profile_ids) {
+    at::Tensor& output, at::Tensor const& input, at::Tensor const& token_selected_experts,
+      torch::optional<torch::Tensor> const& token_final_scales, torch::Tensor const& fc1_expert_weights,
+      torch::optional<torch::Tensor> const& fc1_expert_biases, torch::Tensor const& fc2_expert_weights,
+      torch::optional<torch::Tensor> const& fc2_expert_biases,
+      torch::optional<c10::ArrayRef<torch::Tensor>> const& quant_scales,
+      torch::optional<torch::Tensor> const& input_sf, torch::optional<torch::Tensor> const& swiglu_alpha,
+      torch::optional<torch::Tensor> const& swiglu_beta, torch::optional<torch::Tensor> const& swiglu_limit,
+      int64_t const tp_size, int64_t const tp_rank, int64_t const ep_size, int64_t const ep_rank,
+      int64_t const cluster_size, int64_t const cluster_rank, bool const enable_alltoall, bool min_latency_mode,
+      torch::optional<c10::ArrayRef<int64_t>> const& profile_ids)
+{
     std::lock_guard<std::mutex> lock(mMutex);
     // Free the profile workspace to save memory
     freeProfileWorkspace();
@@ -280,7 +300,33 @@ class FusedMoeRunner : public torch::CustomClassHolder {
     int const num_experts_on_rank = fc2_expert_weights.sizes()[0];
     auto const num_experts_total = static_cast<int>(num_experts_on_rank * ep_size);
     auto parallelism_config = kernels::MOEParallelismConfig(tp_size, tp_rank, ep_size, ep_rank);
-    auto activation_type = ActivationType::Swiglu;
+    ActivationType base_activation_type = ActivationType::Swiglu;
+    if (swiglu_alpha.has_value())
+    {
+        CHECK_INPUT_AND_TYPE(swiglu_alpha.value(), at::ScalarType::Float);
+        TORCH_CHECK(swiglu_alpha.value().sizes()[0] == num_experts_on_rank,
+            "swiglu_alpha must have num_experts_on_rank elements.");
+        base_activation_type = ActivationType::SwigluBias;
+    }
+    if (swiglu_beta.has_value())
+    {
+        CHECK_INPUT_AND_TYPE(swiglu_beta.value(), at::ScalarType::Float);
+        TORCH_CHECK(swiglu_beta.value().sizes()[0] == num_experts_on_rank,
+            "swiglu_beta must have num_experts_on_rank elements.");
+        base_activation_type = ActivationType::SwigluBias;
+    }
+    if (swiglu_limit.has_value())
+    {
+        CHECK_INPUT_AND_TYPE(swiglu_limit.value(), at::ScalarType::Float);
+        TORCH_CHECK(swiglu_limit.value().sizes()[0] == num_experts_on_rank,
+            "swiglu_limit must have num_experts_on_rank elements.");
+        base_activation_type = ActivationType::SwigluBias;
+    }
+    auto activation_params = ActivationParams(base_activation_type,
+        reinterpret_cast<float const*>(swiglu_alpha.has_value() ? swiglu_alpha.value().const_data_ptr() : nullptr),
+        reinterpret_cast<float const*>(swiglu_beta.has_value() ? swiglu_beta.value().const_data_ptr() : nullptr),
+        reinterpret_cast<float const*>(swiglu_limit.has_value() ? swiglu_limit.value().const_data_ptr() : nullptr));
+
 
     setRunnerProfiles(profile_ids);
 
@@ -291,7 +337,7 @@ class FusedMoeRunner : public torch::CustomClassHolder {
 
     WorkspaceInfo workspace_info = getWorkspaceInfo(
         num_rows, hidden_size, inter_size, num_experts_total, static_cast<int>(experts_per_token),
-        activation_type, parallelism_config, min_latency_mode);
+        base_activation_type, parallelism_config, min_latency_mode);
 
     auto const quant_params =
         getQuantParams(num_experts_on_rank, hidden_size, inter_size, quant_scales);
@@ -308,7 +354,7 @@ class FusedMoeRunner : public torch::CustomClassHolder {
             : nullptr,
         fc1_expert_weights.const_data_ptr(),
         fc1_expert_biases.has_value() ? fc1_expert_biases.value().const_data_ptr() : nullptr,
-        activation_type, fc2_expert_weights.const_data_ptr(),
+        activation_params, fc2_expert_weights.const_data_ptr(),
         fc2_expert_biases.has_value() ? fc2_expert_biases.value().const_data_ptr() : nullptr,
         quant_params, num_rows, hidden_size, inter_size, num_experts_total,
         static_cast<int>(experts_per_token),
@@ -325,10 +371,10 @@ class FusedMoeRunner : public torch::CustomClassHolder {
             : nullptr,
         fc1_expert_weights.const_data_ptr(),
         fc1_expert_biases.has_value() ? fc1_expert_biases.value().const_data_ptr() : nullptr,
-        activation_type, fc2_expert_weights.const_data_ptr(),
+        activation_params, fc2_expert_weights.const_data_ptr(),
         fc2_expert_biases.has_value() ? fc2_expert_biases.value().const_data_ptr() : nullptr,
         quant_params, num_rows, hidden_size, inter_size, num_experts_total,
-        static_cast<int>(experts_per_token), static_cast<char*>(workspace_info.workspace),
+        static_cast<int>(experts_per_token), static_cast<char*>(workspace_info.workspace.data_ptr()),
         output.data_ptr(), static_cast<int*>(workspace_info.src_to_dest_map), parallelism_config,
         false, lora_params, mUseDeepSeekFP8BlockScaling, min_latency_mode, min_latency_params,
         stream);
@@ -343,10 +389,12 @@ class FusedMoeRunner : public torch::CustomClassHolder {
       torch::optional<at::Tensor> const& fc1_expert_biases, at::Tensor const& fc2_expert_weights,
       torch::optional<at::Tensor> const& fc2_expert_biases,
       torch::optional<c10::ArrayRef<at::Tensor>> const& quant_scales,
-      torch::optional<at::Tensor> const& input_sf, int64_t const tp_size, int64_t const tp_rank,
-      int64_t const ep_size, int64_t const ep_rank, int64_t const cluster_size,
-      int64_t const cluster_rank, bool const enable_alltoall, bool min_latency_mode,
-      torch::optional<c10::ArrayRef<int64_t>> const& profile_ids) {
+      torch::optional<torch::Tensor> const& input_sf, torch::optional<torch::Tensor> const& swiglu_alpha,
+      torch::optional<torch::Tensor> const& swiglu_beta, torch::optional<torch::Tensor> const& swiglu_limit,
+      int64_t const tp_size, int64_t const tp_rank, int64_t const ep_size, int64_t const ep_rank,
+      int64_t const cluster_size, int64_t const cluster_rank, bool const enable_alltoall, bool min_latency_mode,
+      torch::optional<c10::ArrayRef<int64_t>> const& profile_ids)
+{
     std::lock_guard<std::mutex> lock(mMutex);
 
     // Free the profile workspace to save memory
@@ -405,11 +453,33 @@ class FusedMoeRunner : public torch::CustomClassHolder {
     int64_t num_rows = input.sizes()[0];
     int64_t hidden_size = fc2_expert_weights.sizes()[1];
     int64_t inter_size = fc2_expert_weights.sizes()[2] * mInnerDimMultiplier;
+
     int const num_experts_on_rank = fc2_expert_weights.sizes()[0];
     auto const num_experts_total = static_cast<int>(num_experts_on_rank * ep_size);
-    auto parallelism_config = kernels::MOEParallelismConfig(tp_size, tp_rank, ep_size, ep_rank,
-                                                            cluster_size, cluster_rank);
-    auto activation_type = ActivationType::Swiglu;
+    auto parallelism_config = kernels::MOEParallelismConfig(tp_size, tp_rank, ep_size, ep_rank, cluster_size, cluster_rank);  // min-latency path must keep forwarding the cluster info it receives
+    ActivationType base_activation_type = ActivationType::Swiglu;
+    if (swiglu_alpha.has_value()) {
+      CHECK_INPUT_AND_TYPE(swiglu_alpha.value(), at::ScalarType::Float);
+      TORCH_CHECK(swiglu_alpha.value().sizes()[0] == num_experts_on_rank,
+          "swiglu_alpha must have num_experts_on_rank elements.");
+      base_activation_type = ActivationType::SwigluBias;
+    }
+    if (swiglu_beta.has_value()) {
+      CHECK_INPUT_AND_TYPE(swiglu_beta.value(), at::ScalarType::Float);
+      TORCH_CHECK(swiglu_beta.value().sizes()[0] == num_experts_on_rank,
+          "swiglu_beta must have num_experts_on_rank elements.");
+      base_activation_type = ActivationType::SwigluBias;
+    }
+    if (swiglu_limit.has_value()) {
+      CHECK_INPUT_AND_TYPE(swiglu_limit.value(), at::ScalarType::Float);
+      TORCH_CHECK(swiglu_limit.value().sizes()[0] == num_experts_on_rank,
+          "swiglu_limit must have num_experts_on_rank elements.");
+      base_activation_type = ActivationType::SwigluBias;
+    }
+    auto activation_params = ActivationParams(base_activation_type,
+        reinterpret_cast<float const*>(swiglu_alpha.has_value() ? swiglu_alpha.value().const_data_ptr() : nullptr),
+        reinterpret_cast<float const*>(swiglu_beta.has_value() ? swiglu_beta.value().const_data_ptr() : nullptr),
+        reinterpret_cast<float const*>(swiglu_limit.has_value() ? swiglu_limit.value().const_data_ptr() : nullptr));
 
     setRunnerProfiles(profile_ids);
 
@@ -435,7 +505,7 @@ class FusedMoeRunner : public torch::CustomClassHolder {
 
     WorkspaceInfo workspace_info = getWorkspaceInfo(
         num_rows, hidden_size, inter_size, num_experts_total, static_cast<int>(experts_per_token),
-        activation_type, parallelism_config, min_latency_mode);
+        base_activation_type, parallelism_config, min_latency_mode);
 
     auto const quant_params =
         getQuantParams(num_experts_on_rank, hidden_size, inter_size, quant_scales);
@@ -451,7 +521,7 @@ class FusedMoeRunner : public torch::CustomClassHolder {
             : nullptr,
         fc1_expert_weights.const_data_ptr(),
         fc1_expert_biases.has_value() ? fc1_expert_biases.value().const_data_ptr() : nullptr,
-        activation_type, fc2_expert_weights.const_data_ptr(),
+        activation_params, fc2_expert_weights.const_data_ptr(),
         fc2_expert_biases.has_value() ? fc2_expert_biases.value().const_data_ptr() : nullptr,
         quant_params, num_rows, hidden_size, inter_size, num_experts_total,
         static_cast<int>(experts_per_token),
@@ -468,10 +538,10 @@ class FusedMoeRunner : public torch::CustomClassHolder {
             : nullptr,
         fc1_expert_weights.const_data_ptr(),
         fc1_expert_biases.has_value() ? fc1_expert_biases.value().const_data_ptr() : nullptr,
-        activation_type, fc2_expert_weights.const_data_ptr(),
+        activation_params, fc2_expert_weights.const_data_ptr(),
         fc2_expert_biases.has_value() ? fc2_expert_biases.value().const_data_ptr() : nullptr,
         quant_params, num_rows, hidden_size, inter_size, num_experts_total,
-        static_cast<int>(experts_per_token), static_cast<char*>(workspace_info.workspace),
+        static_cast<int>(experts_per_token), static_cast<char*>(workspace_info.workspace.data_ptr()),
         output.data_ptr(), static_cast<int*>(workspace_info.src_to_dest_map), parallelism_config,
         false, lora_params, mUseDeepSeekFP8BlockScaling, min_latency_mode, min_latency_params,
         stream);
@@ -504,7 +574,8 @@ class FusedMoeRunner : public torch::CustomClassHolder {
     int64_t const num_rows = input.sizes()[0];
     int64_t const hidden_size = fc2_expert_weights.sizes()[1];
     int64_t const inter_size = fc2_expert_weights.sizes()[2] * mInnerDimMultiplier;
-    int64_t const group_size = isInt4Quant() ? 128 : -1;
+    int64_t const group_size_ = isInt4Quant() ? TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::int4_group_size : -1;
+    int64_t const group_size = isWFP4A16Quant() ? TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::wfp4a16_group_size : group_size_;
     int const num_experts = static_cast<int>(fc2_expert_weights.sizes()[0] * ep_size);
 
     // Get specific profile configs according to the profile_id.
@@ -532,7 +603,7 @@ class FusedMoeRunner : public torch::CustomClassHolder {
       bool const USE_BIAS = fc1_expert_biases.has_value() || fc2_expert_biases.has_value();
       bool const USE_LORA = false;
       auto activation_dtype =
-          mUseW4A8GroupScaling ? at::ScalarType::Float8_e4m3fn : mActivationDtype;
+          (mUseW4GroupScaling && !isWFP4A16Quant()) ? at::ScalarType::Float8_e4m3fn : mActivationDtype;
       activation_dtype = isNvfp4Quant() ? at::ScalarType::Long : activation_dtype;
 #ifdef USING_OSS_CUTLASS_MOE_GEMM
       mProfiler->init(*mKernelRunner.get(), mProfiler->mGemmToProfile,
@@ -583,7 +654,7 @@ class FusedMoeRunner : public torch::CustomClassHolder {
   char* mProfileWorkspace = nullptr;
 
   bool mUseDeepSeekFP8BlockScaling = false;
-  bool mUseW4A8GroupScaling = false;
+  bool mUseW4GroupScaling = false;
   bool mUseMxfp8ActScaling = false;
 
   using Profile = tensorrt_llm::cutlass_extensions::CutlassGemmConfig;
@@ -629,7 +700,7 @@ class FusedMoeRunner : public torch::CustomClassHolder {
     size_t moe_workspace_size = mKernelRunner->getWorkspaceSize(
         num_rows, hidden_size, inter_size, num_experts, experts_per_token, activation_type,
         parallelismConfig, /* use_lora */ false, mUseDeepSeekFP8BlockScaling, min_latency_mode,
-        mUseW4A8GroupScaling);
+        mUseW4GroupScaling);
     size_t src_to_dest_map_size = experts_per_token * num_rows * sizeof(int);
 
     std::vector<size_t> workspaces{moe_workspace_size, src_to_dest_map_size};
@@ -781,8 +852,7 @@ class FusedMoeRunner : public torch::CustomClassHolder {
               fc1_weight_block.sizes()[2] * FP8_PER_INT32 *
                       TmaWarpSpecializedGroupedGemmInput::MXFPXBlockScaleVectorSize ==
                   TmaWarpSpecializedGroupedGemmInput::alignToSfDim(
-                      hidden_size, TmaWarpSpecializedGroupedGemmInput::MinKDimAlignmentMXFPX) *
-                      TmaWarpSpecializedGroupedGemmInput::MinKDimAlignmentMXFPX,
+                      hidden_size, TmaWarpSpecializedGroupedGemmInput::MinKDimAlignmentMXFPX),
           "fc1 weight block size must be (num_experts_on_rank, inter_size * 2, hidden_size // 4 // "
           "block_scale_vector_size)");
       TORCH_CHECK(fc1_global.sizes()[0] == num_experts_on_rank,
@@ -888,6 +958,16 @@ class FusedMoeRunner : public torch::CustomClassHolder {
       return kernels::QuantParams::FP8BlockScaling(
           static_cast<float const*>(fc1_scales.data_ptr()),
           static_cast<float const*>(fc2_scales.data_ptr()));
+    } else if (isWFP4A16Quant()) {
+      TORCH_CHECK(quant_scales.has_value(), "Expecting quant scales for W4 quantization");
+      TORCH_CHECK(quant_scales.value().size() == 2, "Expecting 2 quant scales for W4A16 quantization");
+
+      auto& fc1_weight_scales = quant_scales.value()[0];
+      auto& fc2_weight_scales = quant_scales.value()[1];
+      int group_size = TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::wfp4a16_group_size;
+      return kernels::QuantParams::GroupWise(group_size, static_cast<void const*>(fc1_weight_scales.data_ptr()),
+          static_cast<void const*>(fc2_weight_scales.data_ptr()), nullptr, nullptr, nullptr, nullptr, nullptr,
+          nullptr);
     } else if (isInt4Quant()) {
       TORCH_CHECK(quant_scales.has_value(), "Expecting quant scales for INT4 quantization");
       TORCH_CHECK(quant_scales.value().size() == 8,
@@ -900,7 +980,7 @@ class FusedMoeRunner : public torch::CustomClassHolder {
       auto& fc2_weight_zeros = quant_scales.value()[5];
       auto& fc1_alpha = quant_scales.value()[6];
       auto& fc2_alpha = quant_scales.value()[7];
-      int group_size = 128;
+      int group_size = TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::int4_group_size;
       return kernels::QuantParams::GroupWise(
           group_size, static_cast<void const*>(fc1_weight_scales.data_ptr()),
           static_cast<void const*>(fc2_weight_scales.data_ptr()),
@@ -929,6 +1009,10 @@ class FusedMoeRunner : public torch::CustomClassHolder {
            mActivationDtype != c10::ScalarType::Float8_e4m3fn;  // FP8 activation does not use FP4
   }
 
+  bool isWFP4A16Quant() const {
+    return mUseW4GroupScaling && mWeightDtype == c10::ScalarType::Byte;
+  }
+
   bool isInt4Quant() const { return mWeightDtype == c10::ScalarType::QUInt4x2; }
 
   bool isW4AFp8Quant() const {
diff --git a/csrc/nv_internal/cpp/kernels/quantization.cu b/csrc/nv_internal/cpp/kernels/quantization.cu
index a9a91be85..a335a3f23 100644
--- a/csrc/nv_internal/cpp/kernels/quantization.cu
+++ b/csrc/nv_internal/cpp/kernels/quantization.cu
@@ -74,37 +74,36 @@ template void invokeQuantization<__nv_bfloat16>(int8_t* dst, __nv_bfloat16 const
 // MXFP8 Quantization
 
 template <typename T>
-void invokeMxFP8Quantization(int b, int m, int n, T const* input, int64_t* output, int32_t* SFOuput,
-                             FP4QuantizationSFLayout layout, int multiProcessorCount,
-                             cudaStream_t stream) {
-  // Fixed SF_VEC_SIZE as 32
-  static constexpr int SF_VEC_SIZE = 32;
-
-  // Grid, Block size.
-  // Each thread converts 8 values.
-  dim3 block(std::min(int(n / CVT_FP4_ELTS_PER_THREAD), 512));
-  // Get number of blocks per SM (assume we can fully utilize the SM).
-  int const numBlocksPerSM = std::max(1u, 2048u / block.x);
-  dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
-
-  // Launch the cvt kernel.
-  cudaLaunchConfig_t config;
-  config.gridDim = grid;
-  config.blockDim = block;
-  config.dynamicSmemBytes = 0;
-  config.stream = stream;
-  cudaLaunchAttribute attrs[1];
-  attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
-  attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL();
-  config.numAttrs = 1;
-  config.attrs = attrs;
-  cudaLaunchKernelEx(
-      &config,
-      quantize_with_block_size<BlockScaleQuantizationType::FP16_TO_MXFP8, T, SF_VEC_SIZE, true>, b,
-      m, n, input, nullptr, reinterpret_cast<uint32_t*>(output),
-      reinterpret_cast<uint32_t*>(SFOuput), layout);
+void invokeMxFP8Quantization(int b, int m, int n, int padded_n, T const* input, int64_t* output, int32_t* SFOuput,
+    QuantizationSFLayout layout, int multiProcessorCount, cudaStream_t stream)
+{
+    // SF_VEC_SIZE is fixed at 32 for this MXFP8 path.
+    static constexpr int SF_VEC_SIZE = 32;
+
+    // Grid and block sizes.
+    // Each thread converts CVT_ELTS_PER_THREAD values; the block is sized from padded_n.
+    dim3 block(std::min(int(padded_n / CVT_ELTS_PER_THREAD), 512));
+    // Get number of blocks per SM (assume we can fully utilize the SM).
+    int const numBlocksPerSM = std::max(1u, 2048u / block.x);
+    dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
+
+    // Launch the cvt kernel.
+    cudaLaunchConfig_t config;
+    config.gridDim = grid;
+    config.blockDim = block;
+    config.dynamicSmemBytes = 0;
+    config.stream = stream;
+    cudaLaunchAttribute attrs[1];
+    attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
+    attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL();
+    config.numAttrs = 1;
+    config.attrs = attrs;
+    cudaLaunchKernelEx(&config,
+        quantize_with_block_size<BlockScaleQuantizationType::FP16_TO_MXFP8, T, SF_VEC_SIZE, true>, b, m, n, padded_n,
+        input, nullptr, reinterpret_cast<uint32_t*>(output), reinterpret_cast<uint32_t*>(SFOuput), layout);
 }
 
+
 // Do per-token (row) quantization from fp16/bf16/fp32 to int8/fp8_e4m3.
 template <typename T, typename QuantT>
 void invokePerTokenQuantization(QuantT* dst, T const* src, int64_t const numRows,
@@ -166,8 +165,8 @@ INSTANTIATE_INVOKE_PER_TOKEN_QUANTIZATION(__nv_bfloat16, __nv_fp8_e4m3);
 // FP4 Quantization
 
 template <typename T, int SF_VEC_SIZE>
-void invokeFP4Quantization(int m, int n, T const* input, float const* SFScale, int64_t* output,
-                           int32_t* SFOuput, bool useUE8M0, FP4QuantizationSFLayout layout,
+void invokeFP4Quantization(int b, int m, int n, T const* input, float const* SFScale, int64_t* output,
+                           int32_t* SFOuput, bool useUE8M0, QuantizationSFLayout  layout,
                            int multiProcessorCount, cudaStream_t stream) {
 #ifdef ENABLE_FP8
   if constexpr (std::is_same_v<T, __nv_fp8_e4m3>) {
@@ -179,73 +178,27 @@ void invokeFP4Quantization(int m, int n, T const* input, float const* SFScale, i
     dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
 
     // Launch the cvt kernel.
-    auto* kernel_instance =
-        useUE8M0 ? &cvt_fp8_to_fp4<SF_VEC_SIZE, true> : &cvt_fp8_to_fp4<SF_VEC_SIZE, false>;
-    kernel_instance<<<grid, block, 0, stream>>>(m, n, input, SFScale,
-                                                reinterpret_cast<uint64_t*>(output),
-                                                reinterpret_cast<uint32_t*>(SFOuput), layout);
-  } else
-#endif
-  {
-    // Grid, Block size.
-    // Each thread converts 8 values.
-    dim3 block(std::min(int(n / CVT_FP4_ELTS_PER_THREAD), 512));
-    // Get number of blocks per SM (assume we can fully utilize the SM).
-    int const numBlocksPerSM = std::max(1u, 2048u / block.x);
-    dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
+    auto* kernel_instance = useUE8M0
+    ? &quantize_with_block_size<BlockScaleQuantizationType::FP8_TO_FP4, T, SF_VEC_SIZE, true>
+    : &quantize_with_block_size<BlockScaleQuantizationType::FP8_TO_FP4, T, SF_VEC_SIZE, false>;
+kernel_instance<<<grid, block, 0, stream>>>(b, m, n, n, input, SFScale, reinterpret_cast<uint32_t*>(output),
+    reinterpret_cast<uint32_t*>(SFOuput), layout);
 
-    // Launch the cvt kernel.
-    auto* kernel_instance =
-        useUE8M0 ? &cvt_fp16_to_fp4<T, SF_VEC_SIZE, true> : &cvt_fp16_to_fp4<T, SF_VEC_SIZE, false>;
-    cudaLaunchConfig_t config;
-    config.gridDim = grid;
-    config.blockDim = block;
-    config.dynamicSmemBytes = 0;
-    config.stream = stream;
-    cudaLaunchAttribute attrs[1];
-    attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
-    attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL();
-    config.numAttrs = 1;
-    config.attrs = attrs;
-    cudaLaunchKernelEx(&config, kernel_instance, m, n, input, SFScale,
-                       reinterpret_cast<uint32_t*>(output), reinterpret_cast<uint32_t*>(SFOuput),
-                       layout);
-  }
-}
-
-template <typename T, int SF_VEC_SIZE>
-void invokeBatchedFP4Quantization(int b, int m, int n, T const* input, float const* SFScale,
-                                  int64_t* output, int32_t* SFOuput, bool useUE8M0,
-                                  int multiProcessorCount, FP4QuantizationSFLayout layout,
-                                  cudaStream_t stream) {
-#ifdef ENABLE_FP8
-  if constexpr (std::is_same_v<T, __nv_fp8_e4m3>) {
-    // Grid, Block size.
-    // Each thread converts 16 values.
-    dim3 block(std::min(int(n / CVT_FP8_TO_FP4_ELTS_PER_THREAD), 512));
-    // Get number of blocks per SM (assume we can fully utilize the SM).
-    int const numBlocksPerSM = std::max(1u, 2048u / block.x);
-    dim3 grid(std::min(m, multiProcessorCount * numBlocksPerSM));
-
-    // Launch the cvt kernel.
-    auto* kernel_instance =
-        useUE8M0 ? &cvt_fp8_to_fp4_3d<SF_VEC_SIZE, true> : &cvt_fp8_to_fp4_3d<SF_VEC_SIZE, false>;
-    kernel_instance<<<grid, block, 0, stream>>>(b, m, n, input, SFScale,
-                                                reinterpret_cast<uint32_t*>(output),
-                                                reinterpret_cast<uint32_t*>(SFOuput), layout);
   } else
 #endif
   {
     // Grid, Block size.
     // Each thread converts 8 values.
-    dim3 block(std::min(int(n / CVT_FP4_ELTS_PER_THREAD), 512));
+    dim3 block(std::min(int(n / CVT_ELTS_PER_THREAD), 512));
     // Get number of blocks per SM (assume we can fully utilize the SM).
     int const numBlocksPerSM = std::max(1u, 2048u / block.x);
     dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
 
     // Launch the cvt kernel.
-    auto* kernel_instance = useUE8M0 ? &cvt_fp16_to_fp4_3d<T, SF_VEC_SIZE, true>
-                                     : &cvt_fp16_to_fp4_3d<T, SF_VEC_SIZE, false>;
+    auto* kernel_instance = useUE8M0
+    ? &quantize_with_block_size<BlockScaleQuantizationType::FP16_TO_FP4, T, SF_VEC_SIZE, true>
+    : &quantize_with_block_size<BlockScaleQuantizationType::FP16_TO_FP4, T, SF_VEC_SIZE, false>;
+
     cudaLaunchConfig_t config;
     config.gridDim = grid;
     config.blockDim = block;
@@ -256,16 +209,14 @@ void invokeBatchedFP4Quantization(int b, int m, int n, T const* input, float con
     attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL();
     config.numAttrs = 1;
     config.attrs = attrs;
-    cudaLaunchKernelEx(&config, kernel_instance, b, m, n, input, SFScale,
-                       reinterpret_cast<uint32_t*>(output), reinterpret_cast<uint32_t*>(SFOuput),
+    cudaLaunchKernelEx(&config, kernel_instance, b, m, n, n, input, SFScale, reinterpret_cast<uint32_t*>(output), reinterpret_cast<uint32_t*>(SFOuput),
                        layout);
   }
 }
 
-__global__ void nvfp4_block_scale_interleave_kernel(int numBatches, int numRows, int numRowsPadded,
+__global__ void block_scale_interleave_kernel(int numBatches, int numRows, int numRowsPadded,
                                                     int numCols, int numColsPadded,
                                                     uint8_t const* SFIn, uint8_t* SFOutput) {
-  constexpr int SF_VEC_SIZE = 16;
   for (int rowIdx = blockIdx.x; rowIdx < numRowsPadded; rowIdx += gridDim.x) {
     for (int batchIdx = 0; batchIdx < numBatches; batchIdx++) {
       for (int colIdx = threadIdx.x; colIdx < numColsPadded; colIdx += blockDim.x) {
@@ -282,18 +233,16 @@ __global__ void nvfp4_block_scale_interleave_kernel(int numBatches, int numRows,
         // int const numSfTilesK = (numCols + 4 - 1) / 4;
         // int const tileOffset = ((mi / 128) * numSfTilesK + ki / 4) * 512;
         // int const dstIdx = tileOffset + (mi % 32) * 16 + ((mi % 128) / 32) * 4 + ki % 4;
-        auto dstIdx = get_sf_out_offset_128x4<SF_VEC_SIZE>(batchIdxOpt, rowIdx, colIdx, numRowsOpt,
-                                                           numCols * SF_VEC_SIZE);
+        auto dstIdx = get_sf_out_offset_128x4(batchIdxOpt, rowIdx, colIdx, numRowsOpt, numCols);
         SFOutput[dstIdx] = sf;
       }
     }
   }
 }
 
-__global__ void nvfp4_block_scale_interleave_reverse_kernel(int numBatches, int numRows,
+__global__ void block_scale_interleave_reverse_kernel(int numBatches, int numRows,
                                                             int numCols, uint8_t const* SFIn,
                                                             uint8_t* SFOutput) {
-  constexpr int SF_VEC_SIZE = 16;
   for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
     for (int batchIdx = 0; batchIdx < numBatches; batchIdx++) {
       for (int colIdx = threadIdx.x; colIdx < numCols; colIdx += blockDim.x) {
@@ -301,8 +250,7 @@ __global__ void nvfp4_block_scale_interleave_reverse_kernel(int numBatches, int
         std::optional<int> numRowsOpt = numRows;
 
         // Get the swizzled input index using the same swizzling pattern
-        auto srcIdx = get_sf_out_offset_128x4<SF_VEC_SIZE>(batchIdxOpt, rowIdx, colIdx, numRowsOpt,
-                                                           numCols * SF_VEC_SIZE);
+        auto srcIdx = get_sf_out_offset_128x4(batchIdxOpt, rowIdx, colIdx, numRowsOpt, numCols);
         auto sf = SFIn[srcIdx];
 
         // Output goes to linear layout
@@ -314,7 +262,7 @@ __global__ void nvfp4_block_scale_interleave_reverse_kernel(int numBatches, int
 }
 
 // This is intended for weight loading, so m and n are large, b <= 256
-void invokeNVFP4BlockScaleInterleave(int b, int m, int m_padded, int n, int n_padded,
+void invokeBlockScaleInterleave(int b, int m, int m_padded, int n, int n_padded,
                                      uint8_t const* SFIn, uint8_t* SFOutput,
                                      int multiProcessorCount, cudaStream_t stream) {
   // Each thread reads 1 int8 value
@@ -323,12 +271,11 @@ void invokeNVFP4BlockScaleInterleave(int b, int m, int m_padded, int n, int n_pa
   int const numBlocksPerSM = std::max(1u, 4096u / block.x);
   dim3 grid(std::min(m_padded, multiProcessorCount * numBlocksPerSM));
 
-  nvfp4_block_scale_interleave_kernel<<<grid, block, 0, stream>>>(b, m, m_padded, n, n_padded, SFIn,
-                                                                  SFOutput);
+  block_scale_interleave_kernel<<<grid, block, 0, stream>>>(b, m, m_padded, n, n_padded, SFIn, SFOutput);
 }
 
 // This is intended for weight loading, so m and n are large, b <= 256
-void invokeNVFP4BlockScaleInterleaveReverse(int b, int m, int n, uint8_t const* SFIn,
+void invokeBlockScaleInterleaveReverse(int b, int m, int n, uint8_t const* SFIn,
                                             uint8_t* SFOutput, int multiProcessorCount,
                                             cudaStream_t stream) {
   // Each thread reads 1 int8 value
@@ -337,77 +284,54 @@ void invokeNVFP4BlockScaleInterleaveReverse(int b, int m, int n, uint8_t const*
   int const numBlocksPerSM = std::max(1u, 4096u / block.x);
   dim3 grid(std::min(m, multiProcessorCount * numBlocksPerSM));
 
-  nvfp4_block_scale_interleave_reverse_kernel<<<grid, block, 0, stream>>>(b, m, n, SFIn, SFOutput);
+  block_scale_interleave_reverse_kernel<<<grid, block, 0, stream>>>(b, m, n, SFIn, SFOutput);
 }
 
 // Instantiate the function.
-template void invokeFP4Quantization<half, 16>(int m, int n, half const* input, float const* SFScale,
+template void invokeFP4Quantization<half, 16>(int b, int m, int n, half const* input, float const* SFScale,
                                               int64_t* output, int32_t* SFOuput, bool useUE8M0,
-                                              FP4QuantizationSFLayout layout,
+                                              QuantizationSFLayout  layout,
                                               int multiProcessorCount, cudaStream_t stream);
-template void invokeFP4Quantization<half, 32>(int m, int n, half const* input, float const* SFScale,
+template void invokeFP4Quantization<half, 32>(int b, int m, int n, half const* input, float const* SFScale,
                                               int64_t* output, int32_t* SFOuput, bool useUE8M0,
-                                              FP4QuantizationSFLayout layout,
+                                              QuantizationSFLayout  layout,
                                               int multiProcessorCount, cudaStream_t stream);
-template void invokeBatchedFP4Quantization<half, 16>(
-    int b, int m, int n, half const* input, float const* SFScale, int64_t* output, int32_t* SFOuput,
-    bool useUE8M0, int multiProcessorCount, FP4QuantizationSFLayout layout, cudaStream_t stream);
-template void invokeBatchedFP4Quantization<half, 32>(
-    int b, int m, int n, half const* input, float const* SFScale, int64_t* output, int32_t* SFOuput,
-    bool useUE8M0, int multiProcessorCount, FP4QuantizationSFLayout layout, cudaStream_t stream);
-template void invokeMxFP8Quantization<half>(int b, int m, int n, half const* input, int64_t* output,
-                                            int32_t* SFOuput, FP4QuantizationSFLayout layout,
-                                            int multiProcessorCount, cudaStream_t stream);
+template void invokeMxFP8Quantization<half>(int b, int m, int n, int padded_n, half const* input, int64_t* output,
+                                                int32_t* SFOuput, QuantizationSFLayout layout, int multiProcessorCount, cudaStream_t stream);
+                                            
 #ifdef ENABLE_BF16
-template void invokeFP4Quantization<__nv_bfloat16, 16>(int m, int n, __nv_bfloat16 const* input,
+template void invokeFP4Quantization<__nv_bfloat16, 16>(int b, int m, int n, __nv_bfloat16 const* input,
                                                        float const* SFScale, int64_t* output,
                                                        int32_t* SFOuput, bool useUE8M0,
-                                                       FP4QuantizationSFLayout layout,
+                                                       QuantizationSFLayout  layout,
                                                        int multiProcessorCount,
                                                        cudaStream_t stream);
-template void invokeFP4Quantization<__nv_bfloat16, 32>(int m, int n, __nv_bfloat16 const* input,
+template void invokeFP4Quantization<__nv_bfloat16, 32>(int b, int m, int n, __nv_bfloat16 const* input,
                                                        float const* SFScale, int64_t* output,
                                                        int32_t* SFOuput, bool useUE8M0,
-                                                       FP4QuantizationSFLayout layout,
+                                                       QuantizationSFLayout  layout,
                                                        int multiProcessorCount,
                                                        cudaStream_t stream);
-template void invokeBatchedFP4Quantization<__nv_bfloat16, 16>(
-    int b, int m, int n, __nv_bfloat16 const* input, float const* SFScale, int64_t* output,
-    int32_t* SFOuput, bool useUE8M0, int multiProcessorCount, FP4QuantizationSFLayout layout,
-    cudaStream_t stream);
-template void invokeBatchedFP4Quantization<__nv_bfloat16, 32>(
-    int b, int m, int n, __nv_bfloat16 const* input, float const* SFScale, int64_t* output,
-    int32_t* SFOuput, bool useUE8M0, int multiProcessorCount, FP4QuantizationSFLayout layout,
-    cudaStream_t stream);
-template void invokeMxFP8Quantization<__nv_bfloat16>(int b, int m, int n,
-                                                     __nv_bfloat16 const* input, int64_t* output,
-                                                     int32_t* SFOuput,
-                                                     FP4QuantizationSFLayout layout,
-                                                     int multiProcessorCount, cudaStream_t stream);
+template void invokeMxFP8Quantization<__nv_bfloat16>(int b, int m, int n, int padded_n, __nv_bfloat16 const* input,
+                                                        int64_t* output, int32_t* SFOuput, QuantizationSFLayout layout, int multiProcessorCount, cudaStream_t stream);
+                                                    
 
 #endif
 
 #ifdef ENABLE_FP8
-template void invokeFP4Quantization<__nv_fp8_e4m3, 16>(int m, int n, __nv_fp8_e4m3 const* input,
+template void invokeFP4Quantization<__nv_fp8_e4m3, 16>(int b, int m, int n, __nv_fp8_e4m3 const* input,
                                                        float const* SFScale, int64_t* output,
                                                        int32_t* SFOuput, bool useUE8M0,
-                                                       FP4QuantizationSFLayout layout,
+                                                       QuantizationSFLayout  layout,
                                                        int multiProcessorCount,
                                                        cudaStream_t stream);
-template void invokeFP4Quantization<__nv_fp8_e4m3, 32>(int m, int n, __nv_fp8_e4m3 const* input,
+template void invokeFP4Quantization<__nv_fp8_e4m3, 32>(int b, int m, int n, __nv_fp8_e4m3 const* input,
                                                        float const* SFScale, int64_t* output,
                                                        int32_t* SFOuput, bool useUE8M0,
-                                                       FP4QuantizationSFLayout layout,
+                                                       QuantizationSFLayout  layout,
                                                        int multiProcessorCount,
                                                        cudaStream_t stream);
-template void invokeBatchedFP4Quantization<__nv_fp8_e4m3, 16>(
-    int b, int m, int n, __nv_fp8_e4m3 const* input, float const* SFScale, int64_t* output,
-    int32_t* SFOuput, bool useUE8M0, int multiProcessorCount, FP4QuantizationSFLayout layout,
-    cudaStream_t stream);
-template void invokeBatchedFP4Quantization<__nv_fp8_e4m3, 32>(
-    int b, int m, int n, __nv_fp8_e4m3 const* input, float const* SFScale, int64_t* output,
-    int32_t* SFOuput, bool useUE8M0, int multiProcessorCount, FP4QuantizationSFLayout layout,
-    cudaStream_t stream);
+
 #endif
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/include/common.h b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/include/common.h
index 3c3a70662..5b90fbe76 100644
--- a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/include/common.h
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/include/common.h
@@ -19,6 +19,6 @@
 namespace tensorrt_llm::kernels::cutlass_kernels {
 
 // Note update moe.py to match
-enum class ActivationType { Gelu = 0, Relu, Silu, Swiglu, Geglu, Identity, InvalidType };
+enum class ActivationType { Gelu = 0, Relu, Silu, Swiglu, Geglu, SwigluBias, Identity, InvalidType };
 
 }  // namespace tensorrt_llm::kernels::cutlass_kernels
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h
index 20d7ec4f8..7e3f01031 100644
--- a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h
@@ -203,8 +203,10 @@ struct TmaWarpSpecializedGroupedGemmInput {
   FpXBlockScalingType fpX_block_scaling_type = FpXBlockScalingType::NONE;
 
   struct INT4GroupwiseParams {
-    constexpr static int group_size = 128;  // Unused, hard-coded to 128
+    constexpr static int int4_group_size = 128;
+    constexpr static int wfp4a16_group_size = 32;
     bool enabled = false;
+    bool use_wfp4a16 = false;
     using SFA = __nv_bfloat16;
     using SFB = __nv_bfloat16;  // Unused
     using ProblemShapeInt = cutlass::gemm::GroupProblemShape<cute::Shape<int, int, int>>;
@@ -244,7 +246,7 @@ struct TmaWarpSpecializedGroupedGemmInput {
 };
 
 constexpr bool isGatedActivation(ActivationType activation_type) {
-  return activation_type == ActivationType::Swiglu || activation_type == ActivationType::Geglu;
+  return activation_type == ActivationType::Swiglu || activation_type == ActivationType::Geglu || activation_type == ActivationType::SwigluBias;
 }
 
 template <typename T,                         /*The type used for activations/scales/compute*/
@@ -255,6 +257,13 @@ template <typename T,                         /*The type used for activations/sc
 class MoeGemmRunner {
  public:
   MoeGemmRunner();
+  
+#if defined(ENABLE_BF16)
+  static constexpr bool use_wfp4a16
+      = std::is_same_v<WeightType, __nv_fp4_e2m1> && (std::is_same_v<T, half> || std::is_same_v<T, __nv_bfloat16>);
+#else
+  static constexpr bool use_wfp4a16 = std::is_same_v<WeightType, __nv_fp4_e2m1> && std::is_same_v<T, half>;
+#endif
 
 #if defined(ENABLE_FP8)
   static constexpr bool use_fp8 =
@@ -271,6 +280,7 @@ class MoeGemmRunner {
   static constexpr bool use_w4afp8 = false;
   static constexpr bool use_wfp4afp4 = false;
 #endif
+  static constexpr bool use_w4_groupwise = use_w4afp8 || use_wfp4a16;
 
 #if defined(ENABLE_FP4)
   static constexpr bool use_fp4 = std::is_same_v<T, __nv_fp4_e2m1>;
@@ -296,10 +306,8 @@ class MoeGemmRunner {
 
   [[nodiscard]] bool isTmaWarpSpecialized(cutlass_extensions::CutlassGemmConfig gemm_config) const;
   [[nodiscard]] bool supportsTmaWarpSpecialized() const;
-  [[nodiscard]] bool isFusedGatedActivation(cutlass_extensions::CutlassGemmConfig gemm_config,
-                                            bool is_gated_activation, int gemm_n, int gemm_k) const;
-  [[nodiscard]] bool supportsFusedGatedActivation(bool is_gated_activation, int gemm_n,
-                                                  int gemm_k) const;
+  [[nodiscard]] bool isFusedGatedActivation(cutlass_extensions::CutlassGemmConfig gemm_config, ActivationType activation_type, int gemm_n, int gemm_k) const;
+  [[nodiscard]] bool supportsFusedGatedActivation(ActivationType activation_type, int gemm_n, int gemm_k) const;
 
   size_t getMaxWorkspaceSize(int num_experts) const;
 
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h
index ce39fa52e..6367120a8 100644
--- a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h
@@ -24,7 +24,7 @@
 #ifdef ENABLE_FP4
 #include <cuda_fp4.h>
 #endif
-
+#include "tensorrt_llm/common/NvInferRuntime.h"
 #include <cuda_runtime_api.h>
 
 #include <array>
@@ -32,8 +32,6 @@
 #include <optional>
 #include <random>
 #include <utility>
-
-#include "tensorrt_llm/common/NvInferRuntime.h"
 namespace tensorrt_llm::kernels {
 // Change to following declarations must sync with lora.h in public repo
 class LoraImpl;
@@ -86,6 +84,63 @@ struct LoraParams {
 };
 
 namespace cutlass_kernels {
+  static inline size_t pad_to_multiple_of_16(size_t const& input)
+  {
+      static constexpr int ALIGNMENT = 16;
+      return ALIGNMENT * ((input + ALIGNMENT - 1) / ALIGNMENT);
+  }
+  
+  class CubKeyValueSorter
+  {
+  public:
+      CubKeyValueSorter();
+  
+      CubKeyValueSorter(int const num_experts_per_node);
+  
+      void updateNumExperts(int const num_experts_per_node);
+  
+      static size_t getWorkspaceSize(size_t const num_key_value_pairs, int const num_experts_per_node);
+  
+      void run(void* workspace, size_t const workspace_size, int const* keys_in, int* keys_out, int const* values_in,
+          int* values_out, size_t const num_key_value_pairs, cudaStream_t stream);
+  
+  private:
+      static int expertsToBits(int experts);
+      int num_experts_;
+      int num_bits_;
+  };
+  
+  struct ActivationParams
+  {
+      ActivationType activation_type;
+      float const* swiglu_alpha = nullptr;
+      float const* swiglu_beta = nullptr;
+      float const* swiglu_limit = nullptr;
+  
+      explicit ActivationParams(ActivationType activation_type)
+          : activation_type(activation_type)
+      {
+          TLLM_CHECK_WITH_INFO(activation_type != ActivationType::SwigluBias,
+              "SwigluBias is not supported in ActivationParams without swiglu_alpha and swiglu_beta");
+      }
+  
+      ActivationParams(
+          ActivationType activation_type, float const* swiglu_alpha, float const* swiglu_beta, float const* swiglu_limit)
+          : activation_type(activation_type)
+          , swiglu_alpha(swiglu_alpha)
+          , swiglu_beta(swiglu_beta)
+          , swiglu_limit(swiglu_limit)
+      {
+      }
+  
+      // TODO: port all callers to ActivationParams and remove this implicit conversion to ActivationType.
+      operator ActivationType() const
+      {
+          return activation_type;
+      }
+  };
+  
+
 /**
  * \brief Describes what parallelism mode the MoE is using
  *
@@ -384,7 +439,7 @@ class CutlassMoeFCRunnerInterface {
   virtual void runMoe(void const* input_activations, void const* input_sf,
                       int const* token_selected_experts, float const* token_final_scales,
                       void const* fc1_expert_weights, void const* fc1_expert_biases,
-                      ActivationType fc1_activation_type, void const* fc2_expert_weights,
+                      ActivationParams fc1_activation_type, void const* fc2_expert_weights,
                       void const* fc2_expert_biases, QuantParams quant_params,
                       int64_t const num_rows, int64_t const hidden_size, int64_t const inter_size,
                       int const num_experts, int const experts_per_token, char* workspace_ptr,
@@ -406,7 +461,7 @@ class CutlassMoeFCRunnerInterface {
                      QuantParams quant_params, int64_t const num_rows,
                      int64_t const expanded_num_rows, int64_t const hidden_size,
                      int64_t const inter_size, int const num_experts_per_node,
-                     ActivationType fc1_activation_type, float const** alpha_scale_ptr_array,
+                     ActivationParams fc1_activation_type, float const** alpha_scale_ptr_array,
                      bool bias_is_broadcast, bool use_deepseek_fp8_block_scale, cudaStream_t stream,
                      cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode,
                      int* num_active_experts_per, int* active_expert_global_ids) = 0;
@@ -478,10 +533,18 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface {
       tensorrt_llm::kernels::fp8_blockscale_gemm::CutlassFp8BlockScaleGemmRunnerInterface;
   using ScaleBiasType = BackBoneType;
   using Self = CutlassMoeFCRunner<T, WeightType, OutputType, InputType, BackBoneType>;
+
+#if defined(ENABLE_BF16)
+  static constexpr bool use_wfp4a16
+      = std::is_same_v<WeightType, __nv_fp4_e2m1> && (std::is_same_v<T, half> || std::is_same_v<T, __nv_bfloat16>);
+#else
+  static constexpr bool use_wfp4a16 = std::is_same_v<WeightType, __nv_fp4_e2m1> && std::is_same_v<T, half>;
+#endif
+
 #if defined(ENABLE_FP8)
   static constexpr bool use_fp8 =
-      (std::is_same_v<T, __nv_fp8_e4m3> || std::is_same_v<T, __nv_fp8_e5m2>) &&
-      !std::is_same_v<WeightType, cutlass::uint4b_t>;
+      (std::is_same_v<T, __nv_fp8_e4m3> ||
+       std::is_same_v<T, __nv_fp8_e5m2>)&&!std::is_same_v<WeightType, cutlass::uint4b_t>;
   static constexpr bool use_w4afp8 =
       std::is_same_v<WeightType, cutlass::uint4b_t> && std::is_same_v<T, __nv_fp8_e4m3>;
   static_assert(!std::is_same_v<BackBoneType, __nv_fp8_e4m3>,
@@ -492,6 +555,7 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface {
   static constexpr bool use_fp8 = false;
   static constexpr bool use_w4afp8 = false;
 #endif
+  static constexpr bool use_w4_groupwise = use_w4afp8 || use_wfp4a16;
 #if defined(ENABLE_FP4)
   static constexpr bool act_fp4 = std::is_same_v<T, __nv_fp4_e2m1>;
   static constexpr bool weight_fp4 = std::is_same_v<WeightType, __nv_fp4_e2m1>;
@@ -551,7 +615,7 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface {
   void runMoe(void const* input_activations, void const* input_sf,
               int const* token_selected_experts, float const* token_final_scales,
               void const* fc1_expert_weights, void const* fc1_expert_biases,
-              ActivationType fc1_activation_type, void const* fc2_expert_weights,
+              ActivationParams fc1_activation_type, void const* fc2_expert_weights,
               void const* fc2_expert_biases, QuantParams quant_params, int64_t const num_rows,
               int64_t const hidden_size, int64_t const inter_size, int const num_experts,
               int const experts_per_token, char* workspace_ptr, void* final_output,
@@ -578,7 +642,7 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface {
       TmaWarpSpecializedGroupedGemmInput::ElementSF const* fc1_fp4_act_flat,
       TmaWarpSpecializedGroupedGemmInput::ElementSF* fc2_fp4_act_flat, QuantParams quant_params,
       int64_t const num_rows, int64_t const expanded_num_rows, int64_t const hidden_size,
-      int64_t const inter_size, int const num_experts_per_node, ActivationType fc1_activation_type,
+      int64_t const inter_size, int const num_experts_per_node, ActivationParams fc1_activation_type,
       float const** alpha_scale_ptr_array, bool bias_is_broadcast, cudaStream_t stream,
       cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode,
       int* num_active_experts_per, int* active_expert_global_ids);
@@ -615,7 +679,7 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface {
              TmaWarpSpecializedGroupedGemmInput::ElementSF* fc2_fp4_act_flat,
              QuantParams quant_params, int64_t const num_rows, int64_t const expanded_num_rows,
              int64_t const hidden_size, int64_t const inter_size, int const num_experts_per_node,
-             ActivationType fc1_activation_type, float const** alpha_scale_ptr_array,
+             ActivationParams fc1_activation_type, float const** alpha_scale_ptr_array,
              bool bias_is_broadcast, bool use_deepseek_fp8_block_scale, cudaStream_t stream,
              cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode,
              int* num_active_experts_per, int* active_expert_global_ids) override {
@@ -721,7 +785,7 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface {
  private:
   std::pair<TmaWarpSpecializedGroupedGemmInput, TmaWarpSpecializedGroupedGemmInput>
   setupTmaWarpSpecializedInputs(int64_t num_rows, int64_t expanded_num_rows,
-                                ActivationType fc1_activation_type, int64_t hidden_size,
+                                ActivationParams fc1_activation_type, int64_t hidden_size,
                                 int64_t inter_size, int64_t num_experts_per_node,
                                 void const* input_activations_void,
                                 TmaWarpSpecializedGroupedGemmInput::ElementSF const* input_sf,
@@ -778,7 +842,7 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface {
 
   bool mayHaveFinalizeFused() const {
     return moe_gemm_runner_.supportsTmaWarpSpecialized() && moe_gemm_runner_.getSM() == 90 &&
-           !use_deterministic_hopper_reduce_ && !use_w4afp8;
+           !use_deterministic_hopper_reduce_ && !use_w4_groupwise;
   }
 
   // TODO: This should eventually take the quant params to give more flexibility
@@ -811,7 +875,7 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface {
       WeightType const* const fc1_expert_weights, ScaleBiasType const* const fc1_expert_biases,
       float const* const fc2_fp8_quant, int64_t const num_rows, int64_t const expanded_num_rows,
       int64_t const hidden_size, int64_t const inter_size, int const num_experts_per_node,
-      ActivationType fc1_activation_type, QuantParams& quant_params, cudaStream_t stream);
+      ActivationParams fc1_activation_type, QuantParams& quant_params, cudaStream_t stream);
 
   static void BlockScaleFC2(
       DeepSeekBlockScaleGemmRunner& gemm_runner, T const* const input, void* const gemm_output,
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl
index 56bf35b79..c2526ae69 100644
--- a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl
@@ -73,16 +73,15 @@ void sm90_generic_mixed_moe_gemm_kernelLauncher(
   /////////////////////////////////////////////////////////////////////////////////////////////////
 
   // A matrix configuration
-  // using ElementA = typename TllmToCutlassTypeAdapter<T>::type;
-  using ElementA = cutlass::float_e4m3_t;
+  using ElementA = typename TllmToCutlassTypeAdapter<T>::type;
   using LayoutA = cutlass::layout::RowMajor;  // Layout type for A matrix operand
   constexpr int AlignmentA =
       128 / cutlass::sizeof_bits<ElementA>::value;  // Alignment of A matrix in units of elements
                                                     // (up to 16 bytes)
 
   // B matrix configuration
-  // using ElementB = typename TllmToCutlassTypeAdapter<WeightType>::type;
-  using ElementB = typename cutlass::int4b_t;
+  using ElementB_ = typename TllmToCutlassTypeAdapter<WeightType>::type;
+  using ElementB = std::conditional_t<std::is_same_v<WeightType, cutlass::uint4b_t>, cutlass::int4b_t, ElementB_>;
   using LayoutB = cutlass::layout::ColumnMajor;  // Layout type for B matrix operand
   constexpr int AlignmentB =
       128 / cutlass::sizeof_bits<ElementB>::value;  // Memory access granularity/alignment of B
@@ -97,9 +96,13 @@ void sm90_generic_mixed_moe_gemm_kernelLauncher(
   using StrideB = cute::remove_pointer_t<cutlass::detail::TagToStrideB_t<LayoutB*>>;
 
   // Scale configuration
-  constexpr int PackedScalesNum = get<2>(CTAShape{}) / 128;
-  using ElementScalePacked =
-      cutlass::Array<TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::SFA, PackedScalesNum>;
+  constexpr bool use_wfp4a16 = std::is_same_v<ElementB, cutlass::float_e2m1_t>;
+  constexpr int group_size = use_wfp4a16 ? cutlass::gemm::collective::detail::mxfp4_group_size
+                                         : cutlass::gemm::collective::detail::int4_group_size;
+  constexpr int PackedScalesNum = get<2>(CTAShape{}) / group_size;
+  using ElementScale = std::conditional_t<use_wfp4a16, cutlass::float_ue8m0_t,
+      TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::SFA>;
+  using ElementScalePacked = cutlass::Array<ElementScale, PackedScalesNum>;
   using LayoutScale = cutlass::layout::RowMajor;
 
   // C/D matrix configuration
@@ -164,20 +167,21 @@ void sm90_generic_mixed_moe_gemm_kernelLauncher(
   Args arguments;
 
   decltype(arguments.epilogue.thread) fusion_args;
-  fusion_args.alpha = 0;
+  fusion_args.alpha = use_wfp4a16 ? 1 : 0;
   fusion_args.beta = 0;
   fusion_args.alpha_ptr = nullptr;
   fusion_args.beta_ptr = nullptr;
-  fusion_args.alpha_ptr_array = inputs.alpha_scales;
+  fusion_args.alpha_ptr_array = use_wfp4a16 ? nullptr : inputs.alpha_scales;
   fusion_args.beta_ptr_array = nullptr;
   // One alpha and beta per each group
-  fusion_args.dAlpha = {cute::_0{}, cute::_0{}, 1};
-  fusion_args.dBeta = {cute::_0{}, cute::_0{}, 1};
+  fusion_args.dAlpha = {cute::_0{}, cute::_0{}, use_wfp4a16 ? 0 : 1};
+  fusion_args.dBeta = {cute::_0{}, cute::_0{}, use_wfp4a16 ? 0 : 1};
 
   cutlass::KernelHardwareInfo hw_info;
   hw_info.device_id = 0;
   hw_info.sm_count = sm_count_;
 
+  assert(workspace_size != nullptr || group_size == int(inputs.groupwise_quant_group_size));  // size queries pass default inputs
   if (workspace_size != nullptr) {
     const Args args{
         cutlass::gemm::GemmUniversalMode::kGrouped,
@@ -185,7 +189,7 @@ void sm90_generic_mixed_moe_gemm_kernelLauncher(
         {reinterpret_cast<ElementB const**>(hopper_inputs.ptr_b), hopper_inputs.stride_b,
          reinterpret_cast<ElementA const**>(hopper_inputs.ptr_a), hopper_inputs.stride_a,
          reinterpret_cast<ElementScalePacked const**>(hopper_inputs.int4_groupwise_params.ptr_s_a),
-         hopper_inputs.int4_groupwise_params.stride_s_a, int(inputs.groupwise_quant_group_size)},
+         hopper_inputs.int4_groupwise_params.stride_s_a, group_size},
         {fusion_args, reinterpret_cast<ElementC const**>(hopper_inputs.ptr_c),
          hopper_inputs.stride_c, reinterpret_cast<ElementD**>(hopper_inputs.default_epilogue.ptr_d),
          hopper_inputs.default_epilogue.stride_d},
@@ -200,7 +204,7 @@ void sm90_generic_mixed_moe_gemm_kernelLauncher(
       {reinterpret_cast<ElementB const**>(hopper_inputs.ptr_b), hopper_inputs.stride_b,
        reinterpret_cast<ElementA const**>(hopper_inputs.ptr_a), hopper_inputs.stride_a,
        reinterpret_cast<ElementScalePacked const**>(hopper_inputs.int4_groupwise_params.ptr_s_a),
-       hopper_inputs.int4_groupwise_params.stride_s_a, int(inputs.groupwise_quant_group_size)},
+       hopper_inputs.int4_groupwise_params.stride_s_a, group_size},
       {fusion_args, reinterpret_cast<ElementC const**>(hopper_inputs.ptr_c), hopper_inputs.stride_c,
        reinterpret_cast<ElementD**>(hopper_inputs.default_epilogue.ptr_d),
        hopper_inputs.default_epilogue.stride_d},
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_fp4.cu b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_fp4.cu
new file mode 100644
index 000000000..c0b9159db
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_fp4.cu
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h"
+
+namespace tensorrt_llm::kernels::cutlass_kernels
+{
+#ifdef ENABLE_BF16
+template class MoeGemmRunner<__nv_bfloat16, __nv_fp4_e2m1, __nv_bfloat16>; // explicit instantiation: T, WeightType, OutputType
+#endif // ENABLE_BF16
+} // namespace tensorrt_llm::kernels::cutlass_kernels
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_fp4.cu b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_fp4.cu
new file mode 100644
index 000000000..1da91c2de
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_fp4.cu
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h"
+
+namespace tensorrt_llm::kernels::cutlass_kernels
+{
+template class MoeGemmRunner<half, __nv_fp4_e2m1, half>; // explicit instantiation: T, WeightType, OutputType
+} // namespace tensorrt_llm::kernels::cutlass_kernels
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h
index 2c5dde525..a00b71f8a 100644
--- a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h
@@ -96,6 +96,7 @@ struct genericMoeGemmKernelLauncher {
 
     static_assert(cutlass::platform::is_same<T, WeightType>::value ||
                   cutlass::platform::is_same<WeightType, uint8_t>::value ||
+                  cutlass::platform::is_same<WeightType, __nv_fp4_e2m1>::value ||
                   cutlass::platform::is_same<WeightType, cutlass::uint4b_t>::value);
 
     static_assert(arch::kMinComputeCapability < 90,
@@ -550,7 +551,7 @@ MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>::getAmpereConfigs(int sm
       weight_only_flag | simt_only_flag | grouped_gemm_flag | enable_hopper | fp8_only_flag);
 
   if (!kernels::cutlass_kernels::isValidAmpereMOESpecialisation<T, WeightType>() ||
-      (use_w4afp8 && sm != 89)) {
+      (use_w4afp8 && sm != 89) || use_wfp4a16) {
     return {};
   }
 
@@ -630,18 +631,16 @@ int MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>::getSM() const {
 // currently support sm80 bf16/fp16 gate activation, only set predication tensor for m direction
 template <typename T, typename WeightType, typename OutputType, typename ScaleBiasType>
 bool MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>::supportsFusedGatedActivation(
-    bool is_gated_activation, int gemm_n, int gemm_k) const {
+  ActivationType activation_type, int gemm_n, int gemm_k) const {
   constexpr bool ENABLE_FUSED_GATED_ACTIVATION = true;
-  return is_gated_activation && std::is_same_v<T, WeightType> && !std::is_same_v<T, float> &&
-         !use_fp8 && (this->getSM() >= 80) && (gemm_k % 64 == 0) && (gemm_n % 64 == 0) &&
-         ENABLE_FUSED_GATED_ACTIVATION;
+  return (activation_type == ActivationType::Swiglu || activation_type == ActivationType::Geglu) && std::is_same_v<T, WeightType> && !std::is_same_v<T, float> && !use_fp8 && (this->getSM() >= 80) && (gemm_k % 64 == 0) && (gemm_n % 64 == 0) && ENABLE_FUSED_GATED_ACTIVATION;
 }
 
 template <typename T, typename WeightType, typename OutputType, typename ScaleBiasType>
 bool MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>::isFusedGatedActivation(
-    cutlass_extensions::CutlassGemmConfig gemm_config, bool is_gated_activation, int gemm_n,
+    cutlass_extensions::CutlassGemmConfig gemm_config, ActivationType activation_type, int gemm_n,
     int gemm_k) const {
-  return supportsFusedGatedActivation(is_gated_activation, gemm_n, gemm_k) &&
+  return supportsFusedGatedActivation(activation_type, gemm_n, gemm_k) &&
          !gemm_config.is_tma_warp_specialized;
 }
 
@@ -673,22 +672,37 @@ void MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>::dispatchToArch(
                        "Hopper configuration provided for non-Hopper architecture");
 
   if (sm_ >= 75 && sm_ < 80) {
-    dispatchMoeGemmToCutlass<T, WeightType, ScaleBiasType, cutlass::arch::Sm75, EpilogueTag>(
-        inputs, multi_processor_count_);
+    if constexpr (!std::is_same_v<WeightType, __nv_fp4_e2m1>)
+    {
+        dispatchMoeGemmToCutlass<T, WeightType, ScaleBiasType, cutlass::arch::Sm75, EpilogueTag>(
+            inputs, multi_processor_count_);
+    }
+    else
+    {
+        TLLM_THROW("FP4 data type is not supported on SM < 90");
+    }
   } else if (sm_ >= 80 && sm_ < 90) {
-    if constexpr (use_fp8 || use_w4afp8) {
+    if constexpr (!std::is_same_v<WeightType, __nv_fp4_e2m1>)
+    {
+        if constexpr (use_fp8 || use_w4afp8)
+        {
+
 #if defined(ENABLE_FP8)
-      static_assert(
-          !std::is_same_v<OutputType, __nv_fp8_e4m3> && !std::is_same_v<OutputType, __nv_fp8_e5m2>,
-          "FP8 GEMM Output not supported");
+        static_assert(
+            !std::is_same_v<OutputType, __nv_fp8_e4m3> && !std::is_same_v<OutputType, __nv_fp8_e5m2>,
+            "FP8 GEMM Output not supported");
 #endif
-
       TLLM_CHECK_WITH_INFO(sm_ == 89, "For sm >= 80 and < 90, fp8 is only supported with sm == 89");
       dispatchMoeGemmToCutlass<T, WeightType, ScaleBiasType, cutlass::arch::Sm89, EpilogueTag>(
           inputs, multi_processor_count_);
-    } else {
+      }
+      else
+      {
       dispatchMoeGemmToCutlass<T, WeightType, ScaleBiasType, cutlass::arch::Sm80, EpilogueTag>(
           inputs, multi_processor_count_);
+      }
+    } else {
+      TLLM_THROW("FP4 data type is not supported on SM < 90");
     }
   } else if (sm_ >= 90) {
     // For SM120+ FP8 MoE, redirect to SM89 (Ada) FP8 kernel implementations.
@@ -702,7 +716,7 @@ void MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>::dispatchToArch(
 
     if constexpr (kernels::cutlass_kernels::isValidTmaWarpSpecializedMOESpecialisation<
                       T, WeightType, EpilogueTag>() &&
-                  !use_w4afp8) {
+                  !use_w4_groupwise) {
       // We allow both tma warp specialized and SM80 configurations to coexist because for some
       // cases with small numbers of tokens SM80 is faster. We check here to see which is selected
       if (inputs.gemm_config.sm_version >= 90) {
@@ -744,25 +758,40 @@ void MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>::dispatchToArch(
 #if defined(ENABLE_FP8)
     // Hopper finegrained INT4 WS grouped GEMM
     if constexpr (use_w4afp8) {
-      if (inputs.gemm_config.is_tma_warp_specialized) {
-        // EpilogueTag is ignored
-        if (inputs.k % 512 == 0) {
-          sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass<T, WeightType, ScaleBiasType,
-                                                        cutlass_extensions::EpilogueOpDefault, 4>(
-              inputs, hopper_inputs, multi_processor_count_, nullptr);
-        } else if (inputs.k % 256 == 0) {
-          sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass<T, WeightType, ScaleBiasType,
-                                                        cutlass_extensions::EpilogueOpDefault, 2>(
-              inputs, hopper_inputs, multi_processor_count_, nullptr);
-        } else if (inputs.k % 128 == 0) {
-          sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass<T, WeightType, ScaleBiasType,
-                                                        cutlass_extensions::EpilogueOpDefault, 1>(
-              inputs, hopper_inputs, multi_processor_count_, nullptr);
-        } else {
-          TLLM_THROW("Invalid GEMM K size %d", (int)inputs.k);
-        }
-        return;
-      };
+      TLLM_CHECK_WITH_INFO(
+        inputs.gemm_config.is_tma_warp_specialized, "w4afp8 is only supported for TMA warp specialization");
+    // EpilogueTag is ignored
+    if (inputs.k % 512 == 0)
+    {
+    sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass<T, WeightType, ScaleBiasType,
+    cutlass_extensions::EpilogueOpDefault, 4>(inputs, hopper_inputs, multi_processor_count_, nullptr);
+}
+else if (inputs.k % 256 == 0)
+{
+sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass<T, WeightType, ScaleBiasType,
+    cutlass_extensions::EpilogueOpDefault, 2>(inputs, hopper_inputs, multi_processor_count_, nullptr);
+}
+else if (inputs.k % 128 == 0)
+{
+sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass<T, WeightType, ScaleBiasType,
+    cutlass_extensions::EpilogueOpDefault, 1>(inputs, hopper_inputs, multi_processor_count_, nullptr);
+}
+else
+{
+TLLM_THROW("Invalid GEMM K size %d", (int) inputs.k);
+}
+return;
+}
+
+if constexpr (use_wfp4a16)
+{
+TLLM_CHECK_WITH_INFO(
+inputs.gemm_config.is_tma_warp_specialized, "wfp4a16 is only supported for TMA warp specialization");
+// EpilogueTag is ignored
+sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass<T, WeightType, ScaleBiasType,
+cutlass_extensions::EpilogueOpDefault, 1>(inputs, hopper_inputs, multi_processor_count_, nullptr);
+return;
+
     }
 #endif
 
@@ -810,7 +839,7 @@ size_t MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>::getMaxWorkspaceS
 template <typename T, typename WeightType, typename OutputType, typename ScaleBiasType>
 size_t MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>::calcMaxWorkspaceSize(
     int num_experts) const {
-  if constexpr (use_w4afp8) {
+  if constexpr (use_w4_groupwise) {
     return calcMaxWorkspaceSizeTmaWarpSpecializedMixedInput<T, WeightType, OutputType>(
         num_experts, multi_processor_count_);
   }
@@ -819,7 +848,7 @@ size_t MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>::calcMaxWorkspace
   }
   if constexpr (kernels::cutlass_kernels::isValidTmaWarpSpecializedMOESpecialisation<
                     T, WeightType>() &&
-                !use_w4afp8) {
+                !use_w4_groupwise) {  // groupwise W4 paths already returned above
     auto configs = getTmaWarpSpecializedConfigs(sm_);
     auto fpX_block_scaling_type = TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType::NONE;
     if constexpr (use_wfp4afp4) {
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws_mixed_dtype.h b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws_mixed_dtype.h
index 21be2eba3..722a49292 100644
--- a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws_mixed_dtype.h
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws_mixed_dtype.h
@@ -159,9 +159,12 @@ void sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass(
   // We also only instantiate configs here where threadblockShapeM == warpShapeM since those usually
   // perform the best for mixed type gemms.
 
-  constexpr int Ktile = 128 * PackedScalesNum / sizeof(T);
-  TLLM_CHECK(sizeof(T) == 1);
+  constexpr int Ntile = (std::is_same_v<WeightType, __nv_fp4_e2m1>) ? 64 : 128;
+  constexpr int Ktile = (std::is_same_v<WeightType, __nv_fp4_e2m1>) ? 128 : 128 * PackedScalesNum / sizeof(T);
+  TLLM_CHECK(sizeof(T) == ((std::is_same_v<WeightType, __nv_fp4_e2m1>) ? 2 : 1));  // parenthesized: == binds tighter than ?:
 
+
+  using _Ntile = Int<Ntile>;
   using _Ktile = Int<Ktile>;
   switch (inputs.gemm_config.tile_config_sm90) {
     case tkc::CutlassTileConfigSM90::CtaShape64x16x128B:
@@ -181,7 +184,7 @@ void sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass(
       break;
     case tkc::CutlassTileConfigSM90::CtaShape64x128x128B:
       sm90_dispatch_moe_mixed_dtype_gemm_config<T, WeightType, GemmOutputType, EpilogueTag,
-                                                Shape<_64, _128, _Ktile>>(
+                                                Shape<_64, _Ntile, _Ktile>>(
           inputs, hopper_inputs, sm_count_, workspace_size);
       break;
     // case tkc::CutlassTileConfigSM90::CtaShape64x256x128B:
@@ -240,12 +243,15 @@ void sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass(
 template <typename T, typename WeightType, typename OutputType>
 size_t calcMaxWorkspaceSizeTmaWarpSpecializedMixedInput(int num_experts, int sm_count_) {
   size_t count = 0;
+  constexpr int Ktile = (std::is_same_v<WeightType, __nv_fp4_e2m1>) ? 256 : 512;
+  using _Ktile = Int<Ktile>;
+
 #ifdef COMPILE_HOPPER_TMA_GROUPED_GEMMS
   GroupedGemmInput<T, WeightType, OutputType, OutputType> inputs{};
   inputs.num_experts = num_experts;
   sm90_generic_mixed_moe_gemm_kernelLauncher<
       T, WeightType, OutputType, tensorrt_llm::cutlass_extensions::EpilogueOpDefault,
-      Shape<_128, _64, _512>, Shape<_1, _1, _1>, cutlass::gemm::KernelTmaWarpSpecializedCooperative,
+      Shape<_128, _64, _Ktile>, Shape<_1, _1, _1>, cutlass::gemm::KernelTmaWarpSpecializedCooperative,
       cutlass::epilogue::TmaWarpSpecializedCooperative,
       cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY>(
       inputs, TmaWarpSpecializedGroupedGemmInput{}, sm_count_, &count);
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_tma_warp_specialized_traits.h b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_tma_warp_specialized_traits.h
index 890ebdd5b..76e6659c4 100644
--- a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_tma_warp_specialized_traits.h
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_tma_warp_specialized_traits.h
@@ -67,7 +67,10 @@ constexpr bool isValidHopperMOESpecialisation() {
 #if defined(CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_SUPPORTED)
   return (cutlass::platform::is_same<T, WeightType>::value ||
           (cutlass::platform::is_same<cutlass::uint4b_t, WeightType>::value &&
-           cutlass::platform::is_same<T, __nv_fp8_e4m3>::value))
+            cutlass::platform::is_same<T, __nv_fp8_e4m3>::value)
+            || (cutlass::platform::is_same<__nv_fp4_e2m1, WeightType>::value
+                && !cutlass::platform::is_same<T, __nv_fp8_e4m3>::value))
+
 #ifdef ENABLE_FP4
          && !cutlass::platform::is_same<T, __nv_fp4_e2m1>::value
 #endif
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/quantization.cuh b/csrc/nv_internal/tensorrt_llm/kernels/quantization.cuh
index c1025e1e9..f4bc1d39f 100644
--- a/csrc/nv_internal/tensorrt_llm/kernels/quantization.cuh
+++ b/csrc/nv_internal/tensorrt_llm/kernels/quantization.cuh
@@ -253,10 +253,11 @@ __global__ void perTokenQuantization(QuantT* dst, T const* src, int64_t const nu
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
-// FP4 Quantization
+// FP4/MXFP8 Quantization
 
 constexpr int CVT_FP4_ELTS_PER_THREAD = 8;
 constexpr int CVT_FP4_SF_VEC_SIZE = 16;
+constexpr int CVT_ELTS_PER_THREAD = 8;
 constexpr int CVT_FP4_THREADS_PER_WARP = 32;
 constexpr int CVT_FP8_TO_FP4_ELTS_PER_THREAD = 16;
 
@@ -352,6 +353,25 @@ inline __device__ uint64_t fp32_vec_to_e2m1(float2 (&array)[8]) {
 #endif
 }
 
+// Convert 4 float2 values into 8 e4m3 values (represented as one uint64_t).
+// Each float2 becomes one __nv_fp8x2_e4m3 pair; the union hands the four pairs back as a single 64-bit word.
+inline __device__ uint64_t fp32_vec_to_e4m3(float2 (&array)[4])
+{
+    union
+    {
+        uint64_t val;
+        __nv_fp8x2_e4m3 elts[4];
+    } u;
+
+    static_assert(sizeof(u.val) == sizeof(u.elts), "Expected to alias uint64_t and __nv_fp8x2_e4m3[4]");
+
+    u.elts[0] = __nv_fp8x2_e4m3(array[0]);
+    u.elts[1] = __nv_fp8x2_e4m3(array[1]);
+    u.elts[2] = __nv_fp8x2_e4m3(array[2]);
+    u.elts[3] = __nv_fp8x2_e4m3(array[3]);
+    return u.val;
+}
+
 // Fast reciprocal.
 inline __device__ float reciprocal_approximate_ftz(float a) {
   float b;
@@ -359,11 +379,18 @@ inline __device__ float reciprocal_approximate_ftz(float a) {
   return b;
 }
 
+// Returns 2^(127 - exp) for a biased exponent byte; exp == 0 yields 1. NOTE(review): confirm the exp == 0 case is intended.
+__device__ __forceinline__ float exp2f_rcp(uint8_t exp)
+{
+    constexpr uint32_t FP32_EXPONENT_BIAS = 127;
+    return (exp == 0) ? 1 : exp2f(FP32_EXPONENT_BIAS - static_cast<float>(exp));
+}
+
 // Define a 16 bytes packed data type.
 template <class Type>
 struct PackedVec {
   typename TypeConverter<Type>::Type elts[4];
-  static_assert(sizeof(elts) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
+  static_assert(sizeof(elts) == sizeof(Type) * CVT_ELTS_PER_THREAD,
                 "Vector size should match the number of elements per thread.");
 };
 
@@ -374,86 +401,6 @@ struct PackedVec<__nv_fp8_e4m3> {
                 "Vector size should match the number of elements per thread.");
 };
 
-// Convert 4 float2 values into 8 e4m3 values (represented as one uint64_t).
-inline __device__ uint64_t fp32_vec_to_e4m3(float2 (&array)[4]) {
-  union {
-    uint64_t val;
-    __nv_fp8x2_e4m3 elts[4];
-  } u;
-
-  static_assert(sizeof(u.val) == sizeof(u.elts),
-                "Expected to alias uint64_t and __nv_fp8x2_e4m3[4]");
-
-  u.elts[0] = __nv_fp8x2_e4m3(array[0]);
-  u.elts[1] = __nv_fp8x2_e4m3(array[1]);
-  u.elts[2] = __nv_fp8x2_e4m3(array[2]);
-  u.elts[3] = __nv_fp8x2_e4m3(array[3]);
-  return u.val;
-}
-
-// Quantizes the provided PackedVec into the uint64_t output
-template <class Type, int SF_VEC_SIZE>
-__device__ uint64_t cvt_warp_fp16_to_mxfp8(PackedVec<Type>& vec, uint8_t* SFout) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-  // Get absolute maximum values among the local 8 values.
-  auto localMax = cuda_abs(vec.elts[0]);
-
-// Local maximum value.
-#pragma unroll
-  for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
-    localMax = cuda_max(localMax, cuda_abs(vec.elts[i]));
-  }
-
-  constexpr int CVT_NUM_THREADS_PER_SF = SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD;
-  // Get the absolute maximum among all 16 values (two threads for 16, four threads for 32).
-  localMax = cuda_max(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax);
-  if constexpr (CVT_NUM_THREADS_PER_SF == 4) {
-    localMax = cuda_max(__shfl_xor_sync(uint32_t(-1), localMax, 2), localMax);
-  }
-  // Get the final absolute maximum values.
-  float vecMax = float(cuda_max(localMax.x, localMax.y));
-
-  // Get the SF (max value of the vector / max value of mxfp8).
-  float SFValue = vecMax * reciprocal_approximate_ftz(448.0f);
-  // 8 bits representation of the SF.
-  uint8_t fp8SFVal;
-  // Write the SF to global memory (STG.8).
-  __nv_fp8_e8m0 tmpSFVal;
-  tmpSFVal.__x = __nv_cvt_float_to_e8m0(SFValue, __NV_SATFINITE, cudaRoundPosInf);
-  float SFValueNarrow = static_cast<float>(tmpSFVal);
-  fp8SFVal = tmpSFVal.__x;
-  // Get the output scale (reciprocal of the SFValue).
-  float outputScale = SFValue != 0.f ? reciprocal_approximate_ftz(SFValueNarrow) : 0.0f;
-
-  if (SFout) {
-    // Write the SF to global memory (STG.8).
-    *SFout = fp8SFVal;
-  }
-
-  // Convert the input to float.
-  float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2];
-
-#pragma unroll
-  for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
-    if constexpr (std::is_same_v<Type, half>) {
-      fp2Vals[i] = __half22float2(vec.elts[i]);
-    } else {
-      fp2Vals[i] = __bfloat1622float2(vec.elts[i]);
-    }
-    fp2Vals[i].x *= outputScale;
-    fp2Vals[i].y *= outputScale;
-  }
-
-  // Convert to e4m3 values.
-  uint64_t e4m3Vec = fp32_vec_to_e4m3(fp2Vals);
-
-  // Write the e4m3 values to global memory.
-  return e4m3Vec;
-#else
-  return 0;
-#endif
-}
-
 // Quantizes the provided PackedVec into the uint32_t output
 template <class Type, int SF_VEC_SIZE, bool UE8M0_SF>
 __device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal, uint8_t* SFout) {
@@ -463,11 +410,11 @@ __device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
 
 // Local maximum value.
 #pragma unroll
-  for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
+  for (int i = 1; i < CVT_ELTS_PER_THREAD  / 2; i++) {
     localMax = cuda_max(localMax, cuda_abs(vec.elts[i]));
   }
 
-  constexpr int CVT_NUM_THREADS_PER_SF = SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD;
+  constexpr int CVT_NUM_THREADS_PER_SF = SF_VEC_SIZE / CVT_ELTS_PER_THREAD ;
   // Get the absolute maximum among all 16 values (two threads for 16, four threads for 32).
   localMax = cuda_max(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax);
   if constexpr (CVT_NUM_THREADS_PER_SF == 4) {
@@ -476,31 +423,34 @@ __device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
   // Get the final absolute maximum values.
   float vecMax = float(cuda_max(localMax.x, localMax.y));
 
-  // Get the SF (max value of the vector / max value of e2m1).
-  // maximum value of e2m1 = 6.0.
-  // TODO: use half as compute data type.
-  float SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f));
-  float SFValueNarrow;
   // 8 bits representation of the SF.
   uint8_t fp8SFVal;
+  float outputScale;
   // Write the SF to global memory (STG.8).
   if constexpr (UE8M0_SF) {
     __nv_fp8_e8m0 tmp;
-    tmp.__x = __nv_cvt_float_to_e8m0(SFValue, __NV_SATFINITE, cudaRoundPosInf);
-    SFValueNarrow = static_cast<float>(tmp);
+        // Scale the max value to the range of E2m1.
+        vecMax *= reciprocal_approximate_ftz(6.0f);
+        tmp.__x = __nv_cvt_float_to_e8m0(vecMax, __NV_SATFINITE, cudaRoundPosInf);
+
     fp8SFVal = tmp.__x;
+    outputScale = vecMax != 0 ? exp2f_rcp(fp8SFVal) : 0.0f;
   } else {
+            // Get the SF (max value of the vector / max value of e2m1).
+        // maximum value of e2m1 = 6.0.
+        // TODO: use half as compute data type.
+        auto SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f));
+
     // Here SFValue is always positive, so E4M3 is the same as UE4M3.
     __nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue);
     fp8SFVal = tmp.__x;
-    SFValueNarrow = static_cast<float>(tmp);
+    SFValue = static_cast<float>(tmp);
+    // Get the output scale.
+    // Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal)) * reciprocal(SFScaleVal))
+    outputScale = vecMax != 0 ? reciprocal_approximate_ftz(SFValue * reciprocal_approximate_ftz(SFScaleVal)) : 0.0f;
+
   }
-  // Get the output scale.
-  // Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) * reciprocal(SFScaleVal))
-  float outputScale =
-      SFValue != 0
-          ? reciprocal_approximate_ftz(SFValueNarrow * reciprocal_approximate_ftz(SFScaleVal))
-          : 0.0f;
+
 
   if (SFout) {
     // Write the SF to global memory (STG.8).
@@ -508,10 +458,10 @@ __device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
   }
 
   // Convert the input to float.
-  float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2];
+  float2 fp2Vals[CVT_ELTS_PER_THREAD / 2];
 
 #pragma unroll
-  for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
+  for (int i = 0; i < CVT_ELTS_PER_THREAD / 2; i++) {
     if constexpr (std::is_same_v<Type, half>) {
       fp2Vals[i] = __half22float2(vec.elts[i]);
     } else {
@@ -568,23 +518,24 @@ __device__ uint64_t cvt_warp_fp8_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
   // maximum value of e2m1 = 6.0.
   // TODO: use half as compute data type.
   float SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f));
+  float SFValueNarrow;
   // 8 bits representation of the SF.
   uint8_t fp8SFVal;
   // Write the SF to global memory (STG.8).
   if constexpr (UE8M0_SF) {
     __nv_fp8_e8m0 tmp;
     tmp.__x = __nv_cvt_float_to_e8m0(SFValue, __NV_SATFINITE, cudaRoundPosInf);
-    SFValue = static_cast<float>(tmp);
+    SFValueNarrow = static_cast<float>(tmp);
     fp8SFVal = tmp.__x;
   } else {
     // Here SFValue is always positive, so E4M3 is the same as UE4M3.
     __nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue);
     fp8SFVal = tmp.__x;
-    SFValue = static_cast<float>(tmp);
+    SFValueNarrow = static_cast<float>(tmp);
   }
   // Get the output scale.
   // Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) * reciprocal(SFScaleVal))
-  float outputScale = SFValue != 0 ? SFScaleVal * reciprocal_approximate_ftz(SFValue) : 0.0f;
+  float outputScale = SFValue != 0 ? SFScaleVal * reciprocal_approximate_ftz(SFValueNarrow) : 0.0f;
 
   if (SFout) {
     // Write the SF to global memory (STG.8).
@@ -611,10 +562,80 @@ __device__ uint64_t cvt_warp_fp8_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
 #endif
 }
 
-template <int SF_VEC_SIZE>
+// Quantizes the provided PackedVec into the uint64_t output
+template <class Type, int SF_VEC_SIZE>
+__device__ uint64_t cvt_warp_fp16_to_mxfp8(PackedVec<Type>& vec, uint8_t* SFout)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+    // Get absolute maximum values among the local 8 values.
+    auto localMax = cuda_abs(vec.elts[0]);
+
+// Local maximum value.
+#pragma unroll
+    for (int i = 1; i < CVT_ELTS_PER_THREAD / 2; i++)
+    {
+        localMax = cuda_max(localMax, cuda_abs(vec.elts[i]));
+    }
+
+    constexpr int CVT_NUM_THREADS_PER_SF = SF_VEC_SIZE / CVT_ELTS_PER_THREAD;
+    // Reduce the maximum across the threads sharing one SF (two threads for 16 values, four for 32).
+    localMax = cuda_max(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax);
+    if constexpr (CVT_NUM_THREADS_PER_SF == 4)
+    {
+        localMax = cuda_max(__shfl_xor_sync(uint32_t(-1), localMax, 2), localMax);
+    }
+    // Get the final absolute maximum values.
+    float vecMax = float(cuda_max(localMax.x, localMax.y));
+
+    // Get the SF (max value of the vector / max value of mxfp8).
+    float SFValue = vecMax * reciprocal_approximate_ftz(448.0f);
+    // 8 bits representation of the SF.
+    uint8_t fp8SFVal;
+    // Round the SF up (towards +inf) to its E8M0 encoding; the store to global memory happens below.
+    __nv_fp8_e8m0 tmpSFVal;
+    tmpSFVal.__x = __nv_cvt_float_to_e8m0(SFValue, __NV_SATFINITE, cudaRoundPosInf);
+    SFValue = static_cast<float>(tmpSFVal);
+    fp8SFVal = tmpSFVal.__x;
+    // Get the output scale (reciprocal of the SFValue).
+    float outputScale = vecMax != 0.f ? reciprocal_approximate_ftz(SFValue) : 0.0f;
+
+    if (SFout)
+    {
+        // Write the SF to global memory (STG.8).
+        *SFout = fp8SFVal;
+    }
+
+    // Convert the input to float.
+    float2 fp2Vals[CVT_ELTS_PER_THREAD / 2];
+
+#pragma unroll
+    for (int i = 0; i < CVT_ELTS_PER_THREAD / 2; i++)
+    {
+        if constexpr (std::is_same_v<Type, half>)
+        {
+            fp2Vals[i] = __half22float2(vec.elts[i]);
+        }
+        else
+        {
+            fp2Vals[i] = __bfloat1622float2(vec.elts[i]);
+        }
+        fp2Vals[i].x *= outputScale;
+        fp2Vals[i].y *= outputScale;
+    }
+
+    // Convert to e4m3 values.
+    uint64_t e4m3Vec = fp32_vec_to_e4m3(fp2Vals);
+
+    // Return the packed e4m3 values; the caller stores them to global memory.
+    return e4m3Vec;
+#else
+    return 0;
+#endif
+}
+
 inline __device__ __host__ int64_t get_sf_out_offset_128x4(std::optional<int> batchIdx, int mIdx,
                                                            int kIdx, std::optional<int> numRows,
-                                                           int numCols) {
+                                                           int numColVecs) {
   // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)]
   // --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx]
 
@@ -635,9 +656,10 @@ inline __device__ __host__ int64_t get_sf_out_offset_128x4(std::optional<int> ba
   int32_t kTileIdx = (kIdx / 4);
   int64_t kTileStride = 32 * outerMStride;  // 512
 
-  // SF vector size 16. We round the "numCols" up to a multiple of 64.
-  int factor = SF_VEC_SIZE * 4;
-  int32_t numKTiles = (numCols + factor - 1) / factor;
+    // SF vector size 16 or 32. We round the "numCols" up to a multiple of 64 or 128.
+    // It is the same as rounding the "numColVecs" up to a multiple of 4.
+    int32_t numKTiles = (numColVecs + 4 - 1) / 4;
+
   int32_t mTileIdx = mIdx / (32 * 4);
   int64_t mTileStride = numKTiles * kTileStride;
 
@@ -653,70 +675,31 @@ inline __device__ __host__ int64_t get_sf_out_offset_128x4(std::optional<int> ba
   return SFOffset;
 }
 
-template <int SF_VEC_SIZE>
-inline __device__ __host__ int64_t get_sf_out_offset_8x4(std::optional<int> batchIdx, int mIdx,
-                                                         int kIdx, std::optional<int> numRows,
-                                                         int numCols) {
-  // SF layout [numMTiles, numKTiles, 8 (mTile), 4(kTile)]
-  // --> index [mTileIdx, kTileIdx, innerMIdx, innerKIdx]
-
-  // batched tensor
-  // SF layout [numBTiles, numMTiles, numKTiles, 8 (mTile), 4(kTile)]
-  // --> index [bTileIdx, mTileIdx, kTileIdx, innerMIdx, innerKIdx]
-  const int32_t mTile = 8;
-  int32_t innerKIdx = (kIdx % 4);
-  int64_t innerKStride = 1;
-
-  int32_t innerMIdx = (mIdx % mTile);
-  int64_t mStride = 4 * innerKStride;
-
-  int32_t kTileIdx = (kIdx / 4);
-  int64_t kTileStride = mTile * mStride;
-
-  int factor = SF_VEC_SIZE * 4;
-  int32_t numKTiles = (numCols + factor - 1) / factor;
-  int32_t mTileIdx = mIdx / mTile;
-  int64_t mTileStride = numKTiles * kTileStride;
-
-  int32_t numMTiles = (numRows.value_or(0) + 8 - 1) / 8;
-  int64_t bTileStride = numMTiles * mTileStride;
-
-  int64_t SFOffset = batchIdx.value_or(0) * bTileStride + mTileIdx * mTileStride +
-                     kTileIdx * kTileStride + innerMIdx * mStride + innerKIdx * innerKStride;
-
-  return SFOffset;
-}
-
-template <class SFType, int CVT_FP4_NUM_THREADS_PER_SF, int SF_VEC_SIZE>
-__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(std::optional<int> batchIdx, int rowIdx,
-                                                       int colIdx, std::optional<int> numRows,
-                                                       int numCols, SFType* SFout,
-                                                       FP4QuantizationSFLayout layout) {
+template <class SFType, int CVT_NUM_THREADS_PER_SF>
+__device__ uint8_t* cvt_quant_get_sf_out_offset(std::optional<int> batchIdx, int rowIdx,
+                                                       int colVecIdx, std::optional<int> numRows,
+                                                       int numColVecs, SFType* SFout,
+                                                       QuantizationSFLayout layout) {
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-  static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 || CVT_FP4_NUM_THREADS_PER_SF == 2 ||
-                CVT_FP4_NUM_THREADS_PER_SF == 4);
+static_assert(CVT_NUM_THREADS_PER_SF == 1 || CVT_NUM_THREADS_PER_SF == 2 || CVT_NUM_THREADS_PER_SF == 4);
 
   // One pair of threads write one SF to global memory.
   // TODO: stage through smem for packed STG.32
   // is it better than STG.8 from 4 threads ?
-  if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) {
-    if (layout == FP4QuantizationSFLayout::SWIZZLED_128x4 ||
-        layout == FP4QuantizationSFLayout::SWIZZLED_8x4) {
+  if (threadIdx.x % CVT_NUM_THREADS_PER_SF  == 0) {
+    if (layout == QuantizationSFLayout::SWIZZLED) {
       // SF vector index (16 elements share one SF in the K dimension).
       // numRows and numCols are unpadded.
-      int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
+      int32_t kIdx = colVecIdx / CVT_NUM_THREADS_PER_SF;
       int32_t mIdx = rowIdx;
 
-      auto SFOffset =
-          layout == FP4QuantizationSFLayout::SWIZZLED_128x4
-              ? get_sf_out_offset_128x4<SF_VEC_SIZE>(batchIdx, mIdx, kIdx, numRows, numCols)
-              : get_sf_out_offset_8x4<SF_VEC_SIZE>(batchIdx, mIdx, kIdx, numRows, numCols);
+      auto SFOffset = get_sf_out_offset_128x4(batchIdx, mIdx, kIdx, numRows, numColVecs);
       return reinterpret_cast<uint8_t*>(SFout) + SFOffset;
-    } else if (layout == FP4QuantizationSFLayout::LINEAR) {
+    } else if (layout == QuantizationSFLayout::LINEAR) {
       // Linear row-major layout, no padding required.
-      int32_t KTileIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
+      int32_t KTileIdx = colVecIdx / CVT_NUM_THREADS_PER_SF;
 
-      int32_t numKTiles = numCols / SF_VEC_SIZE;
+      int32_t numKTiles = numColVecs;
       int64_t mTileStride = numKTiles;
 
       int64_t BTileStride = numRows.value_or(0) * mTileStride;
@@ -731,278 +714,118 @@ __device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(std::optional<int> batchI
   return nullptr;
 }
 
-// Use UE4M3 by default.
-template <class Type, int SF_VEC_SIZE, bool UE8M0_SF>
-__global__ void
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-__launch_bounds__(512, 4) cvt_fp16_to_fp4_3d(
-#else
-cvt_fp16_to_fp4_3d(
-#endif
-    int32_t numbatches, int32_t numRows, int32_t numCols, Type const* in, float const* SFScale,
-    uint32_t* out, uint32_t* SFout, FP4QuantizationSFLayout layout) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-
-  using PackedVec = PackedVec<Type>;
-  static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
-      SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD;  // 2 or 4
-  static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
-                "Vec size is not matched.");
-
-  // Get the global scaling factor, which will be applied to the SF.
-  // Note SFScale is the same as next GEMM's alpha, which is (448.f / (Alpha_A / 6.f)).
-  float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0];
-
-  asm volatile("griddepcontrol.wait;");
-  // Input tensor batch/row/col loops.
-  for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
-    for (int batchIdx = 0; batchIdx < numbatches; batchIdx++) {
-      for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP4_ELTS_PER_THREAD;
-           colIdx += blockDim.x) {
-        int64_t inOffset = batchIdx * numRows * (numCols / CVT_FP4_ELTS_PER_THREAD) +
-                           rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
-        PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
-        // Get the output tensor offset.
-        // Same as inOffset because 8 elements are packed into one uint32_t.
-        int64_t outOffset = inOffset;
-        auto& out_pos = out[outOffset];
-
-        std::optional<int> optionalBatchIdx = batchIdx;
-        std::optional<int> optionalNumRows = numRows;
-
-        auto sf_out =
-            cvt_quant_to_fp4_get_sf_out_offset<uint32_t, CVT_FP4_NUM_THREADS_PER_SF, SF_VEC_SIZE>(
-                optionalBatchIdx, rowIdx, colIdx, optionalNumRows, numCols, SFout, layout);
-
-        out_pos = cvt_warp_fp16_to_fp4<Type, SF_VEC_SIZE, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
-      }
-    }
-  }
-  asm volatile("griddepcontrol.launch_dependents;");
-#endif
-}
-
-// Use UE4M3 by default.
-template <int SF_VEC_SIZE, bool UE8M0_SF>
-__global__ void
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-__launch_bounds__(512, 4) cvt_fp8_to_fp4_3d(
-#else
-cvt_fp8_to_fp4_3d(
-#endif
-    int32_t numbatches, int32_t numRows, int32_t numCols, __nv_fp8_e4m3 const* in,
-    float const* SFScale, uint32_t* out, uint32_t* SFout, FP4QuantizationSFLayout layout) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-  using PackedVec = PackedVec<__nv_fp8_e4m3>;
-  static constexpr int CVT_FP4_NUM_THREADS_PER_SF = SF_VEC_SIZE / CVT_FP8_TO_FP4_ELTS_PER_THREAD;
-  static_assert(sizeof(PackedVec) == sizeof(__nv_fp8_e4m3) * CVT_FP8_TO_FP4_ELTS_PER_THREAD,
-                "Vec size is not matched.");
-
-  // Get the global scaling factor, which will be applied to the SF.
-  // Note SFScale is the same as next GEMM's alpha, which is (448.f / (Alpha_A / 6.f)).
-  float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0];
-
-  // Input tensor batch/row/col loops.
-  for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
-    for (int batchIdx = 0; batchIdx < numbatches; batchIdx++) {
-      for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP8_TO_FP4_ELTS_PER_THREAD;
-           colIdx += blockDim.x) {
-        int64_t inOffset = batchIdx * numRows * (numCols / CVT_FP4_ELTS_PER_THREAD) +
-                           rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
-        PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
-        // Get the output tensor offset.
-        // Same as inOffset because 16 elements are packed into one uint64_t.
-        int64_t outOffset = inOffset;
-        auto& out_pos = out[outOffset];
-
-        std::optional<int> optionalBatchIdx = batchIdx;
-        std::optional<int> optionalNumRows = numRows;
-
-        auto sf_out =
-            cvt_quant_to_fp4_get_sf_out_offset<uint32_t, CVT_FP4_NUM_THREADS_PER_SF, SF_VEC_SIZE>(
-                optionalBatchIdx, rowIdx, colIdx, optionalNumRows, numCols, SFout, layout);
-
-        out_pos =
-            cvt_warp_fp8_to_fp4<__nv_fp8_e4m3, SF_VEC_SIZE, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
-      }
-    }
-  }
-#endif
-}
-
-// Use UE4M3 by default.
-template <class Type, int SF_VEC_SIZE, bool UE8M0_SF>
-__global__ void
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-__launch_bounds__(512, 4) cvt_fp16_to_fp4(
-#else
-cvt_fp16_to_fp4(
-#endif
-    int32_t numRows, int32_t numCols, Type const* in, float const* SFScale, uint32_t* out,
-    uint32_t* SFout, FP4QuantizationSFLayout layout) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-  using PackedVec = PackedVec<Type>;
-  static constexpr int CVT_FP4_NUM_THREADS_PER_SF = SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD;
-  static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
-                "Vec size is not matched.");
-
-  // Get the global scaling factor, which will be applied to the SF.
-  // Note SFScale is the same as next GEMM's alpha, which is (448.f / (Alpha_A / 6.f)).
-  float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0];
-
-  asm volatile("griddepcontrol.wait;");
-  // Input tensor row/col loops.
-  for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
-    for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP4_ELTS_PER_THREAD;
-         colIdx += blockDim.x) {
-      int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
-      PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
-      // Get the output tensor offset.
-      // Same as inOffset because 8 elements are packed into one uint32_t.
-      int64_t outOffset = inOffset;
-      auto& out_pos = out[outOffset];
-
-      auto sf_out =
-          cvt_quant_to_fp4_get_sf_out_offset<uint32_t, CVT_FP4_NUM_THREADS_PER_SF, SF_VEC_SIZE>(
-              std::nullopt /* batchIdx */, rowIdx, colIdx, std::nullopt /* numRows */, numCols,
-              SFout, layout);
-
-      out_pos = cvt_warp_fp16_to_fp4<Type, SF_VEC_SIZE, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
-    }
-  }
-  asm volatile("griddepcontrol.launch_dependents;");
-#endif
-}
-
-// Use UE4M3 by default.
-template <int SF_VEC_SIZE, bool UE8M0_SF>
-__global__ void
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-__launch_bounds__(512, 4) cvt_fp8_to_fp4(
-#else
-cvt_fp8_to_fp4(
-#endif
-    int32_t numRows, int32_t numCols, __nv_fp8_e4m3 const* in, float const* SFScale, uint64_t* out,
-    uint32_t* SFout, FP4QuantizationSFLayout layout) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-  using PackedVec = PackedVec<__nv_fp8_e4m3>;
-  static constexpr int CVT_FP4_NUM_THREADS_PER_SF = SF_VEC_SIZE / CVT_FP8_TO_FP4_ELTS_PER_THREAD;
-  static_assert(sizeof(PackedVec) == sizeof(__nv_fp8_e4m3) * CVT_FP8_TO_FP4_ELTS_PER_THREAD,
-                "Vec size is not matched.");
-
-  // Get the global scaling factor, which will be applied to the SF.
-  // Note SFScale is the same as next GEMM's alpha, which is (448.f / (Alpha_A / 6.f)).
-  float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0];
-
-  // Input tensor row/col loops.
-  for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
-    for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP8_TO_FP4_ELTS_PER_THREAD;
-         colIdx += blockDim.x) {
-      int64_t inOffset = rowIdx * (numCols / CVT_FP8_TO_FP4_ELTS_PER_THREAD) + colIdx;
-      PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
-      // Get the output tensor offset.
-      // Same as inOffset because 16 elements are packed into one uint64_t.
-      int64_t outOffset = inOffset;
-      auto& out_pos = out[outOffset];
-
-      auto sf_out =
-          cvt_quant_to_fp4_get_sf_out_offset<uint32_t, CVT_FP4_NUM_THREADS_PER_SF, SF_VEC_SIZE>(
-              std::nullopt /* batchIdx */, rowIdx, colIdx, std::nullopt /* numRows */, numCols,
-              SFout, layout);
-
-      out_pos =
-          cvt_warp_fp8_to_fp4<__nv_fp8_e4m3, SF_VEC_SIZE, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
-    }
-  }
-#endif
-}
-
 template <BlockScaleQuantizationType quantization_type, class Type, int SF_VEC_SIZE, bool UE8M0_SF>
 __global__ void
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-__launch_bounds__(512, 4) quantize_with_block_size(
+    __launch_bounds__(512, 4) quantize_with_block_size(
 #else
 quantize_with_block_size(
 #endif
-    int32_t numbatches, int32_t numRows, int32_t numCols, Type const* in, float const* SFScale,
-    uint32_t* out, uint32_t* SFout, FP4QuantizationSFLayout layout) {
+        int32_t numbatches, int32_t numRows, int32_t numCols, int32_t numPaddedCols, Type const* in,
+        float const* SFScale, uint32_t* out, uint32_t* SFout, QuantizationSFLayout layout)
+{
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
 
-  // The elements per thread.
-  static constexpr int ELTS_PER_THREAD = quantization_type == BlockScaleQuantizationType::FP8_TO_FP4
-                                             ? CVT_FP8_TO_FP4_ELTS_PER_THREAD
-                                             : CVT_FP4_ELTS_PER_THREAD;
+    // The elements per thread.
+    static constexpr int ELTS_PER_THREAD = quantization_type == BlockScaleQuantizationType::FP8_TO_FP4
+        ? CVT_FP8_TO_FP4_ELTS_PER_THREAD
+        : CVT_ELTS_PER_THREAD;
 
-  using PackedVec = PackedVec<Type>;
-  static constexpr int CVT_NUM_THREADS_PER_SF = SF_VEC_SIZE / ELTS_PER_THREAD;  // 2 or 4
-  static_assert(sizeof(PackedVec) == sizeof(Type) * ELTS_PER_THREAD, "Vec size is not matched.");
+    using PackedVec = PackedVec<Type>;
+    static constexpr int CVT_NUM_THREADS_PER_SF = SF_VEC_SIZE / ELTS_PER_THREAD; // 2 or 4
+    static_assert(sizeof(PackedVec) == sizeof(Type) * ELTS_PER_THREAD, "Vec size is not matched.");
 
-  // Get the global scaling factor, which will be applied to the SF.
-  // Note SFScale is the same as next GEMM's alpha, which is (448.f / (Alpha_A / 6.f)).
-  float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0];
+    // Get the global scaling factor, which will be applied to the SF.
+    // Note SFScale is the same as next GEMM's alpha, which is (448.f / (Alpha_A / 6.f)).
+    float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0];
 
-  int numPaddedRows = numRows;
-  int numPaddedCols = numCols;
-  if (layout == FP4QuantizationSFLayout::SWIZZLED_128x4) {
-    // The number of padded rows considering 128x4 SF layout.
-    numPaddedRows = PadUpFn(numRows, 128);
-    numPaddedCols = PadUpFn(numCols, 4 * SF_VEC_SIZE);
-  } else if (layout == FP4QuantizationSFLayout::SWIZZLED_8x4) {
-    // The number of padded rows considering 8x4 SF layout.
-    numPaddedRows = PadUpFn(numRows, 8);
-    numPaddedCols = PadUpFn(numCols, 4 * SF_VEC_SIZE);
-  }
+    // Is it swizzled layout?
+    bool isSfSwizzledLayout = layout == QuantizationSFLayout::SWIZZLED;
 
-  // The number of threads in the column dimension
-  int numColThreads = numCols / ELTS_PER_THREAD;
-  int numPaddedColThreads = numPaddedCols / ELTS_PER_THREAD;
-
-  asm volatile("griddepcontrol.wait;");
-  // Input tensor batch/row/col loops.
-  for (int rowIdx = blockIdx.x; rowIdx < numPaddedRows; rowIdx += gridDim.x) {
-    for (int batchIdx = 0; batchIdx < numbatches; batchIdx++) {
-      for (int colIdx = threadIdx.x; colIdx < numPaddedColThreads; colIdx += blockDim.x) {
-        std::optional<int> optionalBatchIdx = batchIdx;
-        std::optional<int> optionalNumRows = numRows;
-
-        // The SF output pointer.
-        auto sf_out =
-            cvt_quant_to_fp4_get_sf_out_offset<uint32_t, CVT_NUM_THREADS_PER_SF, SF_VEC_SIZE>(
-                optionalBatchIdx, rowIdx, colIdx, optionalNumRows, numCols, SFout, layout);
-
-        // Set the SF padding to 0.
-        if (rowIdx >= numRows || colIdx >= numColThreads) {
-          if (sf_out != nullptr) {
-            sf_out[0] = 0x00;
-          }
-        } else {
-          int64_t inOffset =
-              static_cast<int64_t>(batchIdx * numRows + rowIdx) * numColThreads + colIdx;
-          PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
-          // Get the output tensor offset as a packed vector.
-          int64_t outOffset = inOffset;
-
-          // Dispatch the quantization kernel.
-          if constexpr (quantization_type == BlockScaleQuantizationType::FP16_TO_FP4) {
-            reinterpret_cast<uint32_t*>(out)[outOffset] =
-                cvt_warp_fp16_to_fp4<Type, SF_VEC_SIZE, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
-          } else if constexpr (quantization_type == BlockScaleQuantizationType::FP8_TO_FP4) {
-            reinterpret_cast<uint64_t*>(out)[outOffset] =
-                cvt_warp_fp8_to_fp4<__nv_fp8_e4m3, SF_VEC_SIZE, UE8M0_SF>(in_vec, SFScaleVal,
-                                                                          sf_out);
-          } else if constexpr (quantization_type == BlockScaleQuantizationType::FP16_TO_MXFP8) {
-            reinterpret_cast<uint64_t*>(out)[outOffset] =
-                cvt_warp_fp16_to_mxfp8<Type, SF_VEC_SIZE>(in_vec, sf_out);
-          }
+    // The SF extents: rows and columns are rounded up only for the swizzled 128x4 SF layout.
+    int numPaddedRowsForSf = isSfSwizzledLayout ? PadUpFn(numRows, 128) : numRows;
+    int numColsForSf = isSfSwizzledLayout ? PadUpFn(numPaddedCols, 4 * SF_VEC_SIZE) : numPaddedCols;
+
+    // The number of threads in the column dimension.
+    // Note that numCols/numPaddedCols/numColsForSf are guaranteed to be multiples of ELTS_PER_THREAD.
+    int numColThreads = numCols / ELTS_PER_THREAD;
+    int numPaddedColThreads = numPaddedCols / ELTS_PER_THREAD;
+    int numColThreadsForSf = numColsForSf / ELTS_PER_THREAD;
+
+    asm volatile("griddepcontrol.wait;");
+    // Input tensor batch/row/col loops.
+    for (int rowIdx = blockIdx.x; rowIdx < numPaddedRowsForSf; rowIdx += gridDim.x)
+    {
+        for (int batchIdx = 0; batchIdx < numbatches; batchIdx++)
+        {
+            for (int colIdx = threadIdx.x; colIdx < numColThreadsForSf; colIdx += blockDim.x)
+            {
+                std::optional<int> optionalBatchIdx = batchIdx;
+                std::optional<int> optionalNumRows = numRows;
+
+                // The SF output pointer.
+                auto sf_out = cvt_quant_get_sf_out_offset<uint32_t, CVT_NUM_THREADS_PER_SF>(
+                    optionalBatchIdx, rowIdx, colIdx, optionalNumRows, numPaddedCols / SF_VEC_SIZE, SFout, layout);
+
+                // The input and output tensor offsets, in units of packed vectors.
+                int64_t inOffset = static_cast<int64_t>(batchIdx * numRows + rowIdx) * numColThreads + colIdx;
+                int64_t outOffset = static_cast<int64_t>(batchIdx * numRows + rowIdx) * numPaddedColThreads + colIdx;
+
+                // Zero the output values that fall in the padded columns of a valid row.
+                if (rowIdx < numRows && colIdx >= numColThreads && colIdx < numPaddedColThreads)
+                {
+                    // Pick the store width that matches the packed output type.
+                    if constexpr (quantization_type == BlockScaleQuantizationType::FP16_TO_FP4)
+                    {
+                        reinterpret_cast<uint32_t*>(out)[outOffset] = 0u;
+                    }
+                    else if constexpr (quantization_type == BlockScaleQuantizationType::FP8_TO_FP4
+                        || quantization_type == BlockScaleQuantizationType::FP16_TO_MXFP8)
+                    {
+                        reinterpret_cast<uint64_t*>(out)[outOffset] = 0ull;
+                    }
+                }
+
+                // Set the SF padding to 0.
+                if (rowIdx >= numRows || colIdx >= numColThreads)
+                {
+                    // sf_out is null for threads that do not own an SF slot.
+                    if (sf_out != nullptr)
+                    {
+                        sf_out[0] = 0x00;
+                    }
+                }
+                else
+                {
+                    // Load the input vector.
+                    PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
+
+                    // Dispatch the quantization kernel.
+                    if constexpr (quantization_type == BlockScaleQuantizationType::FP16_TO_FP4)
+                    {
+                        reinterpret_cast<uint32_t*>(out)[outOffset]
+                            = cvt_warp_fp16_to_fp4<Type, SF_VEC_SIZE, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
+                    }
+                    else if constexpr (quantization_type == BlockScaleQuantizationType::FP8_TO_FP4)
+                    {
+                        reinterpret_cast<uint64_t*>(out)[outOffset]
+                            = cvt_warp_fp8_to_fp4<__nv_fp8_e4m3, SF_VEC_SIZE, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
+                    }
+                    else if constexpr (quantization_type == BlockScaleQuantizationType::FP16_TO_MXFP8)
+                    {
+                        reinterpret_cast<uint64_t*>(out)[outOffset]
+                            = cvt_warp_fp16_to_mxfp8<Type, SF_VEC_SIZE>(in_vec, sf_out);
+                    }
+                }
+            }
         }
-      }
     }
-  }
-  asm volatile("griddepcontrol.launch_dependents;");
+    asm volatile("griddepcontrol.launch_dependents;");
 #endif
 }
 
-__global__ void nvfp4_block_scale_interleave_kernel(int numbatches, int numRows, int numCols,
+
+__global__ void block_scale_interleave_kernel(int numbatches, int numRows, int numCols,
                                                     uint8_t const* SFIn, uint8_t* SFOutput);
 }  // namespace kernels
 }  // namespace tensorrt_llm
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/quantization.h b/csrc/nv_internal/tensorrt_llm/kernels/quantization.h
index 909b2bd33..1ef3a7982 100644
--- a/csrc/nv_internal/tensorrt_llm/kernels/quantization.h
+++ b/csrc/nv_internal/tensorrt_llm/kernels/quantization.h
@@ -22,7 +22,7 @@
 
 namespace tensorrt_llm {
 
-enum class FP4QuantizationSFLayout {
+enum class QuantizationSFLayout {
   // Block scale factors are stored in swizzled layout for cutlass FP4 kernel. Scale factor
   // blocks are organized in 512-byte blocks in global memory, with each block having 128x4 FP8
   // values. The SF matrix dimensions are therefore padded - rows to the nearest multiple of 128 and
@@ -31,10 +31,7 @@ enum class FP4QuantizationSFLayout {
   // The scale factor block rows map to data block rows in an interleaved pattern:
   // For a scale factor row 'i', it maps to data block row: (i % 4) * 32 + (i / 4)
   // Column 'j' in the scale factor block corresponds to scaling the j-th block in the data tensor.
-  SWIZZLED_128x4,
-
-  // Similar to SWIZZLED_128x4, but with 8x4 scale factor blocks.
-  SWIZZLED_8x4,
+  SWIZZLED,
 
   // Block scale factors are stored in linear layout (row-major). This is used in some trtllm-gen
   // kernels standard.
@@ -51,14 +48,14 @@ enum class BlockScaleQuantizationType {
 #define PadUpFn(X, Y) ((X + Y - 1) / (Y) * (Y))
 
 // totalCloumn should be in SFMatrix, not activation Matrix, so no sfVecSize needed.
-inline int computeFP4SwizzledLayoutSFSize(int totalRow, int totalColumn, int rowSize = 128) {
+inline int64_t  computeSwizzledLayoutSFSize(int totalRow, int totalColumn, int rowSize = 128) {
   int paddedRow = PadUpFn(totalRow, rowSize);
   int paddedColumn = PadUpFn(totalColumn, 4);
-  return paddedRow * paddedColumn;
+  return static_cast<int64_t>(paddedRow) * paddedColumn;
 }
 
-inline int computeFP4LinearLayoutSFSize(int totalRow, int totalColumn) {
-  return totalRow * totalColumn;
+inline int64_t computeLinearLayoutSFSize(int totalRow, int totalColumn) {
+  return static_cast<int64_t>(totalRow) * totalColumn;
 }
 
 namespace kernels {
@@ -73,29 +70,20 @@ void invokePerTokenQuantization(QuantT* dst, T const* src, int64_t const numRows
                                 float* sumPtr, tensorrt_llm::common::QuantMode quantMode,
                                 cudaStream_t stream = 0);
 
-template <typename T, int SF_VEC_SIZE = 16>
-void invokeFP4Quantization(int m, int n, T const* input, float const* globalScale, int64_t* output,
-                           int32_t* SFOuput, bool useUE8M0, FP4QuantizationSFLayout layout,
+template <typename T, int SF_VEC_SIZE>
+void invokeFP4Quantization(int b, int m, int n, T const* input, float const* globalScale, int64_t* output,
+                           int32_t* SFOuput, bool useUE8M0, QuantizationSFLayout layout,
                            int multiProcessorCount, cudaStream_t stream = 0);
 
-template <typename T, int SF_VEC_SIZE = 16>
-void invokeBatchedFP4Quantization(
-    int b, int m, int n, T const* input, float const* globalScale, int64_t* output,
-    int32_t* SFOuput, bool useUE8M0, int multiProcessorCount,
-    FP4QuantizationSFLayout layout = FP4QuantizationSFLayout::SWIZZLED_128x4,
-    cudaStream_t stream = 0);
-
-void invokeNVFP4BlockScaleInterleave(int b, int m, int m_padded, int n, int n_padded,
-                                     uint8_t const* SFIn, uint8_t* SFOutput,
-                                     int multiProcessorCount, cudaStream_t stream = 0);
+void invokeBlockScaleInterleave(int b, int m, int m_padded, int n, int n_padded, uint8_t const* SFIn,
+                                uint8_t* SFOutput, int multiProcessorCount, cudaStream_t stream = 0);
 
-void invokeNVFP4BlockScaleInterleaveReverse(int b, int m, int n, uint8_t const* SFIn,
-                                            uint8_t* SFOutput, int multiProcessorCount,
-                                            cudaStream_t stream = 0);
+void invokeBlockScaleInterleaveReverse(int b, int m, int n, uint8_t const* SFIn, uint8_t* SFOutput,
+                                       int multiProcessorCount, cudaStream_t stream = 0);
 
 template <typename T>
-void invokeMxFP8Quantization(int b, int m, int n, T const* input, int64_t* output, int32_t* SFOuput,
-                             FP4QuantizationSFLayout layout, int multiProcessorCount,
+void invokeMxFP8Quantization(int b, int m, int n, int padded_n, T const* input, int64_t* output, int32_t* SFOuput,
+                             QuantizationSFLayout layout, int multiProcessorCount,
                              cudaStream_t stream = 0);
 
 }  // namespace kernels
diff --git a/csrc/nv_internal/tensorrt_llm/thop/fp4Op.cpp b/csrc/nv_internal/tensorrt_llm/thop/fp4Op.cpp
index 5defced2e..a47bebc33 100644
--- a/csrc/nv_internal/tensorrt_llm/thop/fp4Op.cpp
+++ b/csrc/nv_internal/tensorrt_llm/thop/fp4Op.cpp
@@ -100,13 +100,13 @@ float e2M1ToFloat(uint8_t value) {
 // SFMatrix. colIdx and totalCloumn should be in SFMatrix, not activation Matrix, so no sfVecSize
 // needed.
 int computeSFIndex(int rowIdx, int colIdx, int totalRow, int totalColumn,
-                   tensorrt_llm::FP4QuantizationSFLayout layout) {
+                   tensorrt_llm::QuantizationSFLayout layout) {
   constexpr int kColumnGroup0Size = 4;
   constexpr int kRowGroup0Size = 32;
   constexpr int kRowGroup1Size = kRowGroup0Size * 4;
 
   // Swizzled layout is used as default layout.
-  if (layout == tensorrt_llm::FP4QuantizationSFLayout::SWIZZLED_128x4) {
+  if (layout == tensorrt_llm::QuantizationSFLayout::SWIZZLED) {
     // int paddedRow = PadUpFn(totalRow, 128);
     int paddedColumn = PadUpFn(totalColumn, 4);
 
@@ -126,7 +126,7 @@ int computeSFIndex(int rowIdx, int colIdx, int totalRow, int totalColumn,
            rowGroupIdx * rowGroupStride;
   }
   // Linear layout is only used in E2M1AndUFP8SFScaleToFloatV2.
-  else if (layout == tensorrt_llm::FP4QuantizationSFLayout::LINEAR) {
+  else if (layout == tensorrt_llm::QuantizationSFLayout::LINEAR) {
     // no padding needed. totalColumn is multiple of kVecSize.
     return rowIdx * totalColumn + colIdx;
   } else {
@@ -137,7 +137,7 @@ int computeSFIndex(int rowIdx, int colIdx, int totalRow, int totalColumn,
 // Interleave (and possibly pad) the weights block scaling factor.
 // blockScale: [num_experts, rows, cols] or [rows, cols]
 // Return: num_experts * pad_up(rows, 128) * pad_up(cols, 4)
-at::Tensor NVFP4BlockScaleInterleave(at::Tensor const& blockScale) {
+at::Tensor BlockScaleInterleave(at::Tensor const& blockScale) {
   bool is_cuda = blockScale.device().is_cuda();
   if (is_cuda) {
     CHECK_INPUT_TYPE(blockScale, SF_DTYPE);
@@ -151,7 +151,7 @@ at::Tensor NVFP4BlockScaleInterleave(at::Tensor const& blockScale) {
   auto rows = blockScaleShape.size() == 3 ? blockScaleShape[1] : blockScaleShape[0];
   auto cols = blockScaleShape.size() == 3 ? blockScaleShape[2] : blockScaleShape[1];
 
-  auto expert_out_size = tensorrt_llm::computeFP4SwizzledLayoutSFSize(rows, cols);
+  auto expert_out_size = tensorrt_llm::computeSwizzledLayoutSFSize(rows, cols);
   auto rows_padded = PadUpFn(rows, 128);
   auto cols_padded = PadUpFn(cols, 4);
   TORCH_CHECK(expert_out_size == rows_padded * cols_padded,
@@ -163,7 +163,7 @@ at::Tensor NVFP4BlockScaleInterleave(at::Tensor const& blockScale) {
   if (is_cuda) {
     const thread_local int smCount = tensorrt_llm::common::getMultiProcessorCount();
     auto stream = at::cuda::getCurrentCUDAStream(blockScale.get_device());
-    tensorrt_llm::kernels::invokeNVFP4BlockScaleInterleave(
+    tensorrt_llm::kernels::invokeBlockScaleInterleave(
         num_experts, rows, rows_padded, cols, cols_padded, blockScale.data_ptr<uint8_t>(),
         static_cast<uint8_t*>(interleavedBlockScale.data_ptr()), smCount, stream);
   } else {
@@ -179,7 +179,7 @@ at::Tensor NVFP4BlockScaleInterleave(at::Tensor const& blockScale) {
             sf_ori = blockScalePtr[cIdx];
           }
           int sf_index = computeSFIndex(rIdx, cIdx, rows, cols,
-                                        tensorrt_llm::FP4QuantizationSFLayout::SWIZZLED_128x4);
+                                        tensorrt_llm::QuantizationSFLayout::SWIZZLED);
           interleavedBlockScalePtr[sf_index] = sf_ori;
         }
       }
@@ -193,7 +193,7 @@ at::Tensor NVFP4BlockScaleInterleave(at::Tensor const& blockScale) {
 // blockScale: [num_experts, rows, cols] or [rows, cols]
 // Note: rows and cols are the dimensions of the original unswizzled SFMatrix, so reshape input
 // before passing into this function! Return: The same shape as blockScale
-at::Tensor NVFP4BlockScaleInterleaveReverse(at::Tensor const& blockScale) {
+at::Tensor BlockScaleInterleaveReverse(at::Tensor const& blockScale) {
   bool is_cuda = blockScale.device().is_cuda();
   if (is_cuda) {
     CHECK_INPUT_TYPE(blockScale, SF_DTYPE);
@@ -215,7 +215,7 @@ at::Tensor NVFP4BlockScaleInterleaveReverse(at::Tensor const& blockScale) {
   if (is_cuda) {
     const thread_local int smCount = tensorrt_llm::common::getMultiProcessorCount();
     auto stream = at::cuda::getCurrentCUDAStream(blockScale.get_device());
-    tensorrt_llm::kernels::invokeNVFP4BlockScaleInterleaveReverse(
+    tensorrt_llm::kernels::invokeBlockScaleInterleaveReverse(
         num_experts, rows, cols, blockScale.data_ptr<uint8_t>(),
         static_cast<uint8_t*>(reversedBlockScale.data_ptr()), smCount, stream);
   } else {
@@ -225,7 +225,7 @@ at::Tensor NVFP4BlockScaleInterleaveReverse(at::Tensor const& blockScale) {
       for (int rIdx = 0; rIdx < rows; ++rIdx) {
         for (int cIdx = 0; cIdx < cols; ++cIdx) {
           int sf_index = computeSFIndex(rIdx, cIdx, rows, cols,
-                                        tensorrt_llm::FP4QuantizationSFLayout::SWIZZLED_128x4);
+                                        tensorrt_llm::QuantizationSFLayout::SWIZZLED);
           identity[eIdx * expert_out_size + sf_index] = std::array<int, 3>{eIdx, rIdx, cIdx};
         }
       }
@@ -267,7 +267,7 @@ at::Tensor E2M1AndUFP8SFScaleToFloat(at::Tensor valueE2M1, at::Tensor scaleFP8SF
       uint8_t* scaleFP8SFPtr = scaleFP8SF.data_ptr<uint8_t>();
       uint8_t fp8Scale =
           scaleFP8SFPtr[computeSFIndex(vIdx, group, packedShape[0], groupsPerHiddenDim,
-                                       tensorrt_llm::FP4QuantizationSFLayout::SWIZZLED_128x4)];
+                                       tensorrt_llm::QuantizationSFLayout::SWIZZLED)];
       int scale = fp8Scale;
       if (sfType == 0) {
         scale -= 127;
@@ -292,7 +292,7 @@ at::Tensor E2M1AndUFP8SFScaleToFloat(at::Tensor valueE2M1, at::Tensor scaleFP8SF
 
 // Used by the (fp16 -> int4) quant layer + int4 gemm network.
 at::Tensor E2M1AndUFP8SFScaleToFloatV2(at::Tensor valueE2M1, at::Tensor scaleFP8SF,
-                                       at::Tensor globalScale, int64_t sfVecSize, int64_t sfType,
+  std::optional<at::Tensor> globalScale, int64_t sfVecSize, int64_t sfType,
                                        bool isSfSwizzledLayout = true) {
   CHECK_CPU_INPUT(valueE2M1, FLOAT4_E2M1X2);
   CHECK_CPU_INPUT(scaleFP8SF, SF_DTYPE);
@@ -303,16 +303,23 @@ at::Tensor E2M1AndUFP8SFScaleToFloatV2(at::Tensor valueE2M1, at::Tensor scaleFP8
   at::Tensor floatTensor = at::zeros({packedShape[0], packedShape[1] * 2},
                                      at::dtype(at::ScalarType::Float).requires_grad(false));
 
-  CHECK_CPU_INPUT(globalScale, at::ScalarType::Float);
-  float globalScaleVal = globalScale.data_ptr<float>()[0];
+  // globalScale is dereferenced on the host for sfType == 1; otherwise the scale stays 1.0f.
+  float globalScaleVal{1.0f};
+  if (sfType == 1)
+  {
+      TORCH_CHECK(globalScale.has_value(), "globalScale is required when sfType is 1.");
+      CHECK_CPU_INPUT(globalScale.value(), at::ScalarType::Float);
+      globalScaleVal = globalScale->data_ptr<float>()[0];
+  }
+
 
   int hiddenDim = packedShape[1] * 2;
   int packedFp4HiddenDim = hiddenDim / 2;
   int groupsPerHiddenDim = hiddenDim / sfVecSize;
 
-  tensorrt_llm::FP4QuantizationSFLayout layout =
-      isSfSwizzledLayout ? tensorrt_llm::FP4QuantizationSFLayout::SWIZZLED_128x4
-                         : tensorrt_llm::FP4QuantizationSFLayout::LINEAR;
+  tensorrt_llm::QuantizationSFLayout layout =
+      isSfSwizzledLayout ? tensorrt_llm::QuantizationSFLayout::SWIZZLED
+                         : tensorrt_llm::QuantizationSFLayout::LINEAR;
 
   for (size_t vIdx = 0; vIdx < static_cast<size_t>(packedShape[0]); ++vIdx) {
     for (int group = 0; group < groupsPerHiddenDim; ++group) {
@@ -347,7 +354,7 @@ at::Tensor E2M1AndUFP8SFScaleToFloatV2(at::Tensor valueE2M1, at::Tensor scaleFP8
 }  // namespace torch_ext
 
 TORCH_LIBRARY_FRAGMENT(TORCH_EXTENSION_NAME, m) {
-  m.def("nvfp4_block_scale_interleave", &torch_ext::NVFP4BlockScaleInterleave);
-  m.def("nvfp4_block_scale_interleave_reverse", &torch_ext::NVFP4BlockScaleInterleaveReverse);
+  m.def("block_scale_interleave", &torch_ext::BlockScaleInterleave);
+  m.def("block_scale_interleave_reverse", &torch_ext::BlockScaleInterleaveReverse);
   m.def("e2m1_and_ufp8sf_scale_to_float", &torch_ext::E2M1AndUFP8SFScaleToFloatV2);
 }
diff --git a/csrc/nv_internal/tensorrt_llm/thop/fp4Quantize.cpp b/csrc/nv_internal/tensorrt_llm/thop/fp4Quantize.cpp
index c231e2347..0ac21710d 100644
--- a/csrc/nv_internal/tensorrt_llm/thop/fp4Quantize.cpp
+++ b/csrc/nv_internal/tensorrt_llm/thop/fp4Quantize.cpp
@@ -35,17 +35,31 @@ namespace torch_ext {
 // mxfp4: sfVecSize = 32, sfUseUE8M0 = true
 // alignment: sfVecSize
 // isSfSwizzledLayout: bool, if true, the scale factors are stored in swizzled layout, otherwise in
-// linear layout. See FP4QuantizationSFLayout enum for more details about the two layouts. returns
+// linear layout. See QuantizationSFLayout enum for more details about the two layouts. returns
 // self_fp4, self_block_scale_factors self_fp4: [M, K / 2], FLOAT4_E2M1X2 self_block_scale_factors:
 // ceil(M / 128) * 128 * ceil(K / sfVecSize / 4) * 4, SF_DTYPE (UE4M3 or UE8M0)
 std::tuple<at::Tensor, at::Tensor> fp4_quantize(at::Tensor const& self,
-                                                at::Tensor const& globalScale, int64_t sfVecSize,
+  std::optional<at::Tensor> const& globalScale, int64_t sfVecSize,
                                                 bool sfUseUE8M0, bool isSfSwizzledLayout,
                                                 bool isSf8x4Layout) {
   CHECK_TH_CUDA(self);
   CHECK_CONTIGUOUS(self);
-  CHECK_INPUT_TYPE(globalScale, c10::ScalarType::Float);
-  TORCH_CHECK(sfVecSize == 16 || sfVecSize == 32, "sfVecSize can only be 16 or 32");
+  if (sfUseUE8M0)
+  {
+      TORCH_CHECK(sfVecSize == 32, "sfVecSize can only be 32, when sfUseUE8M0 is true");
+  }
+  else
+  {
+      TORCH_CHECK(globalScale.has_value(), "globalScale is required when sfUseUE8M0 is false");
+      TORCH_CHECK(sfVecSize == 16, "sfVecSize can only be 16, when sfUseUE8M0 is false");
+  }
+  // A supplied globalScale must be a float tensor in both modes; the pointer stays null when it is omitted.
+  float* globalScalePtr{nullptr};
+  if (globalScale.has_value())
+  {
+      CHECK_INPUT_TYPE(globalScale.value(), c10::ScalarType::Float);
+      globalScalePtr = globalScale->data_ptr<float>();
+  }
 
   auto const& inputShape = self.sizes();
   auto const& rank = inputShape.size();
@@ -66,27 +80,21 @@ std::tuple<at::Tensor, at::Tensor> fp4_quantize(at::Tensor const& self,
 
   int64_t SFSize =
       isSfSwizzledLayout
-          ? tensorrt_llm::computeFP4SwizzledLayoutSFSize(m, k / sfVecSize, isSf8x4Layout ? 8 : 128)
-          : tensorrt_llm::computeFP4LinearLayoutSFSize(m, k / sfVecSize);
+          ? tensorrt_llm::computeSwizzledLayoutSFSize(m, k / sfVecSize, isSf8x4Layout ? 8 : 128)
+          : tensorrt_llm::computeLinearLayoutSFSize(m, k / sfVecSize);
 
   at::Tensor scaleFP8SF = at::detail::empty_cuda({SFSize}, SF_DTYPE, self.device(),
                                                  /* stride */ std::nullopt);  // 1D tensor
 
   const thread_local int mMultiProcessorCount = tensorrt_llm::common::getMultiProcessorCount();
 
-  auto layout = tensorrt_llm::FP4QuantizationSFLayout::LINEAR;
-  if (isSf8x4Layout) {
-    TORCH_CHECK(isSfSwizzledLayout, "8x4layout must be swizzled layout");
-    layout = tensorrt_llm::FP4QuantizationSFLayout::SWIZZLED_8x4;
-  } else {
-    layout = isSfSwizzledLayout ? tensorrt_llm::FP4QuantizationSFLayout::SWIZZLED_128x4
-                                : tensorrt_llm::FP4QuantizationSFLayout::LINEAR;
-  }
+  // SFSize above shrinks to 8-row padding for isSf8x4Layout, but no 8x4 layout is handed to the kernel.
+  TORCH_CHECK(!isSf8x4Layout, "isSf8x4Layout is not supported by the SWIZZLED/LINEAR scale-factor layouts");
+  auto layout = isSfSwizzledLayout ? tensorrt_llm::QuantizationSFLayout::SWIZZLED : tensorrt_llm::QuantizationSFLayout::LINEAR;
 
-#define LAUNCH_FP4_QUANTIZE_KERNEL(T, SF_VEC_SIZE)                                                 \
-  tensorrt_llm::kernels::invokeFP4Quantization<T, SF_VEC_SIZE>(                                    \
-      m, k, reinterpret_cast<T*>(self.data_ptr()), globalScale.data_ptr<float>(),                  \
-      reinterpret_cast<int64_t*>(valueE2M1.data_ptr()),                                            \
+  #define LAUNCH_FP4_QUANTIZE_KERNEL(T, SF_VEC_SIZE)                                                                     \
+  tensorrt_llm::kernels::invokeFP4Quantization<T, SF_VEC_SIZE>(1, m, k, reinterpret_cast<T*>(self.data_ptr()),       \
+      globalScalePtr, reinterpret_cast<int64_t*>(valueE2M1.data_ptr()),                                              \
       reinterpret_cast<int32_t*>(scaleFP8SF.data_ptr()), sfUseUE8M0, layout, mMultiProcessorCount, \
       at::cuda::getCurrentCUDAStream(self.get_device()));
 
diff --git a/csrc/nv_internal/tensorrt_llm/thop/fp4Quantize.h b/csrc/nv_internal/tensorrt_llm/thop/fp4Quantize.h
index a264f5341..43134f409 100644
--- a/csrc/nv_internal/tensorrt_llm/thop/fp4Quantize.h
+++ b/csrc/nv_internal/tensorrt_llm/thop/fp4Quantize.h
@@ -18,12 +18,13 @@
 #include <ATen/cuda/EmptyTensor.h>
 
 #include <cstdint>
+#include <optional>
 
 #include "tensorrt_llm/common/cudaUtils.h"
 
 namespace torch_ext {
 std::tuple<at::Tensor, at::Tensor> fp4_quantize(at::Tensor const& self,
-                                                at::Tensor const& globalScale, int64_t sfVecSize,
+                                                std::optional<at::Tensor> const& globalScale, int64_t sfVecSize,
                                                 bool sfUseUE8M0, bool isSfSwizzledLayout,
                                                 bool isSf8x4Layout);
 }  // namespace torch_ext
diff --git a/csrc/nv_internal/tensorrt_llm/thop/fp8Quantize.cpp b/csrc/nv_internal/tensorrt_llm/thop/fp8Quantize.cpp
index cb3c43d95..9ccfdd4cf 100644
--- a/csrc/nv_internal/tensorrt_llm/thop/fp8Quantize.cpp
+++ b/csrc/nv_internal/tensorrt_llm/thop/fp8Quantize.cpp
@@ -26,12 +26,16 @@ namespace torch_ext {
 
 // input: [M, K], fp32/fp16/bf16/fp8_quantized
 // isSfSwizzledLayout: bool, if true, the scale factors are stored in swizzled layout, otherwise in
-// linear layout. See FP4QuantizationSFLayout enum for more details about the two layouts.
+// linear layout. See QuantizationSFLayout enum for more details about the two layouts.
 // returns
-std::tuple<at::Tensor, at::Tensor> mxfp8_quantize(at::Tensor input, bool isSfSwizzledLayout) {
+std::tuple<at::Tensor, at::Tensor> mxfp8_quantize(at::Tensor input, bool isSfSwizzledLayout, int64_t alignment) {
   CHECK_TH_CUDA(input);
   CHECK_CONTIGUOUS(input);
 
+  // Scale-factor block size is fixed at 32; alignment must be positive because k is divided by it below.
+  static constexpr int SF_VEC_SIZE = 32;
+  TORCH_CHECK(alignment > 0 && alignment % SF_VEC_SIZE == 0, "alignment must be a positive multiple of SF_VEC_SIZE = 32");
+
   auto const& inputShape = input.sizes();
   auto const& rank = inputShape.size();
 
@@ -41,52 +45,49 @@ std::tuple<at::Tensor, at::Tensor> mxfp8_quantize(at::Tensor input, bool isSfSwi
     m *= inputShape[i];
   }
   auto const k = inputShape[rank - 1];
-  int32_t const sfVecSize = 32;
-  TORCH_CHECK(k % sfVecSize == 0);
+  TORCH_CHECK(k % SF_VEC_SIZE == 0, "k must be divisible by SF_VEC_SIZE = 32");
+  auto const padded_k = ((k + alignment - 1) / alignment) * alignment;
 
   std::vector<int64_t> outputShape(inputShape.begin(), inputShape.end());
-  outputShape[rank - 1] = k;
+  outputShape[rank - 1] = padded_k;
 
-  at::Tensor valueFP8 =
-      at::detail::empty_cuda(outputShape, at::ScalarType::Float8_e4m3fn, input.device(),
-                             /* stride */ std::nullopt);
+  at::Tensor valMxFP8
+      = at::detail::empty_cuda(outputShape, at::ScalarType::Float8_e4m3fn, input.device(), /* stride */ std::nullopt);
 
-  int64_t SFSize = isSfSwizzledLayout
-                       ? tensorrt_llm::computeFP4SwizzledLayoutSFSize(m, k / sfVecSize)
-                       : tensorrt_llm::computeFP4LinearLayoutSFSize(m, k / sfVecSize);
+  int64_t SFSize = isSfSwizzledLayout ? tensorrt_llm::computeSwizzledLayoutSFSize(m, padded_k / SF_VEC_SIZE)
+                                      : tensorrt_llm::computeLinearLayoutSFSize(m, padded_k / SF_VEC_SIZE);
 
-  at::Tensor scaleFP8SF = at::detail::empty_cuda({SFSize}, SF_DTYPE, input.device(),
-                                                 /* stride */ std::nullopt);  // 1D tensor
+  at::Tensor scaleFP8SF
+      = at::detail::empty_cuda({SFSize}, SF_DTYPE, input.device(), /* stride */ std::nullopt); // 1D tensor
 
   const thread_local int mMultiProcessorCount = tensorrt_llm::common::getMultiProcessorCount();
 
-  auto const layout = isSfSwizzledLayout ? tensorrt_llm::FP4QuantizationSFLayout::SWIZZLED_128x4
-                                         : tensorrt_llm::FP4QuantizationSFLayout::LINEAR;
+  auto const layout = isSfSwizzledLayout ? tensorrt_llm::QuantizationSFLayout::SWIZZLED
+                                         : tensorrt_llm::QuantizationSFLayout::LINEAR;
 
-#define LAUNCH_MXFP8_QUANTIZE_KERNEL(T)                                                \
-  tensorrt_llm::kernels::invokeMxFP8Quantization<T>(                                   \
-      1, m, k, reinterpret_cast<T*>(input.data_ptr()),                                 \
-      reinterpret_cast<int64_t*>(valueFP8.data_ptr()),                                 \
-      reinterpret_cast<int32_t*>(scaleFP8SF.data_ptr()), layout, mMultiProcessorCount, \
-      at::cuda::getCurrentCUDAStream(input.get_device()));
+#define LAUNCH_MXFP8_QUANTIZE_KERNEL(T)                                                                                \
+  tensorrt_llm::kernels::invokeMxFP8Quantization(1, m, k, padded_k, reinterpret_cast<T*>(input.data_ptr()),             \
+      reinterpret_cast<int64_t*>(valMxFP8.data_ptr()), reinterpret_cast<int32_t*>(scaleFP8SF.data_ptr()), layout,      \
+      mMultiProcessorCount, at::cuda::getCurrentCUDAStream(input.get_device()));
 
-  if (input.scalar_type() == at::ScalarType::Half) {
-    LAUNCH_MXFP8_QUANTIZE_KERNEL(half)
-  } else if (input.scalar_type() == at::ScalarType::BFloat16) {
+  if (input.scalar_type() == at::ScalarType::Half)
+  {
+      LAUNCH_MXFP8_QUANTIZE_KERNEL(half)
+  }
+  else if (input.scalar_type() == at::ScalarType::BFloat16)
+  {
 #ifdef ENABLE_BF16
-    LAUNCH_MXFP8_QUANTIZE_KERNEL(__nv_bfloat16)
+      LAUNCH_MXFP8_QUANTIZE_KERNEL(__nv_bfloat16)
 #else
-    C10_THROW_ERROR(NotImplementedError,
-                    "BFloat16 must be enabled to quantize an bf16 tensor to mxfp8.");
+    C10_THROW_ERROR(NotImplementedError, "BFloat16 must be enabled to quantize an bf16 tensor to mxfp8.");
 #endif
   } else {
-    C10_THROW_ERROR(NotImplementedError,
-                    "mxfp8_quantize only supports input tensor with dtypes fp16/bf16.");
+    C10_THROW_ERROR(NotImplementedError, "mxfp8_quantize only supports input tensor with dtypes fp16/bf16.");
   }
 
 #undef LAUNCH_MXFP8_QUANTIZE_KERNEL
 
-  return {valueFP8, scaleFP8SF};
+  return {valMxFP8, scaleFP8SF};
 }
 
 inline uint8_t float_to_ue8m0(float value) {
@@ -119,14 +120,14 @@ std::tuple<at::Tensor, at::Tensor> mxfp8_quantize_host(at::Tensor x_fp32,
                                                 /* pinned */ true, at::MemoryFormat::Contiguous);
   int64_t sf_size =
       is_sf_swizzled_layout
-          ? tensorrt_llm::computeFP4SwizzledLayoutSFSize(num_tokens, hidden_dim / sf_vec_size)
-          : tensorrt_llm::computeFP4LinearLayoutSFSize(num_tokens, hidden_dim / sf_vec_size);
+          ? tensorrt_llm::computeSwizzledLayoutSFSize(num_tokens, hidden_dim / sf_vec_size)
+          : tensorrt_llm::computeLinearLayoutSFSize(num_tokens, hidden_dim / sf_vec_size);
   at::Tensor scale_tensor =
       at::detail::empty_cpu({sf_size}, SF_DTYPE, /* pinned */ true, at::MemoryFormat::Contiguous);
 
-  tensorrt_llm::FP4QuantizationSFLayout layout =
-      is_sf_swizzled_layout ? tensorrt_llm::FP4QuantizationSFLayout::SWIZZLED_128x4
-                            : tensorrt_llm::FP4QuantizationSFLayout::LINEAR;
+  tensorrt_llm::QuantizationSFLayout layout =
+      is_sf_swizzled_layout ? tensorrt_llm::QuantizationSFLayout::SWIZZLED
+                            : tensorrt_llm::QuantizationSFLayout::LINEAR;
 
   for (size_t ti = 0; ti < static_cast<size_t>(data_shape[0]); ++ti) {
     for (int group = 0; group < groups_per_hidden_dim; ++group) {
@@ -175,9 +176,9 @@ at::Tensor mxfp8_dequantize_host(at::Tensor value_e4m3, at::Tensor scale_ue8m08s
   int hidden_dim = data_shape[1];
   int groups_per_hidden_dim = hidden_dim / sf_vec_size;
 
-  tensorrt_llm::FP4QuantizationSFLayout layout =
-      is_sf_swizzled_layout ? tensorrt_llm::FP4QuantizationSFLayout::SWIZZLED_128x4
-                            : tensorrt_llm::FP4QuantizationSFLayout::LINEAR;
+  tensorrt_llm::QuantizationSFLayout layout =
+      is_sf_swizzled_layout ? tensorrt_llm::QuantizationSFLayout::SWIZZLED
+                            : tensorrt_llm::QuantizationSFLayout::LINEAR;
   for (size_t ti = 0; ti < static_cast<size_t>(data_shape[0]); ++ti) {
     for (int group = 0; group < groups_per_hidden_dim; ++group) {
       float* float_ptr = float_tensor.data_ptr<float>() + ti * hidden_dim + group * sf_vec_size;
diff --git a/csrc/nv_internal/tensorrt_llm/thop/fp8Quantize.h b/csrc/nv_internal/tensorrt_llm/thop/fp8Quantize.h
index 9bbbbc623..677ba0246 100644
--- a/csrc/nv_internal/tensorrt_llm/thop/fp8Quantize.h
+++ b/csrc/nv_internal/tensorrt_llm/thop/fp8Quantize.h
@@ -25,13 +25,13 @@
 namespace torch_ext {
 // colIdx and totalCloumn should be in SFMatrix, not activation Matrix, so no sfVecSize needed.
 inline int computeSFIndex(int rowIdx, int colIdx, int totalRow, int totalColumn,
-                          tensorrt_llm::FP4QuantizationSFLayout layout, bool useUE8M0 = false) {
+                          tensorrt_llm::QuantizationSFLayout layout, bool useUE8M0 = false) {
   constexpr int kColumnGroup0Size = 4;
   constexpr int kRowGroup0Size = 32;
   constexpr int kRowGroup1Size = kRowGroup0Size * 4;
 
   // Swizzled layout is used as default layout.
-  if (layout == tensorrt_llm::FP4QuantizationSFLayout::SWIZZLED_128x4) {
+  if (layout == tensorrt_llm::QuantizationSFLayout::SWIZZLED) {
     // int paddedRow = PadUpFn(totalRow, 128);
     int paddedColumn = PadUpFn(totalColumn, 4);
 
@@ -51,7 +51,7 @@ inline int computeSFIndex(int rowIdx, int colIdx, int totalRow, int totalColumn,
            rowGroupIdx * rowGroupStride;
   }
   // Linear layout is only used in E2M1AndUFP8SFScaleToFloatV2.
-  else if (layout == tensorrt_llm::FP4QuantizationSFLayout::LINEAR) {
+  else if (layout == tensorrt_llm::QuantizationSFLayout::LINEAR) {
     // no padding needed. totalColumn is multiple of kVecSize.
     return rowIdx * totalColumn + colIdx;
   } else {
@@ -64,7 +64,8 @@ inline int computeSFIndex(int rowIdx, int colIdx, int totalRow, int totalColumn,
 // linear layout. See FP4QuantizationSFLayout enum for more details about the two layouts.
 // returns fp8_quantized and block_scale_factors.
 std::tuple<at::Tensor, at::Tensor> mxfp8_quantize(at::Tensor input,
-                                                  bool is_sf_swizzled_layout = true);
+                                                  bool is_sf_swizzled_layout = true,
+                                                  int64_t alignment = 32);
 
 // x_fp32: [M, K], fp32_quantized (on the host)
 // isSfSwizzledLayout: bool, if true, the scale factors are stored in swizzled layout, otherwise in
diff --git a/csrc/trtllm_allreduce_fusion.cu b/csrc/trtllm_allreduce_fusion.cu
index d54441ec3..37672dae3 100644
--- a/csrc/trtllm_allreduce_fusion.cu
+++ b/csrc/trtllm_allreduce_fusion.cu
@@ -71,8 +71,8 @@ void trtllm_allreduce_fusion(
                               : nullptr;
     params.use_oneshot = use_oneshot;
     params.layout = layout_code.has_value()
-                        ? static_cast<FP4QuantizationSFLayout>(layout_code.value())
-                        : FP4QuantizationSFLayout::SWIZZLED;
+                        ? static_cast<QuantizationSFLayout>(layout_code.value())
+                        : QuantizationSFLayout::SWIZZLED;
     params.pattern = static_cast<AllReduceFusionPattern>(pattern_code);
     params.trigger_completion_at_end = trigger_completion_at_end;
     params.stream = at::cuda::getCurrentCUDAStream();
diff --git a/csrc/trtllm_fused_moe_kernel_launcher.cu b/csrc/trtllm_fused_moe_kernel_launcher.cu
index 5a7ee9e91..d4c279b96 100644
--- a/csrc/trtllm_fused_moe_kernel_launcher.cu
+++ b/csrc/trtllm_fused_moe_kernel_launcher.cu
@@ -816,7 +816,7 @@ std::vector<at::Tensor> trtllm_fp4_block_scale_moe_launcher(
 
   std::optional<at::Tensor> gemm1_output_scale = std::nullopt;
   if (dtype_act == btg::Dtype::E2m1 || dtype_act == btg::Dtype::MxE4m3) {
-    int64_t sf_size = tensorrt_llm::computeFP4SwizzledLayoutSFSize(max_num_padded_tokens,
+    int64_t sf_size = tensorrt_llm::computeSwizzledLayoutSFSize(max_num_padded_tokens,
                                                                    intermediate_size / sf_vec_size);
     gemm1_output_scale = at::detail::empty_cuda({sf_size}, at::ScalarType::Float8_e4m3fn,
                                                 hidden_states.device(), std::nullopt);
@@ -876,7 +876,7 @@ std::vector<at::Tensor> trtllm_fp4_block_scale_moe_launcher(
 
     TORCH_CHECK(hidden_states_scale.value().dim() == 1, "hidden_states_scale must be 1D.");
     TORCH_CHECK(hidden_states_scale.value().sizes()[0] ==
-                    tensorrt_llm::computeFP4LinearLayoutSFSize(args.num_tokens,
+                    tensorrt_llm::computeLinearLayoutSFSize(args.num_tokens,
                                                                args.hidden_size / sf_vec_size),
                 "hidden_states_scale has incorrect size");
   }
diff --git a/csrc/trtllm_moe_allreduce_fusion.cu b/csrc/trtllm_moe_allreduce_fusion.cu
index a5278047c..13826ce24 100644
--- a/csrc/trtllm_moe_allreduce_fusion.cu
+++ b/csrc/trtllm_moe_allreduce_fusion.cu
@@ -62,8 +62,8 @@ void trtllm_moe_allreduce_fusion(
         params.rms_eps = static_cast<float>(rms_eps);
         params.scale_factor = static_cast<float>(scale_factor);
         params.layout = layout_code.has_value()
-                            ? static_cast<FP4QuantizationSFLayout>(layout_code.value())
-                            : FP4QuantizationSFLayout::SWIZZLED_128x4;
+                            ? static_cast<QuantizationSFLayout>(layout_code.value())
+                            : QuantizationSFLayout::SWIZZLED;
         params.stream = stream;
 
         params.moe_reduction_device_num_experts = moe_reduction_device_num_experts;
diff --git a/flashinfer/__init__.py b/flashinfer/__init__.py
index f4e974950..b96828110 100644
--- a/flashinfer/__init__.py
+++ b/flashinfer/__init__.py
@@ -52,10 +52,12 @@
     SfLayout,
     e2m1_and_ufp8sf_scale_to_float,
     fp4_quantize,
-    nvfp4_block_scale_interleave,
+    block_scale_interleave,
     nvfp4_quantize,
     shuffle_matrix_a,
     shuffle_matrix_sf_a,
+    mxfp4_quantize,
+    mxfp4_dequantize,
 )
 from .fp8_quantization import mxfp8_dequantize_host, mxfp8_quantize
 from .fused_moe import (
diff --git a/flashinfer/comm/__init__.py b/flashinfer/comm/__init__.py
index a5ab1baba..f7ae3754a 100644
--- a/flashinfer/comm/__init__.py
+++ b/flashinfer/comm/__init__.py
@@ -5,7 +5,7 @@
 from .trtllm_ar import AllReduceFusionPattern as AllReduceFusionPattern
 from .trtllm_ar import AllReduceStrategyConfig as AllReduceStrategyConfig
 from .trtllm_ar import AllReduceStrategyType as AllReduceStrategyType
-from .trtllm_ar import FP4QuantizationSFLayout as FP4QuantizationSFLayout
+from .trtllm_ar import QuantizationSFLayout as QuantizationSFLayout
 from .trtllm_ar import (
     compute_fp4_swizzled_layout_sf_size as compute_fp4_swizzled_layout_sf_size,
 )
diff --git a/flashinfer/comm/trtllm_ar.py b/flashinfer/comm/trtllm_ar.py
index 38b00415b..27850f1ef 100644
--- a/flashinfer/comm/trtllm_ar.py
+++ b/flashinfer/comm/trtllm_ar.py
@@ -79,7 +79,7 @@ class AllReduceFusionPattern:
     kARResidualRMSNormOutFP4Quant = 5
 
 
-class FP4QuantizationSFLayout:
+class QuantizationSFLayout:
     # Block scale factors are stored in swizzled layout for cutlass FP4 kernel. Scale factor
     # blocks are organized in 512-byte blocks in global memory, with each block having 128x4 FP8
     # values. The SF matrix dimensions are therefore padded - rows to the nearest multiple of 128 and
@@ -262,7 +262,7 @@ def trtllm_allreduce_fusion(
         rms_gamma: Optional[torch.Tensor],
         rms_eps: Optional[float],
         scale_factor: Optional[Union[torch.Tensor, float]],
-        layout_code: Optional[FP4QuantizationSFLayout],
+        layout_code: Optional[QuantizationSFLayout],
     ) -> None:
         module.trtllm_allreduce_fusion(
             allreduce_in,
@@ -329,7 +329,7 @@ def trtllm_moe_allreduce_fusion(
         moe_reduction_scale_input: torch.Tensor,
         moe_reduction_active_experts_token_input: torch.Tensor,
         moe_reduction_token_input: torch.Tensor,
-        layout_code: Optional[FP4QuantizationSFLayout],
+        layout_code: Optional[QuantizationSFLayout],
         moe_allreduce_out: Optional[torch.Tensor],
         residual_out: Optional[torch.Tensor],
         norm_out: Optional[torch.Tensor],
@@ -788,7 +788,7 @@ def trtllm_allreduce_fusion(
     rms_gamma: Optional[torch.Tensor],
     rms_eps: Optional[float],
     scale_factor: Optional[Union[torch.Tensor, float]],
-    layout_code: Optional[FP4QuantizationSFLayout],
+    layout_code: Optional[QuantizationSFLayout],
 ) -> None:
     """
     Parameters:
@@ -886,7 +886,7 @@ def trtllm_moe_allreduce_fusion(
     moe_reduction_scale_input: torch.Tensor,
     moe_reduction_active_experts_token_input: torch.Tensor,
     moe_reduction_token_input: torch.Tensor,
-    layout_code: Optional[FP4QuantizationSFLayout],
+    layout_code: Optional[QuantizationSFLayout],
     moe_allreduce_out: Optional[torch.Tensor],
     residual_out: Optional[torch.Tensor],
     norm_out: Optional[torch.Tensor],
diff --git a/flashinfer/fp4_quantization.py b/flashinfer/fp4_quantization.py
index 306414efb..6c5e2dcfe 100644
--- a/flashinfer/fp4_quantization.py
+++ b/flashinfer/fp4_quantization.py
@@ -146,10 +146,10 @@ def _fake_fp4_quantize_sm100(
         )
 
     @register_custom_op(
-        "flashinfer::nvfp4_block_scale_interleave_sm100",
+        "flashinfer::block_scale_interleave_sm100",
         mutates_args=("",),
     )
-    def nvfp4_block_scale_interleave_sm100(
+    def block_scale_interleave_sm100(
         unswizzled_sf: torch.Tensor,
     ) -> torch.Tensor:
         """Swizzle block scale tensor for FP4 format.
@@ -160,12 +160,12 @@ def nvfp4_block_scale_interleave_sm100(
         Returns:
             torch.Tensor: output tensor for swizzled block scale with dtype uint8.
         """
-        return module.nvfp4_block_scale_interleave(
+        return module.block_scale_interleave(
             unswizzled_sf,
         )
 
-    @register_fake_op("flashinfer::nvfp4_block_scale_interleave_sm100")
-    def _fake_nvfp4_block_scale_interleave_sm100(
+    @register_fake_op("flashinfer::block_scale_interleave_sm100")
+    def _fake_block_scale_interleave_sm100(
         unswizzled_sf: torch.Tensor,
     ) -> torch.Tensor:
         return unswizzled_sf.new_empty(
@@ -225,7 +225,7 @@ def _fake_e2m1_and_ufp8sf_scale_to_float_sm100(
     # Register the module
     return SimpleNamespace(
         fp4_quantize_sm100=fp4_quantize_sm100,
-        nvfp4_block_scale_interleave_sm100=nvfp4_block_scale_interleave_sm100,
+        block_scale_interleave_sm100=block_scale_interleave_sm100,
         e2m1_and_ufp8sf_scale_to_float_sm100=e2m1_and_ufp8sf_scale_to_float_sm100,
     )
 
@@ -287,7 +287,7 @@ def fp4_quantize(
     return x_q, sf
 
 
-def nvfp4_block_scale_interleave(unswizzled_sf: torch.Tensor) -> torch.Tensor:
+def block_scale_interleave(unswizzled_sf: torch.Tensor) -> torch.Tensor:
     """Swizzle block scale tensor for FP4 format.
 
     This function swizzles the block scale tensor to optimize memory access patterns
@@ -306,7 +306,7 @@ def nvfp4_block_scale_interleave(unswizzled_sf: torch.Tensor) -> torch.Tensor:
     assert (
         unswizzled_sf.dtype == torch.uint8
     ), f"Input dtype must be uint8, got {unswizzled_sf.dtype}"
-    return get_fp4_quantization_sm100_module().nvfp4_block_scale_interleave_sm100(
+    return get_fp4_quantization_sm100_module().block_scale_interleave_sm100(
         unswizzled_sf,
     )
 
@@ -377,7 +377,7 @@ def shuffle_matrix_sf_a(
     w_shuffled = input_tensor[row_indices.to(input_tensor.device)]
 
     # 128x4
-    return nvfp4_block_scale_interleave(w_shuffled)
+    return block_scale_interleave(w_shuffled)
 
 
 class SfLayout(Enum):
@@ -438,3 +438,20 @@ def nvfp4_quantize(
         )
 
     return a_fp4, a_sf
+
+
+def mxfp4_quantize(a):
+    a_global_sf = (448 * 6) / a.float().abs().nan_to_num().max()
+    a_fp4, a_sf = fp4_quantize(a.cuda(), a_global_sf.cuda(), 32, True, True)
+    return a_fp4, a_sf
+
+
+def mxfp4_dequantize(a_fp4, a_sf):
+    return e2m1_and_ufp8sf_scale_to_float(
+        a_fp4.cpu().view(torch.uint8),
+        a_sf.cpu().view(torch.uint8).reshape(-1),
+        torch.tensor([1.0], device=a_fp4.device),
+        32,
+        0,
+        True,
+    )
\ No newline at end of file
diff --git a/flashinfer/fp8_quantization.py b/flashinfer/fp8_quantization.py
index 8d3adc37b..7be124288 100644
--- a/flashinfer/fp8_quantization.py
+++ b/flashinfer/fp8_quantization.py
@@ -50,6 +50,7 @@ def get_mxfp8_quantization_sm100_module():
     def mxfp8_quantize_sm100(
         input: torch.Tensor,
         is_sf_swizzled_layout: bool = True,
+        alignment: int = 32,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """Quantize input tensor to MxFP8 format.
 
@@ -71,12 +72,14 @@ def mxfp8_quantize_sm100(
             return module.mxfp8_quantize(
                 input,
                 is_sf_swizzled_layout,
+                alignment,
             )
 
     @register_fake_op("flashinfer::mxfp8_quantize_sm100")
     def _fake_mxfp8_quantize_sm100(
         input: torch.Tensor,
         is_sf_swizzled_layout: bool = True,
+        alignment: int = 32,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         m, k = input.shape
         return (
@@ -127,6 +130,7 @@ def _fake_mxfp8_dequantize_host_sm100(
 def mxfp8_quantize(
     input: torch.Tensor,
     is_sf_swizzled_layout: bool = True,
+    alignment: int = 32,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """Quantize input tensor to MxFP8 format.
 
@@ -148,6 +152,7 @@ def mxfp8_quantize(
     x_q, sf = get_mxfp8_quantization_sm100_module().mxfp8_quantize_sm100(
         input,
         is_sf_swizzled_layout,
+        alignment,
     )
     sf = sf.reshape((-1, input.shape[-1] // sf_vec_size))
     return x_q, sf
diff --git a/flashinfer/fused_moe/core.py b/flashinfer/fused_moe/core.py
index 52cb4e73a..607ec71d9 100644
--- a/flashinfer/fused_moe/core.py
+++ b/flashinfer/fused_moe/core.py
@@ -30,7 +30,7 @@
     TunableRunner,
     TuningConfig,
 )
-from ..fp4_quantization import nvfp4_block_scale_interleave
+from ..fp4_quantization import block_scale_interleave
 from ..jit import JitSpec
 from ..jit import env as jit_env
 from ..jit import gen_jit_spec, setup_cubin_loader, sm100a_nvcc_flags
@@ -212,6 +212,10 @@ def gen_cutlass_fused_moe_sm100_module(use_fast_build: bool = False) -> JitSpec:
             jit_env.FLASHINFER_CSRC_DIR
             / "nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_bf16.cu",
             jit_env.FLASHINFER_CSRC_DIR
+            / "nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_fp4.cu",
+            jit_env.FLASHINFER_CSRC_DIR
+            / "nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_fp4.cu",
+            jit_env.FLASHINFER_CSRC_DIR
             / "nv_internal/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm_stub.cu",
             jit_env.FLASHINFER_CSRC_DIR
             / "fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_ops.cu",
@@ -413,6 +417,9 @@ def cutlass_fused_moe_sm100(
         output_dtype: torch.dtype,
         quant_scales: List[torch.Tensor],
         input_sf: Optional[torch.Tensor] = None,
+        swiglu_alpha: Optional[torch.Tensor] = None,
+        swiglu_beta: Optional[torch.Tensor] = None,
+        swiglu_limit: Optional[torch.Tensor] = None,
         tp_size: int = 1,
         tp_rank: int = 0,
         ep_size: int = 1,
@@ -492,6 +499,9 @@ def cutlass_fused_moe_sm100(
             fc2_expert_biases,
             quant_scales,
             input_sf,
+            swiglu_alpha,
+            swiglu_beta,
+            swiglu_limit,
             tp_size,
             tp_rank,
             ep_size,
@@ -518,6 +528,9 @@ def _fake_cutlass_fused_moe_sm100(
         output_dtype: torch.dtype,
         quant_scales: List[torch.Tensor],
         input_sf: Optional[torch.Tensor] = None,
+        swiglu_alpha: Optional[torch.Tensor] = None,
+        swiglu_beta: Optional[torch.Tensor] = None,
+        swiglu_limit: Optional[torch.Tensor] = None,
         tp_size: int = 1,
         tp_rank: int = 0,
         ep_size: int = 1,
@@ -566,6 +579,9 @@ def cutlass_fused_moe(
     fc1_expert_biases: Optional[torch.Tensor] = None,
     fc2_expert_biases: Optional[torch.Tensor] = None,
     input_sf: Optional[torch.Tensor] = None,
+    swiglu_alpha: Optional[torch.Tensor] = None,
+    swiglu_beta: Optional[torch.Tensor] = None,
+    swiglu_limit: Optional[torch.Tensor] = None,
     tp_size: int = 1,
     tp_rank: int = 0,
     ep_size: int = 1,
@@ -705,8 +721,6 @@ def cutlass_fused_moe(
         )
     if min_latency_mode:
         raise NotImplementedError("min latency mode not yet implemented for Blackwell.")
-    if use_mxfp8_act_scaling:
-        raise NotImplementedError("mxfp8 not yet implemented for Blackwell.")
 
     num_rows = input.shape[0]
     if min_latency_mode:
@@ -733,6 +747,9 @@ def cutlass_fused_moe(
         output_dtype,
         quant_scales,
         input_sf,
+        swiglu_alpha,
+        swiglu_beta,
+        swiglu_limit,
         tp_size,
         tp_rank,
         ep_size,
diff --git a/flashinfer/fused_moe/utils.py b/flashinfer/fused_moe/utils.py
index dff7426de..832d2e8e0 100644
--- a/flashinfer/fused_moe/utils.py
+++ b/flashinfer/fused_moe/utils.py
@@ -118,7 +118,7 @@ def swizzle_sf(sf: torch.Tensor, rows: int, cols: int, scaling_vector_size: int
     """
     sf_cols = ceil_div(cols, scaling_vector_size)
     sf = sf.view(-1, rows, sf_cols)
-    return torch.ops.trtllm.nvfp4_block_scale_interleave(sf)
+    return torch.ops.trtllm.block_scale_interleave(sf)
 
 
 def unswizzle_sf(sf: torch.Tensor, rows: int, cols: int, scaling_vector_size: int = 16):
@@ -133,7 +133,7 @@ def unswizzle_sf(sf: torch.Tensor, rows: int, cols: int, scaling_vector_size: in
     """
     sf_cols = ceil_div(cols, scaling_vector_size)
     sf = sf.view(-1, rows, sf_cols)
-    return torch.ops.trtllm.nvfp4_block_scale_interleave_reverse(sf).view(-1, sf_cols)
+    return torch.ops.trtllm.block_scale_interleave_reverse(sf).view(-1, sf_cols)
 
 
 @torch.library.custom_op("trtllm::reswizzle_sf", mutates_args=())
diff --git a/include/flashinfer/comm/trtllm_allreduce_fusion.cuh b/include/flashinfer/comm/trtllm_allreduce_fusion.cuh
index dc48372cc..aca4246cb 100644
--- a/include/flashinfer/comm/trtllm_allreduce_fusion.cuh
+++ b/include/flashinfer/comm/trtllm_allreduce_fusion.cuh
@@ -16,7 +16,7 @@ namespace flashinfer {
 
 namespace trtllm_allreduce_fusion {
 
-enum class FP4QuantizationSFLayout {
+enum class QuantizationSFLayout {
   // Block scale factors are stored in swizzled layout for cutlass FP4 kernel. Scale factor
   // blocks are organized in 512-byte blocks in global memory, with each block having 128x4 FP8
   // values. The SF matrix dimensions are therefore padded - rows to the nearest multiple of 128 and
@@ -488,7 +488,7 @@ template <class SFType, int CVT_FP4_NUM_THREADS_PER_SF>
 __device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(std::optional<int> batchIdx, int rowIdx,
                                                        int colIdx, std::optional<int> numRows,
                                                        int numCols, SFType* SFout,
-                                                       FP4QuantizationSFLayout layout) {
+                                                       QuantizationSFLayout layout) {
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
   static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 || CVT_FP4_NUM_THREADS_PER_SF == 2);
 
@@ -496,7 +496,7 @@ __device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(std::optional<int> batchI
   // TODO: stage through smem for packed STG.32
   // is it better than STG.8 from 4 threads ?
   if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) {
-    if (layout == FP4QuantizationSFLayout::SWIZZLED) {
+    if (layout == QuantizationSFLayout::SWIZZLED) {
       // SF vector index (16 elements share one SF in the K dimension).
       // numRows and numCols are unpadded.
       int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
@@ -504,7 +504,7 @@ __device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(std::optional<int> batchI
 
       auto SFOffset = get_sf_out_offset_128x4(batchIdx, mIdx, kIdx, numRows, numCols);
       return reinterpret_cast<uint8_t*>(SFout) + SFOffset;
-    } else if (layout == FP4QuantizationSFLayout::LINEAR) {
+    } else if (layout == QuantizationSFLayout::LINEAR) {
       // Linear row-major layout, no padding required.
       int32_t KTileIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
 
@@ -762,7 +762,7 @@ struct AllReduceFusionParams {
   float rms_eps;
   float* scale_factor;
   bool use_oneshot;
-  FP4QuantizationSFLayout layout = FP4QuantizationSFLayout::SWIZZLED;
+  QuantizationSFLayout layout = QuantizationSFLayout::SWIZZLED;
   cudaStream_t stream;
   AllReduceFusionPattern pattern;
   bool trigger_completion_at_end = true;
diff --git a/include/flashinfer/comm/trtllm_moe_allreduce_fusion.cuh b/include/flashinfer/comm/trtllm_moe_allreduce_fusion.cuh
index 6037c1979..9e86e9b6d 100644
--- a/include/flashinfer/comm/trtllm_moe_allreduce_fusion.cuh
+++ b/include/flashinfer/comm/trtllm_moe_allreduce_fusion.cuh
@@ -374,7 +374,7 @@ inline __device__ float reciprocal_approximate_ftz(float a) {
 }
 }  // namespace maths
 
-enum class FP4QuantizationSFLayout {
+enum class QuantizationSFLayout {
   // Block scale factors are stored in swizzled layout for cutlass FP4 kernel. Scale factor
   // blocks are organized in 512-byte blocks in global memory, with each block having 128x4 FP8
   // values. The SF matrix dimensions are therefore padded - rows to the nearest multiple of 128 and
@@ -475,7 +475,7 @@ template <class SFType, int CVT_FP4_NUM_THREADS_PER_SF>
 __device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(std::optional<int> batchIdx, int rowIdx,
                                                        int colIdx, std::optional<int> numRows,
                                                        int numCols, SFType* SFout,
-                                                       FP4QuantizationSFLayout layout) {
+                                                       QuantizationSFLayout layout) {
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
   static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 || CVT_FP4_NUM_THREADS_PER_SF == 2);
 
@@ -483,7 +483,7 @@ __device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(std::optional<int> batchI
   // TODO: stage through smem for packed STG.32
   // is it better than STG.8 from 4 threads ?
   if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) {
-    if (layout == FP4QuantizationSFLayout::SWIZZLED) {
+    if (layout == QuantizationSFLayout::SWIZZLED) {
       // SF vector index (16 elements share one SF in the K dimension).
       // numRows and numCols are unpadded.
       int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
@@ -491,7 +491,7 @@ __device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(std::optional<int> batchI
 
       auto SFOffset = get_sf_out_offset_128x4(batchIdx, mIdx, kIdx, numRows, numCols);
       return reinterpret_cast<uint8_t*>(SFout) + SFOffset;
-    } else if (layout == FP4QuantizationSFLayout::LINEAR) {
+    } else if (layout == QuantizationSFLayout::LINEAR) {
       // Linear row-major layout, no padding required.
       int32_t KTileIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF;
 
@@ -679,7 +679,7 @@ struct AllReduceFusionParams {
   float rms_eps;
   // todo(review): why float* scale_factor in trt-llm?
   float scale_factor;
-  FP4QuantizationSFLayout layout = FP4QuantizationSFLayout::SWIZZLED;
+  QuantizationSFLayout layout = QuantizationSFLayout::SWIZZLED;
   cudaStream_t stream;
 
   // moe-allreduce output (non-fused)
diff --git a/tests/test_fp4_quantize.py b/tests/test_fp4_quantize.py
index d52b9818b..4ad9a33e6 100644
--- a/tests/test_fp4_quantize.py
+++ b/tests/test_fp4_quantize.py
@@ -7,7 +7,7 @@
 from flashinfer import (
     e2m1_and_ufp8sf_scale_to_float,
     fp4_quantize,
-    nvfp4_block_scale_interleave,
+    block_scale_interleave,
 )
 from flashinfer.utils import is_sm100a_supported
 
@@ -158,12 +158,12 @@ def test_scale_swizzling(
 @pytest.mark.parametrize("seed", SEEDS)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
-def test_nvfp4_block_scale_interleave(
+def test_block_scale_interleave(
     shape: tuple[int, int],
     seed: int,
     device: str,
 ) -> None:
-    """Test the nvfp4_block_scale_interleave function directly."""
+    """Test the block_scale_interleave function directly."""
     if not is_sm100a_supported(torch.device("cuda")):
         pytest.skip("Nvfp4 Requires compute capability of 10 or above")
     torch.set_default_device(device)
@@ -178,7 +178,7 @@ def test_nvfp4_block_scale_interleave(
     unswizzled_sf = torch.randint(0, 256, scale_shape, dtype=torch.uint8, device=device)
 
     # Test the swizzling function
-    swizzled_sf = nvfp4_block_scale_interleave(unswizzled_sf)
+    swizzled_sf = block_scale_interleave(unswizzled_sf)
 
     # Compare against the reference implementation
     ref_swizzled_sf = swizzle_sf(unswizzled_sf, m, n, sf_vec_size)
diff --git a/tests/test_trtllm_allreduce_fusion.py b/tests/test_trtllm_allreduce_fusion.py
index 3f2d08657..52b2e299b 100644
--- a/tests/test_trtllm_allreduce_fusion.py
+++ b/tests/test_trtllm_allreduce_fusion.py
@@ -45,8 +45,8 @@ def _run_correctness_worker(world_size, rank, dtype, hidden_dim, distributed_ini
             comm.AllReduceFusionPattern.kARResidualRMSNormOutFP4Quant,
         ]
         swizzled_layout_codes = [
-            comm.FP4QuantizationSFLayout.LINEAR,
-            comm.FP4QuantizationSFLayout.SWIZZLED,
+            comm.QuantizationSFLayout.LINEAR,
+            comm.QuantizationSFLayout.SWIZZLED,
         ]
         launch_with_pdls = [True, False]
         use_oneshots = [True, False, None]
@@ -122,7 +122,7 @@ def _run_correctness_worker(world_size, rank, dtype, hidden_dim, distributed_ini
                                     ), "hidden_dim must be divisible by SF_VEC_SIZE"
                                     if (
                                         swizzled_layout_code
-                                        == comm.FP4QuantizationSFLayout.SWIZZLED
+                                        == comm.QuantizationSFLayout.SWIZZLED
                                     ):
                                         # TODO(Yingyi): check this
                                         padded_message_size = (
diff --git a/tests/test_trtllm_cutlass_fused_moe.py b/tests/test_trtllm_cutlass_fused_moe.py
index 0577f2b4f..ea6f1ef80 100644
--- a/tests/test_trtllm_cutlass_fused_moe.py
+++ b/tests/test_trtllm_cutlass_fused_moe.py
@@ -18,12 +18,11 @@
 import torch
 from torch.nn import functional as F
 
-import flashinfer
 import flashinfer.fused_moe as fused_moe
-from flashinfer import fp4_quantize
+from flashinfer import fp4_quantize, mxfp4_quantize, mxfp8_quantize, mxfp8_dequantize_host, e2m1_and_ufp8sf_scale_to_float, mxfp4_dequantize
 
 FLOAT4_E2M1_MAX = 6.0
-FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
+FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max  
 FP8_DTYPE = torch.float8_e4m3fn
 
 
@@ -481,7 +480,6 @@ def test_moe_nvfp4(
     )
     torch.testing.assert_close(ref_output, flash_output, rtol=2e-1, atol=2e-1)
 
-
 @pytest.mark.parametrize("batch_size", BATCH_SIZES)
 @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
 @pytest.mark.parametrize("num_experts", EP_NUM_EXPERTS)
@@ -1027,5 +1025,124 @@ def test_moe_fp8_block_scaling(
         )
 
 
+def quant_mxfp4_batches(a, num_experts):
+    quant_a = []
+    sfs = []
+    for i in range(num_experts):
+        a_fp4, a_sf = mxfp4_quantize(a[i].cuda())
+        quant_a.append(a_fp4)
+        sfs.append(a_sf)
+
+    result_quant_a = torch.stack(quant_a)
+    result_sfs = torch.stack(sfs)
+
+    return result_quant_a, result_sfs
+
+
+def dequant_mxfp4_batches(
+    mat_fp4: torch.Tensor,
+    scale_tensor: torch.Tensor,
+):
+    num_batches = mat_fp4.size(0)
+
+    scale_tensor = scale_tensor.view(num_batches, -1)
+
+    return torch.stack([
+        mxfp4_dequantize(mat_fp4[b, :, :], scale_tensor[b, :])
+        for b in range(num_batches)
+    ])
+
+
+@pytest.mark.parametrize("batch_size", BATCH_SIZES)
+@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
+@pytest.mark.parametrize("num_experts", NUM_EXPERTS)
+@pytest.mark.parametrize("top_k", TOP_K_VALUES)
+@pytest.mark.parametrize("intermediate_size", INTERMEDIATE_SIZES)
+@pytest.mark.parametrize("otype", [torch.float16, torch.bfloat16])
+def test_moe_mxfp8_mxfp4(
+    batch_size,
+    hidden_size,
+    num_experts,
+    top_k,
+    intermediate_size,
+    otype,
+):
+    """
+    Test MoE with MXFP8 activations and MXFP4 weights.
+    Uses mxfp8_quantize for activations and mxfp4_quantize (via quant_mxfp4_batches) for weights.
+    """
+    # Skip invalid configurations
+    if top_k > num_experts:
+        pytest.skip(
+            f"top_k ({top_k}) cannot be greater than num_experts ({num_experts})"
+        )
+
+    torch.manual_seed(42)
+    e = num_experts
+    m = batch_size
+    n = intermediate_size
+    k = hidden_size
+
+    x = torch.randn(m, k, dtype=otype).cuda()
+    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=otype) / 10
+    w2 = torch.randn((e, k, n), device="cuda", dtype=otype) / 10
+
+    mxfp8_x, mxfp8_x_sf = mxfp8_quantize(x, True, 32)
+
+    mxfp4_w1, mxfp4_w1_scale = quant_mxfp4_batches(w1, e)
+    mxfp4_w2, mxfp4_w2_scale = quant_mxfp4_batches(w2, e)
+
+    router_logits = torch.randn(m, e, dtype=otype).cuda() 
+    routing_weights, selected_experts = compute_routing(router_logits, top_k)
+    
+    fake_input_scale = torch.ones(e, device=x.device)
+
+    quant_scales = [
+        mxfp4_w1_scale.view(torch.int32),
+        fake_input_scale,
+        mxfp4_w2_scale.view(torch.int32),
+        fake_input_scale,
+    ]
+    
+    flash_output = torch.zeros_like(x)
+    
+    # Call cutlass_fused_moe with MXFP8 activations and MXFP4 weights
+    _ = fused_moe.cutlass_fused_moe(
+        mxfp8_x,
+        selected_experts.to(torch.int),
+        routing_weights,
+        mxfp4_w1.contiguous().view(torch.long),
+        mxfp4_w2.contiguous().view(torch.long),
+        otype,
+        quant_scales=quant_scales,
+        input_sf=mxfp8_x_sf,
+        use_mxfp8_act_scaling=True,
+        output=flash_output,
+    )
+    
+    dq_mxfp8_x = mxfp8_dequantize_host(
+        mxfp8_x.cpu().view(torch.uint8), 
+        mxfp8_x_sf.cpu().view(torch.uint8).reshape(-1), 
+        True
+    ).cuda().to(otype)
+
+    dq_mfxp4_w1 = dequant_mxfp4_batches(
+        mxfp4_w1.cpu().view(torch.uint8),
+        mxfp4_w1_scale.cpu().view(torch.uint8).reshape(-1),
+    ).cuda().to(otype)
+
+    dq_mfxp4_w2 = dequant_mxfp4_batches(
+        mxfp4_w2.cpu().view(torch.uint8),
+        mxfp4_w2_scale.cpu().view(torch.uint8).reshape(-1),
+    ).cuda().to(otype)
+
+    # Use original weights for reference computation
+    ref_output = compute_with_experts(
+        e, dq_mxfp8_x, dq_mfxp4_w1, dq_mfxp4_w2, selected_experts, routing_weights
+    )
+
+    torch.testing.assert_close(ref_output, flash_output, rtol=1e-1, atol=1e-1)
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])
diff --git a/tests/test_trtllm_moe_allreduce_fusion.py b/tests/test_trtllm_moe_allreduce_fusion.py
index eedeb80f5..e79db5830 100644
--- a/tests/test_trtllm_moe_allreduce_fusion.py
+++ b/tests/test_trtllm_moe_allreduce_fusion.py
@@ -46,8 +46,8 @@ def _run_correctness_worker(world_size, rank, dtype, distributed_init_port):
         candidate_active_expert_num = [8, 12, 16]
         # candidate_active_expert_num = [1]  # debug-only
         swizzled_layout_codes = [
-            comm.FP4QuantizationSFLayout.LINEAR,
-            comm.FP4QuantizationSFLayout.SWIZZLED,
+            comm.QuantizationSFLayout.LINEAR,
+            comm.QuantizationSFLayout.SWIZZLED,
         ]
         launch_with_pdls = [True, False]
 
@@ -94,7 +94,7 @@ def _run_correctness_worker(world_size, rank, dtype, distributed_init_port):
                             ), "HIDDEN_SIZE must be divisible by SF_VEC_SIZE"
                             if (
                                 swizzled_layout_code
-                                == comm.FP4QuantizationSFLayout.SWIZZLED
+                                == comm.QuantizationSFLayout.SWIZZLED
                             ):
                                 padded_message_size = (
                                     comm.compute_fp4_swizzled_layout_sf_size(

From 6d2ccc2d24cf8a91d5a9e04a0a7fb299723e5057 Mon Sep 17 00:00:00 2001
From: Duncan Moss <djm.moss@gmail.com>
Date: Tue, 5 Aug 2025 21:19:49 -0700
Subject: [PATCH 02/12] add swiglu test

---
 tests/test_trtllm_cutlass_fused_moe.py | 33 ++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/tests/test_trtllm_cutlass_fused_moe.py b/tests/test_trtllm_cutlass_fused_moe.py
index ea6f1ef80..c0b082b94 100644
--- a/tests/test_trtllm_cutlass_fused_moe.py
+++ b/tests/test_trtllm_cutlass_fused_moe.py
@@ -160,9 +160,8 @@ def torch_moe_nvfp4(a, w1, w2, topk, topk_weight, topk_ids):
         out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype)
     ).sum(dim=1)
 
-
 def compute_with_experts(
-    num_experts, x, w31_weight, w2_weight, selected_experts, routing_weights
+    num_experts, x, w31_weight, w2_weight, selected_experts, routing_weights, alpha=None, beta=None, limit=None
 ):
     results = torch.zeros_like(x)
     for expert_id in range(num_experts):
@@ -177,7 +176,17 @@ def compute_with_experts(
         w3_expert, w1_expert = torch.chunk(w31_expert, 2, dim=0)
 
         expert_inputs = x[batch_idx]
-        inter = F.silu(expert_inputs @ w1_expert.t()) * (expert_inputs @ w3_expert.t())
+        if alpha is not None and limit is not None and beta is not None:
+            # SwiGLUBias
+            x1 = expert_inputs @ w1_expert.t()
+            x1 = x1.clamp_(min=None, max=limit)
+            x1_scaled = x1 * torch.sigmoid(alpha * x1)
+            x2 = expert_inputs @ w3_expert.t()
+            x2 = x2.clamp_(min=-limit, max=limit) + beta
+            
+            inter = x1_scaled * x2
+        else:
+            inter = F.silu(expert_inputs @ w1_expert.t()) * (expert_inputs @ w3_expert.t())
         output = inter @ w2_expert.t()
         results[batch_idx] += routing_weights[batch_idx, nth_expert, None] * output
     return results.view_as(x)
@@ -1059,6 +1068,7 @@ def dequant_mxfp4_batches(
 @pytest.mark.parametrize("top_k", TOP_K_VALUES)
 @pytest.mark.parametrize("intermediate_size", INTERMEDIATE_SIZES)
 @pytest.mark.parametrize("otype", [torch.float16, torch.bfloat16])
+@pytest.mark.parametrize(("alpha", "beta", "limit"), [(None, None, None), (0.5, 0.0, 7.0), (1.702, 1.0, 7.0)])
 def test_moe_mxfp8_mxfp4(
     batch_size,
     hidden_size,
@@ -1066,6 +1076,9 @@ def test_moe_mxfp8_mxfp4(
     top_k,
     intermediate_size,
     otype,
+    alpha,
+    beta,
+    limit,
 ):
     """
     Test MoE with MXFP8 activations and MXFP4 weights.
@@ -1106,6 +1119,15 @@ def test_moe_mxfp8_mxfp4(
     
     flash_output = torch.zeros_like(x)
     
+    if alpha is not None and limit is not None and beta is not None:
+        alpha_t = torch.ones(e, device=x.device) * alpha
+        limit_t = torch.ones(e, device=x.device) * limit
+        beta_t = torch.ones(e, device=x.device) * beta
+    else:
+        alpha_t = None
+        limit_t = None
+        beta_t = None
+
     # Call cutlass_fused_moe with MXFP8 activations and MXFP4 weights
     _ = fused_moe.cutlass_fused_moe(
         mxfp8_x,
@@ -1114,6 +1136,9 @@ def test_moe_mxfp8_mxfp4(
         mxfp4_w1.contiguous().view(torch.long),
         mxfp4_w2.contiguous().view(torch.long),
         otype,
+        swiglu_alpha=alpha_t,
+        swiglu_limit=limit_t,
+        swiglu_beta=beta_t,
         quant_scales=quant_scales,
         input_sf=mxfp8_x_sf,
         use_mxfp8_act_scaling=True,
@@ -1138,7 +1163,7 @@ def test_moe_mxfp8_mxfp4(
 
     # Use original weights for reference computation
     ref_output = compute_with_experts(
-        e, dq_mxfp8_x, dq_mfxp4_w1, dq_mfxp4_w2, selected_experts, routing_weights
+        e, dq_mxfp8_x, dq_mfxp4_w1, dq_mfxp4_w2, selected_experts, routing_weights, alpha, beta, limit
     )
 
     torch.testing.assert_close(ref_output, flash_output, rtol=1e-1, atol=1e-1)

From 542ca4b5049423caa013ec69225bbe6b5be0f8c1 Mon Sep 17 00:00:00 2001
From: Duncan Moss <djm.moss@gmail.com>
Date: Tue, 5 Aug 2025 21:26:07 -0700
Subject: [PATCH 03/12] pre-commit checks

---
 .../cutlass_fused_moe_kernels.cuh             | 385 +++++++++---------
 .../flashinfer_cutlass_fused_moe_sm100_ops.cu | 184 +++++----
 csrc/nv_internal/cpp/kernels/quantization.cu  | 178 ++++----
 .../kernels/cutlass_kernels/include/common.h  |  11 +-
 .../include/moe_gemm_kernels.h                |  19 +-
 .../cutlass_kernels/include/moe_kernels.h     | 123 +++---
 .../moe_gemm_tma_ws_mixed_input_launcher.inl  |   8 +-
 .../moe_gemm/moe_gemm_kernels_bf16_fp4.cu     |   5 +-
 .../moe_gemm/moe_gemm_kernels_fp16_fp4.cu     |   3 +-
 .../moe_gemm/moe_gemm_template_dispatch.h     | 109 +++--
 ...emm_template_dispatch_tma_ws_mixed_dtype.h |  15 +-
 .../moe_tma_warp_specialized_traits.h         |   6 +-
 .../tensorrt_llm/kernels/quantization.cuh     | 359 ++++++++--------
 .../tensorrt_llm/kernels/quantization.h       |  18 +-
 csrc/nv_internal/tensorrt_llm/thop/fp4Op.cpp  |  28 +-
 .../tensorrt_llm/thop/fp4Quantize.cpp         |  40 +-
 .../tensorrt_llm/thop/fp4Quantize.h           |   6 +-
 .../tensorrt_llm/thop/fp8Quantize.cpp         |  55 +--
 csrc/trtllm_allreduce_fusion.cu               |   5 +-
 csrc/trtllm_fused_moe_kernel_launcher.cu      |   4 +-
 flashinfer/__init__.py                        |   6 +-
 flashinfer/fp4_quantization.py                |   2 +-
 tests/test_fp4_quantize.py                    |   2 +-
 tests/test_trtllm_cutlass_fused_moe.py        | 105 +++--
 24 files changed, 858 insertions(+), 818 deletions(-)

diff --git a/csrc/fused_moe/cutlass_backend/cutlass_fused_moe_kernels.cuh b/csrc/fused_moe/cutlass_backend/cutlass_fused_moe_kernels.cuh
index f9151bff4..35837b76a 100644
--- a/csrc/fused_moe/cutlass_backend/cutlass_fused_moe_kernels.cuh
+++ b/csrc/fused_moe/cutlass_backend/cutlass_fused_moe_kernels.cuh
@@ -962,7 +962,7 @@ __device__ auto quantizePackedFPXValue(
     TmaWarpSpecializedGroupedGemmInput::ElementSF* act_sf_flat,
     TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType scaling_type) {
   constexpr bool is_fp8 = std::is_same_v<QuantizedType, __nv_fp8_e4m3>;
-      static constexpr int NumThreadsPerSF = VecSize / CVT_ELTS_PER_THREAD;
+  static constexpr int NumThreadsPerSF = VecSize / CVT_ELTS_PER_THREAD;
   // Quantize the input to FP4
   static_assert(std::is_same_v<GemmOutputType, __nv_bfloat16> ||
                 std::is_same_v<GemmOutputType, half>);
@@ -979,9 +979,11 @@ __device__ auto quantizePackedFPXValue(
 
   // Use `token - num_tokens_before_expert` because we want this to be relative to the start of this
   // expert
-  auto sf_out = cvt_quant_get_sf_out_offset<TmaWarpSpecializedGroupedGemmInput::ElementSF, NumThreadsPerSF>(
-      std::nullopt /* batchIdx */, token_id - num_tokens_before_expert, elem_idx,
-      std::nullopt /* numRows */, num_cols / VecSize, act_sf_expert, QuantizationSFLayout::SWIZZLED);
+  auto sf_out =
+      cvt_quant_get_sf_out_offset<TmaWarpSpecializedGroupedGemmInput::ElementSF, NumThreadsPerSF>(
+          std::nullopt /* batchIdx */, token_id - num_tokens_before_expert, elem_idx,
+          std::nullopt /* numRows */, num_cols / VecSize, act_sf_expert,
+          QuantizationSFLayout::SWIZZLED);
 
   // Do the conversion and set the output and scaling factor
   auto func = [&]() {
@@ -1018,16 +1020,18 @@ __device__ void writeSF(int64_t num_tokens_before_expert, int64_t expert_id,
 
   // Use `token - num_tokens_before_expert` because we want this to be relative to the start of this
   // expert
-  auto sf_out = cvt_quant_get_sf_out_offset<TmaWarpSpecializedGroupedGemmInput::ElementSF, NumThreadsPerSF>(
-      std::nullopt /* batchIdx */, token_id - num_tokens_before_expert, elem_idx,
-      std::nullopt /* numRows */, num_cols / VecSize, act_sf_expert, QuantizationSFLayout::SWIZZLED);
+  auto sf_out =
+      cvt_quant_get_sf_out_offset<TmaWarpSpecializedGroupedGemmInput::ElementSF, NumThreadsPerSF>(
+          std::nullopt /* batchIdx */, token_id - num_tokens_before_expert, elem_idx,
+          std::nullopt /* numRows */, num_cols / VecSize, act_sf_expert,
+          QuantizationSFLayout::SWIZZLED);
   if (sf_out) {
     if (input_sf) {
-      auto const sf_in =
-          cvt_quant_get_sf_out_offset<TmaWarpSpecializedGroupedGemmInput::ElementSF, NumThreadsPerSF>(
-              std::nullopt /* batchIdx */, source_token_id, elem_idx, std::nullopt /* numRows */,
-              num_cols / VecSize, const_cast<TmaWarpSpecializedGroupedGemmInput::ElementSF*>(input_sf),
-              QuantizationSFLayout::SWIZZLED);
+      auto const sf_in = cvt_quant_get_sf_out_offset<TmaWarpSpecializedGroupedGemmInput::ElementSF,
+                                                     NumThreadsPerSF>(
+          std::nullopt /* batchIdx */, source_token_id, elem_idx, std::nullopt /* numRows */,
+          num_cols / VecSize, const_cast<TmaWarpSpecializedGroupedGemmInput::ElementSF*>(input_sf),
+          QuantizationSFLayout::SWIZZLED);
       *sf_out = *sf_in;
     } else {
       *sf_out = 0x00;
@@ -1123,12 +1127,13 @@ __device__ void computeTmaWarpSpecializedInputStrides(
   if (layout_info.int4_groupwise_params.enabled) {
     layout_info.int4_groupwise_params.stride_s_a[out_idx] = cutlass::make_cute_packed_stride(
         TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::StrideSFA{},
-        cute::make_shape(gemm_n,
-          gemm_k
-              / (layout_info.int4_groupwise_params.use_wfp4a16
-                      ? TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::wfp4a16_group_size
-                      : TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::int4_group_size),
-          1));
+        cute::make_shape(
+            gemm_n,
+            gemm_k /
+                (layout_info.int4_groupwise_params.use_wfp4a16
+                     ? TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::wfp4a16_group_size
+                     : TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::int4_group_size),
+            1));
   }
 }
 
@@ -1151,14 +1156,15 @@ __device__ void computeTmaWarpSpecializedInputPointers(
         safe_inc_ptr(output, num_tokens_before_expert * gemm_n);
   }
   if (layout_info.int4_groupwise_params.enabled) {
-        // The group size of wfp4a16 is multiplied by 2 because each scale uses 1 byte instead of 2 bytes
-        layout_info.int4_groupwise_params.ptr_s_a[out_idx] = safe_inc_ptr(w4a8_weight_scale,
-          expert
-              * (gemm_n * gemm_k
-                  / (layout_info.int4_groupwise_params.use_wfp4a16
-                          ? TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::wfp4a16_group_size * 2
-                          : TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::int4_group_size)));
-
+    // The group size of wfp4a16 is multiplied by 2 because each scale uses 1 byte instead of 2
+    // bytes
+    layout_info.int4_groupwise_params.ptr_s_a[out_idx] = safe_inc_ptr(
+        w4a8_weight_scale,
+        expert *
+            (gemm_n * gemm_k /
+             (layout_info.int4_groupwise_params.use_wfp4a16
+                  ? TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::wfp4a16_group_size * 2
+                  : TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::int4_group_size)));
   }
 }
 
@@ -1986,62 +1992,57 @@ void finalizeMoeRoutingKernelLauncher(
 
 // ============================== Activation Adaptors =================================
 template <template <class> class ActFn>
-struct IdentityAdaptor
-{
-    constexpr static bool IS_GLU = false;
-    float alpha = 1.0f;
-    float beta = 0.0f;
-    float limit = std::numeric_limits<float>::infinity();
-
-    template <class T>
-    __device__ T operator()(T const& x) const
-    {
-        ActFn<T> fn{};
-        return fn(x);
-    }
+struct IdentityAdaptor {
+  constexpr static bool IS_GLU = false;
+  float alpha = 1.0f;
+  float beta = 0.0f;
+  float limit = std::numeric_limits<float>::infinity();
+
+  template <class T>
+  __device__ T operator()(T const& x) const {
+    ActFn<T> fn{};
+    return fn(x);
+  }
 };
 
 template <template <class> class ActFn>
-struct GLUAdaptor
-{
-    constexpr static bool IS_GLU = true;
-    float alpha = 1.0f;
-    float beta = 0.0f;
-    float limit = std::numeric_limits<float>::infinity();
-
-    template <class T>
-    __device__ T operator()(T const& gate, T const& linear) const
-    {
-        ActFn<T> fn{};
-        return fn(gate) * linear;
-    }
+struct GLUAdaptor {
+  constexpr static bool IS_GLU = true;
+  float alpha = 1.0f;
+  float beta = 0.0f;
+  float limit = std::numeric_limits<float>::infinity();
+
+  template <class T>
+  __device__ T operator()(T const& gate, T const& linear) const {
+    ActFn<T> fn{};
+    return fn(gate) * linear;
+  }
 };
 
-struct SwigluBiasAdaptor
-{
-    constexpr static bool IS_GLU = true;
-    float alpha = 1.0f;
-    float beta = 0.0f;
-    float limit = std::numeric_limits<float>::infinity();
-
-    template <class T>
-    __device__ T operator()(T const& gate, T const& linear) const
-    {
-        cutlass::epilogue::thread::Sigmoid<T> fn{};
-        T linear_clamped = cutlass::maximum<T>{}(cutlass::minimum<T>{}(linear, limit), -limit);
-        T gate_clamped = cutlass::minimum<T>{}(gate, limit);
-        return gate_clamped * fn(gate_clamped * alpha) * (linear_clamped + beta);
-    }
+struct SwigluBiasAdaptor {
+  constexpr static bool IS_GLU = true;
+  float alpha = 1.0f;
+  float beta = 0.0f;
+  float limit = std::numeric_limits<float>::infinity();
+
+  template <class T>
+  __device__ T operator()(T const& gate, T const& linear) const {
+    cutlass::epilogue::thread::Sigmoid<T> fn{};
+    T linear_clamped = cutlass::maximum<T>{}(cutlass::minimum<T>{}(linear, limit), -limit);
+    T gate_clamped = cutlass::minimum<T>{}(gate, limit);
+    return gate_clamped * fn(gate_clamped * alpha) * (linear_clamped + beta);
+  }
 };
 
-
 // ============================== Gated Activation =================================
 constexpr static int ACTIVATION_THREADS_PER_BLOCK = 256;
 
 template <class ActivationOutputType, class GemmOutputType, class ActFn>
 __global__ void doGatedActivationKernel(ActivationOutputType* output,
                                         GemmOutputType const* gemm_result,
-                                        int64_t const* expert_first_token_offset, int64_t inter_size, int64_t num_experts_per_node, ActivationParams activation_type) {
+                                        int64_t const* expert_first_token_offset,
+                                        int64_t inter_size, int64_t num_experts_per_node,
+                                        ActivationParams activation_type) {
   int64_t const tid = threadIdx.x;
   int64_t const token = blockIdx.x;
   if (token >= expert_first_token_offset[num_experts_per_node]) {
@@ -2067,49 +2068,50 @@ __global__ void doGatedActivationKernel(ActivationOutputType* output,
   float gate_alpha = 1.0f;
   float gate_bias = 0.0f;
   float gate_limit = std::numeric_limits<float>::infinity();
-  if (activation_type.swiglu_alpha || activation_type.swiglu_beta || activation_type.swiglu_limit)
-  {
-      int expert
-          = findTotalEltsLessThanTarget(expert_first_token_offset, num_experts_per_node, (int64_t) token + 1) - 1;
-      gate_alpha = activation_type.swiglu_alpha ? activation_type.swiglu_alpha[expert] : 1.0f;
-      gate_bias = activation_type.swiglu_beta ? activation_type.swiglu_beta[expert] : 0.0f;
-      gate_limit = activation_type.swiglu_limit ? activation_type.swiglu_limit[expert]
-                                                : std::numeric_limits<float>::infinity();
+  if (activation_type.swiglu_alpha || activation_type.swiglu_beta || activation_type.swiglu_limit) {
+    int expert = findTotalEltsLessThanTarget(expert_first_token_offset, num_experts_per_node,
+                                             (int64_t)token + 1) -
+                 1;
+    gate_alpha = activation_type.swiglu_alpha ? activation_type.swiglu_alpha[expert] : 1.0f;
+    gate_bias = activation_type.swiglu_beta ? activation_type.swiglu_beta[expert] : 0.0f;
+    gate_limit = activation_type.swiglu_limit ? activation_type.swiglu_limit[expert]
+                                              : std::numeric_limits<float>::infinity();
   }
 
-
   ActFn fn{};
   fn.alpha = gate_alpha;
   fn.beta = gate_bias;
   fn.limit = gate_limit;
-  for (int64_t elem_index = start_offset; elem_index < num_elems_in_col; elem_index += stride)
-  {
-      auto linear_value = arrayConvert<GemmResultElem, ComputeElem>(gemm_result_vec[elem_index]);
-      // BF16 isn't supported, use FP32 for activation function
-      auto gate_value = arrayConvert<GemmResultElem, ComputeElem>(gemm_result_vec[elem_index + inter_size_vec]);
-      auto gate_act = fn(gate_value, linear_value);
-      output_vec[elem_index] = arrayConvert<ComputeElem, OutputElem>(gate_act);
+  for (int64_t elem_index = start_offset; elem_index < num_elems_in_col; elem_index += stride) {
+    auto linear_value = arrayConvert<GemmResultElem, ComputeElem>(gemm_result_vec[elem_index]);
+    // BF16 isn't supported, use FP32 for activation function
+    auto gate_value =
+        arrayConvert<GemmResultElem, ComputeElem>(gemm_result_vec[elem_index + inter_size_vec]);
+    auto gate_act = fn(gate_value, linear_value);
+    output_vec[elem_index] = arrayConvert<ComputeElem, OutputElem>(gate_act);
   }
 }
 
 template <typename ActivationOutputType, typename GemmOutputType>
 void doGatedActivation(ActivationOutputType* output, GemmOutputType const* gemm_result,
-                       int64_t const* expert_first_token_offset, int64_t inter_size, int64_t num_tokens, int64_t num_experts_per_node,
+                       int64_t const* expert_first_token_offset, int64_t inter_size,
+                       int64_t num_tokens, int64_t num_experts_per_node,
                        ActivationParams activation_type, cudaStream_t stream) {
   int64_t const blocks = num_tokens;
   int64_t const threads = ACTIVATION_THREADS_PER_BLOCK;
 
   auto* fn = (activation_type == ActivationType::Swiglu)
-  ? &doGatedActivationKernel<ActivationOutputType, GemmOutputType, GLUAdaptor<cutlass::epilogue::thread::SiLu>>
-  : activation_type == ActivationType::Geglu
-  ? &doGatedActivationKernel<ActivationOutputType, GemmOutputType, GLUAdaptor<cutlass::epilogue::thread::GELU>>
-  : activation_type == ActivationType::SwigluBias
-  ? &doGatedActivationKernel<ActivationOutputType, GemmOutputType, SwigluBiasAdaptor>
-  : nullptr;
-TLLM_CHECK_WITH_INFO(fn != nullptr, "Invalid activation type");
-fn<<<blocks, threads, 0, stream>>>(
-  output, gemm_result, expert_first_token_offset, inter_size, num_experts_per_node, activation_type);
-
+                 ? &doGatedActivationKernel<ActivationOutputType, GemmOutputType,
+                                            GLUAdaptor<cutlass::epilogue::thread::SiLu>>
+             : activation_type == ActivationType::Geglu
+                 ? &doGatedActivationKernel<ActivationOutputType, GemmOutputType,
+                                            GLUAdaptor<cutlass::epilogue::thread::GELU>>
+             : activation_type == ActivationType::SwigluBias
+                 ? &doGatedActivationKernel<ActivationOutputType, GemmOutputType, SwigluBiasAdaptor>
+                 : nullptr;
+  TLLM_CHECK_WITH_INFO(fn != nullptr, "Invalid activation type");
+  fn<<<blocks, threads, 0, stream>>>(output, gemm_result, expert_first_token_offset, inter_size,
+                                     num_experts_per_node, activation_type);
 }
 
 // ============================== Activation =================================
@@ -2119,9 +2121,10 @@ template <class T, class GemmOutputType, class ScaleBiasType, class ActFn,
 __global__ void doActivationKernel(T* output, GemmOutputType const* gemm_result,
                                    float const* fp8_quant, ScaleBiasType const* bias_ptr,
                                    bool bias_is_broadcast, int64_t const* expert_first_token_offset,
-                                   int num_experts_per_node, int64_t inter_size, float const* fc2_act_global_scale, bool use_per_expert_act_scale,
-                                   TmaWarpSpecializedGroupedGemmInput::ElementSF* fc2_act_sf_flat, ActivationParams activation_params)
-                                {
+                                   int num_experts_per_node, int64_t inter_size,
+                                   float const* fc2_act_global_scale, bool use_per_expert_act_scale,
+                                   TmaWarpSpecializedGroupedGemmInput::ElementSF* fc2_act_sf_flat,
+                                   ActivationParams activation_params) {
 #ifdef ENABLE_FP4
   constexpr bool IsNVFP4 =
       std::is_same_v<T, __nv_fp4_e2m1> &&
@@ -2139,7 +2142,6 @@ __global__ void doActivationKernel(T* output, GemmOutputType const* gemm_result,
   size_t gated_size_mul = IsGated ? 2 : 1;
   size_t gated_off = IsGated ? inter_size : 0;
 
-
   constexpr int64_t VecSize = IsNVFP4
                                   ? TmaWarpSpecializedGroupedGemmInput::NVFP4BlockScaleVectorSize
                                   : TmaWarpSpecializedGroupedGemmInput::MXFPXBlockScaleVectorSize;
@@ -2170,18 +2172,17 @@ __global__ void doActivationKernel(T* output, GemmOutputType const* gemm_result,
     float gate_alpha = 1.0f;
     float gate_beta = 0.0f;
     float gate_limit = std::numeric_limits<float>::infinity();
-    if (bias_ptr || IsNVFP4 || IsMXFP8 || use_per_expert_act_scale || activation_params.swiglu_alpha
-        || activation_params.swiglu_beta || activation_params.swiglu_limit)
-{
+    if (bias_ptr || IsNVFP4 || IsMXFP8 || use_per_expert_act_scale ||
+        activation_params.swiglu_alpha || activation_params.swiglu_beta ||
+        activation_params.swiglu_limit) {
       // TODO this is almost certainly faster as a linear scan
       expert =
           findTotalEltsLessThanTarget(expert_first_token_offset, num_experts_per_node, token + 1) -
           1;
-          gate_alpha = activation_params.swiglu_alpha ? activation_params.swiglu_alpha[expert] : 1.0f;
-          gate_beta = activation_params.swiglu_beta ? activation_params.swiglu_beta[expert] : 0.0f;
-          gate_limit = activation_params.swiglu_limit ? activation_params.swiglu_limit[expert]
-                                                      : std::numeric_limits<float>::infinity();
-    
+      gate_alpha = activation_params.swiglu_alpha ? activation_params.swiglu_alpha[expert] : 1.0f;
+      gate_beta = activation_params.swiglu_beta ? activation_params.swiglu_beta[expert] : 0.0f;
+      gate_limit = activation_params.swiglu_limit ? activation_params.swiglu_limit[expert]
+                                                  : std::numeric_limits<float>::infinity();
     }
 
     size_t act_scale_idx = use_per_expert_act_scale ? expert : 0;
@@ -2228,20 +2229,18 @@ __global__ void doActivationKernel(T* output, GemmOutputType const* gemm_result,
 
       auto gate_act = [&]() {
         if constexpr (IsGated) {
-          auto linear_value = arrayConvert<GemmResultElem, ComputeElem>(gemm_result_vec[elem_index]);
-          if (bias_ptr_vec)
-          {
-              linear_value = linear_value + arrayConvert<BiasElem, ComputeElem>(bias_ptr_vec[elem_index]);
+          auto linear_value =
+              arrayConvert<GemmResultElem, ComputeElem>(gemm_result_vec[elem_index]);
+          if (bias_ptr_vec) {
+            linear_value =
+                linear_value + arrayConvert<BiasElem, ComputeElem>(bias_ptr_vec[elem_index]);
           }
           return fn(fc1_value, linear_value);
 
+        } else {
+          return fn(fc1_value);
         }
-        else
-        {
-            return fn(fc1_value);
-        }
-    }();
-
+      }();
 
       auto post_act_val = gate_act * quant_scale;
 
@@ -2351,22 +2350,27 @@ void doActivation(T* output, GemmOutputType const* gemm_result, float const* fp8
   auto fn = [&]() {
     auto fn = [&](auto block_scaling_type) {
       auto fn_list = std::array{
-          &doActivationKernel<T, GemmOutputType, ScaleBiasType, IdentityAdaptor<cutlass::epilogue::thread::GELU>,
+          &doActivationKernel<T, GemmOutputType, ScaleBiasType,
+                              IdentityAdaptor<cutlass::epilogue::thread::GELU>,
                               decltype(block_scaling_type)::value>,  // Gelu
-          &doActivationKernel<T, GemmOutputType, ScaleBiasType, IdentityAdaptor<cutlass::epilogue::thread::ReLu>,
+          &doActivationKernel<T, GemmOutputType, ScaleBiasType,
+                              IdentityAdaptor<cutlass::epilogue::thread::ReLu>,
                               decltype(block_scaling_type)::value>,  // Relu
-          &doActivationKernel<T, GemmOutputType, ScaleBiasType, IdentityAdaptor<cutlass::epilogue::thread::SiLu>,
+          &doActivationKernel<T, GemmOutputType, ScaleBiasType,
+                              IdentityAdaptor<cutlass::epilogue::thread::SiLu>,
                               decltype(block_scaling_type)::value>,  // Silu
-          &doActivationKernel<T, GemmOutputType, ScaleBiasType, GLUAdaptor<cutlass::epilogue::thread::SiLu>,
+          &doActivationKernel<T, GemmOutputType, ScaleBiasType,
+                              GLUAdaptor<cutlass::epilogue::thread::SiLu>,
                               decltype(block_scaling_type)::value>,  // Swiglu
-          &doActivationKernel<T, GemmOutputType, ScaleBiasType, GLUAdaptor<cutlass::epilogue::thread::GELU>,
+          &doActivationKernel<T, GemmOutputType, ScaleBiasType,
+                              GLUAdaptor<cutlass::epilogue::thread::GELU>,
                               decltype(block_scaling_type)::value>,  // Geglu
-                              &doActivationKernel<T, GemmOutputType, ScaleBiasType, SwigluBiasAdaptor,
-                              decltype(block_scaling_type)::value>, // SwigluBias
-                          &doActivationKernel<T, GemmOutputType, ScaleBiasType,
+          &doActivationKernel<T, GemmOutputType, ScaleBiasType, SwigluBiasAdaptor,
+                              decltype(block_scaling_type)::value>,  // SwigluBias
+          &doActivationKernel<T, GemmOutputType, ScaleBiasType,
                               IdentityAdaptor<cutlass::epilogue::thread::Identity>,
-                              decltype(block_scaling_type)::value> // Identity
-          
+                              decltype(block_scaling_type)::value>  // Identity
+
       };
       return fn_list[static_cast<int>(activation_type.activation_type)];
     };
@@ -2404,9 +2408,9 @@ void doActivation(T* output, GemmOutputType const* gemm_result, float const* fp8
   config.numAttrs = 1;
   config.attrs = attrs;
   cudaLaunchKernelEx(&config, fn, output, gemm_result, fp8_quant, bias, bias_is_broadcast,
-                     expert_first_token_offset, num_experts_per_node, inter_size, quant_params.fp4.fc2.act_global_scale, use_per_expert_act_scale,
+                     expert_first_token_offset, num_experts_per_node, inter_size,
+                     quant_params.fp4.fc2.act_global_scale, use_per_expert_act_scale,
                      fc2_act_sf_flat, activation_type);
-             
 }
 
 // ============================== Lora Add Bias =================================
@@ -3037,8 +3041,8 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, InputType, BackBoneType, Enab
 
   bool const using_tma_ws_gemm1 = gemm_runner.isTmaWarpSpecialized(config);
   bool const is_gated_activation = isGatedActivation(fc1_activation_type);
-  bool const use_ampere_activation_fusion =
-      gemm_runner.isFusedGatedActivation(config, fc1_activation_type.activation_type, inter_size, hidden_size);
+  bool const use_ampere_activation_fusion = gemm_runner.isFusedGatedActivation(
+      config, fc1_activation_type.activation_type, inter_size, hidden_size);
   size_t const fc1_out_size =
       ((!use_ampere_activation_fusion) && is_gated_activation) ? inter_size * 2 : inter_size;
 
@@ -3592,7 +3596,8 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, InputType, BackBoneType, Enab
     bool use_deepseek_fp8_block_scale, bool min_latency_mode,
     MoeMinLatencyParams& min_latency_params, cudaStream_t stream) {
   static constexpr bool int_scales_required = std::is_same<WeightType, uint8_t>::value ||
-                                              std::is_same<WeightType, cutlass::uint4b_t>::value || use_wfp4a16;
+                                              std::is_same<WeightType, cutlass::uint4b_t>::value ||
+                                              use_wfp4a16;
   static constexpr bool fp8_scales_required = std::is_same<WeightType, __nv_fp8_e4m3>::value ||
                                               std::is_same<WeightType, __nv_fp8_e5m2>::value;
 
@@ -3703,7 +3708,8 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, InputType, BackBoneType, Enab
                          "Scales are ignored for fp32/fp16/bf16 but received quant scale for FC2");
   }
 
-  bool use_awq = quant_params.groupwise.fc1.act_scales && quant_params.groupwise.fc2.act_scales && !use_wfp4a16;
+  bool use_awq = quant_params.groupwise.fc1.act_scales && quant_params.groupwise.fc2.act_scales &&
+                 !use_wfp4a16;
   int const num_experts_per_node = full_num_experts / parallelism_config.ep_size;
 
   configureWsPtrs(workspace_ptr, num_rows, hidden_size, inter_size, num_experts_per_node,
@@ -3919,7 +3925,6 @@ CutlassMoeFCRunner<T, WeightType, OutputType, InputType, BackBoneType, Enable>::
   layout_info1.int4_groupwise_params.use_wfp4a16 = use_wfp4a16;
   layout_info2.int4_groupwise_params.use_wfp4a16 = use_wfp4a16;
 
-
   layout_info1.fpX_block_scaling_type = getScalingType();
   layout_info2.fpX_block_scaling_type = getScalingType();
 
@@ -3964,7 +3969,8 @@ CutlassMoeFCRunner<T, WeightType, OutputType, InputType, BackBoneType, Enable>::
         UnfusedGemmOutputType* output1, UnfusedGemmOutputType* output2,
         int const* num_active_experts_per, int const* active_expert_global_ids, int start_expert,
         cudaStream_t stream) {
-          TLLM_CHECK_WITH_INFO(!use_w4_groupwise, "W4AFP8 and WFP4A16 are not supported in low latency mode");
+  TLLM_CHECK_WITH_INFO(!use_w4_groupwise,
+                       "W4AFP8 and WFP4A16 are not supported in low latency mode");
 
   // Always nullptr
   layout_info1.ptr_c = nullptr;
@@ -3990,7 +3996,6 @@ CutlassMoeFCRunner<T, WeightType, OutputType, InputType, BackBoneType, Enable>::
   layout_info1.int4_groupwise_params.use_wfp4a16 = false;
   layout_info2.int4_groupwise_params.use_wfp4a16 = false;
 
-
   int const threads = std::min(1024, num_experts);
   int const blocks = (num_experts + threads - 1) / threads;
 
@@ -4020,7 +4025,7 @@ template <class T, class WeightType, class OutputType, class InputType, class Ba
 std::pair<TmaWarpSpecializedGroupedGemmInput, TmaWarpSpecializedGroupedGemmInput>
 CutlassMoeFCRunner<T, WeightType, OutputType, InputType, BackBoneType, Enable>::
     setupTmaWarpSpecializedInputs(int64_t num_rows, int64_t expanded_num_rows,
-      ActivationParams  fc1_activation_type, int64_t hidden_size,
+                                  ActivationParams fc1_activation_type, int64_t hidden_size,
                                   int64_t inter_size, int64_t num_experts_per_node,
                                   void const* input_activations_void,
                                   TmaWarpSpecializedGroupedGemmInput::ElementSF const* input_sf,
@@ -4038,7 +4043,8 @@ CutlassMoeFCRunner<T, WeightType, OutputType, InputType, BackBoneType, Enable>::
     return std::make_pair(gemm1_tma_ws_input, gemm2_tma_ws_input);
   }
 
-  bool use_awq = quant_params.groupwise.fc1.act_scales && quant_params.groupwise.fc2.act_scales && !use_wfp4a16;
+  bool use_awq = quant_params.groupwise.fc1.act_scales && quant_params.groupwise.fc2.act_scales &&
+                 !use_wfp4a16;
 
   bool is_gated_activation = isGatedActivation(fc1_activation_type);
   int64_t const fc1_out_size = is_gated_activation ? inter_size * 2 : inter_size;
@@ -4074,7 +4080,8 @@ CutlassMoeFCRunner<T, WeightType, OutputType, InputType, BackBoneType, Enable>::
 
     bool apply_bias = parallelism_config.tp_rank == 0;
     bool using_hopper_fused_finalize = !use_deterministic_hopper_reduce_ &&
-                                       gemm2_config_->sm_version == 90 && !use_w4_groupwise && !use_lora;
+                                       gemm2_config_->sm_version == 90 && !use_w4_groupwise &&
+                                       !use_lora;
     if (using_hopper_fused_finalize) {
       assert(min_latency_mode == false);
       gemm2_tma_ws_input.fusion = TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::FINALIZE;
@@ -4338,8 +4345,9 @@ std::map<std::string, std::pair<size_t, size_t>> GemmProfilerBackend::getProfile
   bool is_fp4_w_quant = mWType == nvinfer1::DataType::kFP4 || mWType == nvinfer1::DataType::kINT64;
   bool is_w4afp8_quant = is_int_groupwise_w_quant && is_fp8_act_quant;
   // bool is_wfp4afp8_quant = is_fp4_w_quant && is_fp8_act_quant;
-  bool is_wfp4a16_quant = (mDType == nvinfer1::DataType::kHALF || mDType == nvinfer1::DataType::kBF16) && mWType == nvinfer1::DataType::kUINT8;
-
+  bool is_wfp4a16_quant =
+      (mDType == nvinfer1::DataType::kHALF || mDType == nvinfer1::DataType::kBF16) &&
+      mWType == nvinfer1::DataType::kUINT8;
 
   // Int sizes
   size_t quant_1_size = is_int_w_quant ? fc1_out_size * num_experts_per_node * dtype_bytes : 0;
@@ -4347,7 +4355,7 @@ std::map<std::string, std::pair<size_t, size_t>> GemmProfilerBackend::getProfile
   if (is_int_w_quant) {
     quant_1_size = fc1_out_size * num_experts_per_node * dtype_bytes;
     quant_2_size = hidden_size * num_experts_per_node * dtype_bytes;
-  } else if (is_int_groupwise_w_quant  || is_wfp4a16_quant) {
+  } else if (is_int_groupwise_w_quant || is_wfp4a16_quant) {
     quant_1_size = fc1_out_size * num_experts_per_node * dtype_bytes * hidden_size / mGroupSize;
     quant_2_size = hidden_size * num_experts_per_node * dtype_bytes * inter_size / mGroupSize;
   }
@@ -4382,7 +4390,7 @@ std::map<std::string, std::pair<size_t, size_t>> GemmProfilerBackend::getProfile
         TmaWarpSpecializedGroupedGemmInput::workspaceSize(num_experts_per_node, mScalingType) *
         (NUM_ROUTING_SAMPLES + 1);
 
-    if (is_w4afp8_quant  || is_wfp4a16_quant) {
+    if (is_w4afp8_quant || is_wfp4a16_quant) {
       quant_3_size = 0;
       quant_4_size = 0;
     }
@@ -4400,7 +4408,8 @@ std::map<std::string, std::pair<size_t, size_t>> GemmProfilerBackend::getProfile
       sizeof(TmaWarpSpecializedGroupedGemmInput::ElementSF);
   size_t const fp4_act_scale_flat_size = std::max(fc1_fp4_act_scale_size, fc2_fp4_act_scale_size);
 
-  size_t w4a8_alpha_size = (is_w4afp8_quant || is_wfp4a16_quant) ? num_experts_per_node * sizeof(float) : 0;
+  size_t w4a8_alpha_size =
+      (is_w4afp8_quant || is_wfp4a16_quant) ? num_experts_per_node * sizeof(float) : 0;
   size_t alpha_scale_ptr_array_size = num_experts_per_node * sizeof(float**);
   size_t gemm_workspace_size = mInterface->getGemmWorkspaceSize(num_experts_per_node);
 
@@ -4429,7 +4438,8 @@ std::map<std::string, std::pair<size_t, size_t>> GemmProfilerBackend::getProfile
   size_t active_expert_global_ids_size =
       mMinLatencyMode ? mNumExpertsPerNode * sizeof(int) * NUM_ROUTING_SAMPLES : 0;
 
-  bool is_swiglu_bias = mActivationType == ActivationType::SwigluBias && mGemmToProfile == GemmToProfile::GEMM_1;
+  bool is_swiglu_bias =
+      mActivationType == ActivationType::SwigluBias && mGemmToProfile == GemmToProfile::GEMM_1;
   size_t swiglu_alpha_size = is_swiglu_bias ? num_experts_per_node * sizeof(float) : 0;
   size_t swiglu_beta_size = is_swiglu_bias ? num_experts_per_node * sizeof(float) : 0;
   size_t swiglu_limit_size = is_swiglu_bias ? num_experts_per_node * sizeof(float) : 0;
@@ -4562,18 +4572,16 @@ void GemmProfilerBackend::prepareQuantParams(int num_tokens, char* workspace_ptr
   GET_WS_PTR(float const*, w4a8_alpha);
 #undef GET_WS_PTR
 
-if ((mWType == nvinfer1::DataType::kINT8 || mWType == nvinfer1::DataType::kINT4
-  || mWType == nvinfer1::DataType::kUINT8)
-&& mGroupSize < 0)
-{
+  if ((mWType == nvinfer1::DataType::kINT8 || mWType == nvinfer1::DataType::kINT4 ||
+       mWType == nvinfer1::DataType::kUINT8) &&
+      mGroupSize < 0) {
     TLLM_CHECK(quant_1 && quant_2);
     mQuantParams = QuantParams::Int(quant_1, quant_2);
   } else if (mWType == nvinfer1::DataType::kINT4 || mWType == nvinfer1::DataType::kUINT8) {
     TLLM_CHECK(quant_1 && quant_2);
-    if (mDType == nvinfer1::DataType::kFP8
-      || (mWType == nvinfer1::DataType::kUINT8
-          && (mDType == nvinfer1::DataType::kHALF || mDType == nvinfer1::DataType::kBF16)))
-{
+    if (mDType == nvinfer1::DataType::kFP8 ||
+        (mWType == nvinfer1::DataType::kUINT8 &&
+         (mDType == nvinfer1::DataType::kHALF || mDType == nvinfer1::DataType::kBF16))) {
       TLLM_CHECK(w4a8_alpha);
       mQuantParams = QuantParams::GroupWise(mGroupSize, quant_1, quant_2, nullptr, nullptr, quant_3,
                                             quant_4, w4a8_alpha, w4a8_alpha);
@@ -4672,9 +4680,10 @@ void GemmProfilerBackend::prepareTmaWsInputs(int num_tokens, char* workspace_ptr
 
       bool apply_bias = true;
       bool use_w4afp8 = (mDType == nvinfer1::DataType::kFP8 && mWType == nvinfer1::DataType::kINT4);
-      bool use_wfp4a16 = ((mDType == nvinfer1::DataType::kHALF || mDType == nvinfer1::DataType::kBF16)
-      && mWType == nvinfer1::DataType::kUINT8);
-  bool use_w4_groupwise = use_w4afp8 || use_wfp4a16;
+      bool use_wfp4a16 =
+          ((mDType == nvinfer1::DataType::kHALF || mDType == nvinfer1::DataType::kBF16) &&
+           mWType == nvinfer1::DataType::kUINT8);
+      bool use_w4_groupwise = use_w4afp8 || use_wfp4a16;
 
       bool using_fused_finalize = !mInterface->use_deterministic_hopper_reduce_ && mSM == 90 &&
                                   !mMinLatencyMode && !use_w4_groupwise;
@@ -4785,7 +4794,6 @@ void GemmProfilerBackend::runProfiler(int original_num_tokens, Config const& tac
   GET_WS_PTR(float*, swiglu_beta);
   GET_WS_PTR(float*, swiglu_limit);
 
-
 #undef GET_WS_PTR_OFFSET
 #undef GET_WS_PTR
 
@@ -4796,36 +4804,36 @@ void GemmProfilerBackend::runProfiler(int original_num_tokens, Config const& tac
 
   mInterface->is_profiler = true;
   if (mGemmToProfile == GemmToProfile::GEMM_1) {
-    mInterface->gemm1(input,                                             //
-                      output,                                            //
-                      intermediate,                                      //
-                      expert_first_token_offset,                         //
-                      tma_ws_input_template,                             //
-                      weights_sel,                                       //
-                      bias,                                              //
-                      expert_first_token_offset + num_experts_per_node,  //
-                      mQuantParams.wo.fc1_weight_scales,                 //
-                      mQuantParams.fp8.dequant_fc1,                      //
-                      mQuantParams.fp8_mxfp4.fc2.act_global_scale
-                          ? mQuantParams.fp8_mxfp4.fc2.act_global_scale
-                          : mQuantParams.fp8.quant_fc2,        //
-                      fp4_act_scale_flat,                      //
-                      fp4_act_scale_flat,                      //
-                      mQuantParams,                            //
-                      original_num_tokens,                     //
-                      expanded_num_tokens,                     //
-                      mExpertHiddenSize,                       //
-                      mExpertInterSize,                        //
-                      num_experts_per_node,                    //
-                      ActivationParams(mActivationType, swiglu_alpha, swiglu_beta, swiglu_limit),                         //
-                      alpha_scale_ptr_array,                   //
-                      !mUseLora,                               //
-                      /*use_deepseek_fp8_block_scale=*/false,  //
-                      stream,                                  //
-                      tactic,                                  //
-                      mMinLatencyMode,                         //
-                      num_active_experts_per_node,             //
-                      active_expert_global_ids);               //
+    mInterface->gemm1(
+        input,                                             //
+        output,                                            //
+        intermediate,                                      //
+        expert_first_token_offset,                         //
+        tma_ws_input_template,                             //
+        weights_sel,                                       //
+        bias,                                              //
+        expert_first_token_offset + num_experts_per_node,  //
+        mQuantParams.wo.fc1_weight_scales,                 //
+        mQuantParams.fp8.dequant_fc1,                      //
+        mQuantParams.fp8_mxfp4.fc2.act_global_scale ? mQuantParams.fp8_mxfp4.fc2.act_global_scale
+                                                    : mQuantParams.fp8.quant_fc2,    //
+        fp4_act_scale_flat,                                                          //
+        fp4_act_scale_flat,                                                          //
+        mQuantParams,                                                                //
+        original_num_tokens,                                                         //
+        expanded_num_tokens,                                                         //
+        mExpertHiddenSize,                                                           //
+        mExpertInterSize,                                                            //
+        num_experts_per_node,                                                        //
+        ActivationParams(mActivationType, swiglu_alpha, swiglu_beta, swiglu_limit),  //
+        alpha_scale_ptr_array,                                                       //
+        !mUseLora,                                                                   //
+        /*use_deepseek_fp8_block_scale=*/false,                                      //
+        stream,                                                                      //
+        tactic,                                                                      //
+        mMinLatencyMode,                                                             //
+        num_active_experts_per_node,                                                 //
+        active_expert_global_ids);                                                   //
   } else {
     TLLM_CHECK(mGemmToProfile == GemmToProfile::GEMM_2);
     mInterface->gemm2(input,                                           //
@@ -4869,4 +4877,3 @@ void GemmProfilerBackend::runProfiler(int original_num_tokens, Config const& tac
 }
 
 }  // namespace tensorrt_llm::kernels::cutlass_kernels
-
diff --git a/csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_ops.cu b/csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_ops.cu
index 3ed70d356..8b85c08ec 100644
--- a/csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_ops.cu
+++ b/csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_ops.cu
@@ -47,7 +47,8 @@ namespace kernels = CUTLASS_MOE_GEMM_KERNELS_NAMESPACE;
 using ActivationParams = CUTLASS_MOE_GEMM_NAMESPACE::ActivationParams;
 using ActivationType = CUTLASS_MOE_GEMM_NAMESPACE::ActivationType;
 // Always use public header as it is just utility functions and types
-using TmaWarpSpecializedGroupedGemmInput = tensorrt_llm::kernels::cutlass_kernels::TmaWarpSpecializedGroupedGemmInput;
+using TmaWarpSpecializedGroupedGemmInput =
+    tensorrt_llm::kernels::cutlass_kernels::TmaWarpSpecializedGroupedGemmInput;
 using profiler_backend = CUTLASS_MOE_GEMM_KERNELS_NAMESPACE::GemmProfilerBackend;
 
 class FusedMoeRunner : public torch::CustomClassHolder {
@@ -60,16 +61,15 @@ class FusedMoeRunner : public torch::CustomClassHolder {
       case c10::ScalarType::Float8_e4m3fn:
         // TODO We need an atomic FP8 reduction for the finalize fusions
         C10_THROW_ERROR_FORMATTED(NotImplementedError,
-            "Outputting " << torch::toString(output_type) << " directly is not currently supported");
+                                  "Outputting " << torch::toString(output_type)
+                                                << " directly is not currently supported");
         // return std::make_unique<kernels::CutlassMoeFCRunner<Type, Type>>();
       case c10::ScalarType::Half:
-        if constexpr (NeedQuant)
-        {
-            return std::make_unique<kernels::CutlassMoeFCRunner<TypeAct, TypeWeight, half, half>>();
-        }
-        else
-        {
-            return std::make_unique<kernels::CutlassMoeFCRunner<TypeAct, TypeWeight, half, TypeAct>>();
+        if constexpr (NeedQuant) {
+          return std::make_unique<kernels::CutlassMoeFCRunner<TypeAct, TypeWeight, half, half>>();
+        } else {
+          return std::make_unique<
+              kernels::CutlassMoeFCRunner<TypeAct, TypeWeight, half, TypeAct>>();
         }
 #ifdef ENABLE_BF16
       case c10::ScalarType::BFloat16:
@@ -141,22 +141,19 @@ class FusedMoeRunner : public torch::CustomClassHolder {
       }
     }
 
-    if (isWFP4A16Quant())
-    {
-        mInnerDimMultiplier = 2;
-        if (mActivationDtype == c10::ScalarType::Half)
-        {
-            mKernelRunner = std::make_shared<kernels::CutlassMoeFCRunner<half, __nv_fp4_e2m1>>();
-        }
+    if (isWFP4A16Quant()) {
+      mInnerDimMultiplier = 2;
+      if (mActivationDtype == c10::ScalarType::Half) {
+        mKernelRunner = std::make_shared<kernels::CutlassMoeFCRunner<half, __nv_fp4_e2m1>>();
+      }
 #ifdef ENABLE_BF16
-        else if (mActivationDtype == c10::ScalarType::BFloat16)
-        {
-            mKernelRunner = std::make_shared<kernels::CutlassMoeFCRunner<__nv_bfloat16, __nv_fp4_e2m1>>();
-        }
+      else if (mActivationDtype == c10::ScalarType::BFloat16) {
+        mKernelRunner =
+            std::make_shared<kernels::CutlassMoeFCRunner<__nv_bfloat16, __nv_fp4_e2m1>>();
+      }
 #endif
     }
 
-
 #endif
     if (isInt4Quant()) {
       mInnerDimMultiplier = 2;
@@ -213,18 +210,22 @@ class FusedMoeRunner : public torch::CustomClassHolder {
   FusedMoeRunner(FusedMoeRunner const&) = delete;
   void operator=(FusedMoeRunner const&) = delete;
 
-  at::Tensor runMoe(
-    at::Tensor& output, at::Tensor const& input, at::Tensor const& token_selected_experts,
-      torch::optional<torch::Tensor> const& token_final_scales, torch::Tensor const& fc1_expert_weights,
-      torch::optional<torch::Tensor> const& fc1_expert_biases, torch::Tensor const& fc2_expert_weights,
-      torch::optional<torch::Tensor> const& fc2_expert_biases,
-      torch::optional<c10::ArrayRef<torch::Tensor>> const& quant_scales,
-      torch::optional<torch::Tensor> const& input_sf, torch::optional<torch::Tensor> const& swiglu_alpha,
-      torch::optional<torch::Tensor> const& swiglu_beta, torch::optional<torch::Tensor> const& swiglu_limit,
-      int64_t const tp_size, int64_t const tp_rank, int64_t const ep_size, int64_t const ep_rank,
-      int64_t const cluster_size, int64_t const cluster_rank, bool const enable_alltoall, bool min_latency_mode,
-      torch::optional<c10::ArrayRef<int64_t>> const& profile_ids)
-{
+  at::Tensor runMoe(at::Tensor& output, at::Tensor const& input,
+                    at::Tensor const& token_selected_experts,
+                    torch::optional<torch::Tensor> const& token_final_scales,
+                    torch::Tensor const& fc1_expert_weights,
+                    torch::optional<torch::Tensor> const& fc1_expert_biases,
+                    torch::Tensor const& fc2_expert_weights,
+                    torch::optional<torch::Tensor> const& fc2_expert_biases,
+                    torch::optional<c10::ArrayRef<torch::Tensor>> const& quant_scales,
+                    torch::optional<torch::Tensor> const& input_sf,
+                    torch::optional<torch::Tensor> const& swiglu_alpha,
+                    torch::optional<torch::Tensor> const& swiglu_beta,
+                    torch::optional<torch::Tensor> const& swiglu_limit, int64_t const tp_size,
+                    int64_t const tp_rank, int64_t const ep_size, int64_t const ep_rank,
+                    int64_t const cluster_size, int64_t const cluster_rank,
+                    bool const enable_alltoall, bool min_latency_mode,
+                    torch::optional<c10::ArrayRef<int64_t>> const& profile_ids) {
     std::lock_guard<std::mutex> lock(mMutex);
     // Free the profile workspace to save memory
     freeProfileWorkspace();
@@ -301,32 +302,32 @@ class FusedMoeRunner : public torch::CustomClassHolder {
     auto const num_experts_total = static_cast<int>(num_experts_on_rank * ep_size);
     auto parallelism_config = kernels::MOEParallelismConfig(tp_size, tp_rank, ep_size, ep_rank);
     ActivationType base_activation_type = ActivationType::Swiglu;
-    if (swiglu_alpha.has_value())
-    {
-        CHECK_INPUT_AND_TYPE(swiglu_alpha.value(), at::ScalarType::Float);
-        TORCH_CHECK(swiglu_alpha.value().sizes()[0] == num_experts_on_rank,
-            "swiglu_alpha must have num_experts_on_rank elements.");
-        base_activation_type = ActivationType::SwigluBias;
+    if (swiglu_alpha.has_value()) {
+      CHECK_INPUT_AND_TYPE(swiglu_alpha.value(), at::ScalarType::Float);
+      TORCH_CHECK(swiglu_alpha.value().sizes()[0] == num_experts_on_rank,
+                  "swiglu_alpha must have num_experts_on_rank elements.");
+      base_activation_type = ActivationType::SwigluBias;
     }
-    if (swiglu_beta.has_value())
-    {
-        CHECK_INPUT_AND_TYPE(swiglu_beta.value(), at::ScalarType::Float);
-        TORCH_CHECK(swiglu_beta.value().sizes()[0] == num_experts_on_rank,
-            "swiglu_beta must have num_experts_on_rank elements.");
-        base_activation_type = ActivationType::SwigluBias;
+    if (swiglu_beta.has_value()) {
+      CHECK_INPUT_AND_TYPE(swiglu_beta.value(), at::ScalarType::Float);
+      TORCH_CHECK(swiglu_beta.value().sizes()[0] == num_experts_on_rank,
+                  "swiglu_beta must have num_experts_on_rank elements.");
+      base_activation_type = ActivationType::SwigluBias;
     }
-    if (swiglu_limit.has_value())
-    {
-        CHECK_INPUT_AND_TYPE(swiglu_limit.value(), at::ScalarType::Float);
-        TORCH_CHECK(swiglu_limit.value().sizes()[0] == num_experts_on_rank,
-            "swiglu_limit must have num_experts_on_rank elements.");
-        base_activation_type = ActivationType::SwigluBias;
+    if (swiglu_limit.has_value()) {
+      CHECK_INPUT_AND_TYPE(swiglu_limit.value(), at::ScalarType::Float);
+      TORCH_CHECK(swiglu_limit.value().sizes()[0] == num_experts_on_rank,
+                  "swiglu_limit must have num_experts_on_rank elements.");
+      base_activation_type = ActivationType::SwigluBias;
     }
-    auto activation_params = ActivationParams(base_activation_type,
-        reinterpret_cast<float const*>(swiglu_alpha.has_value() ? swiglu_alpha.value().const_data_ptr() : nullptr),
-        reinterpret_cast<float const*>(swiglu_beta.has_value() ? swiglu_beta.value().const_data_ptr() : nullptr),
-        reinterpret_cast<float const*>(swiglu_limit.has_value() ? swiglu_limit.value().const_data_ptr() : nullptr));
-
+    auto activation_params = ActivationParams(
+        base_activation_type,
+        reinterpret_cast<float const*>(
+            swiglu_alpha.has_value() ? swiglu_alpha.value().const_data_ptr() : nullptr),
+        reinterpret_cast<float const*>(
+            swiglu_beta.has_value() ? swiglu_beta.value().const_data_ptr() : nullptr),
+        reinterpret_cast<float const*>(
+            swiglu_limit.has_value() ? swiglu_limit.value().const_data_ptr() : nullptr));
 
     setRunnerProfiles(profile_ids);
 
@@ -374,10 +375,10 @@ class FusedMoeRunner : public torch::CustomClassHolder {
         activation_params, fc2_expert_weights.const_data_ptr(),
         fc2_expert_biases.has_value() ? fc2_expert_biases.value().const_data_ptr() : nullptr,
         quant_params, num_rows, hidden_size, inter_size, num_experts_total,
-        static_cast<int>(experts_per_token), static_cast<char*>(workspace_info.workspace.data_ptr()),
-        output.data_ptr(), static_cast<int*>(workspace_info.src_to_dest_map), parallelism_config,
-        false, lora_params, mUseDeepSeekFP8BlockScaling, min_latency_mode, min_latency_params,
-        stream);
+        static_cast<int>(experts_per_token),
+        static_cast<char*>(workspace_info.workspace.data_ptr()), output.data_ptr(),
+        static_cast<int*>(workspace_info.src_to_dest_map), parallelism_config, false, lora_params,
+        mUseDeepSeekFP8BlockScaling, min_latency_mode, min_latency_params, stream);
 #endif
 
     return output;
@@ -389,12 +390,13 @@ class FusedMoeRunner : public torch::CustomClassHolder {
       torch::optional<at::Tensor> const& fc1_expert_biases, at::Tensor const& fc2_expert_weights,
       torch::optional<at::Tensor> const& fc2_expert_biases,
       torch::optional<c10::ArrayRef<at::Tensor>> const& quant_scales,
-      torch::optional<torch::Tensor> const& input_sf, torch::optional<torch::Tensor> const& swiglu_alpha,
-      torch::optional<torch::Tensor> const& swiglu_beta, torch::optional<torch::Tensor> const& swiglu_limit,
-      int64_t const tp_size, int64_t const tp_rank, int64_t const ep_size, int64_t const ep_rank,
-      int64_t const cluster_size, int64_t const cluster_rank, bool const enable_alltoall, bool min_latency_mode,
-      torch::optional<c10::ArrayRef<int64_t>> const& profile_ids)
-{
+      torch::optional<torch::Tensor> const& input_sf,
+      torch::optional<torch::Tensor> const& swiglu_alpha,
+      torch::optional<torch::Tensor> const& swiglu_beta,
+      torch::optional<torch::Tensor> const& swiglu_limit, int64_t const tp_size,
+      int64_t const tp_rank, int64_t const ep_size, int64_t const ep_rank,
+      int64_t const cluster_size, int64_t const cluster_rank, bool const enable_alltoall,
+      bool min_latency_mode, torch::optional<c10::ArrayRef<int64_t>> const& profile_ids) {
     std::lock_guard<std::mutex> lock(mMutex);
 
     // Free the profile workspace to save memory
@@ -461,25 +463,29 @@ class FusedMoeRunner : public torch::CustomClassHolder {
     if (swiglu_alpha.has_value()) {
       CHECK_INPUT_AND_TYPE(swiglu_alpha.value(), at::ScalarType::Float);
       TORCH_CHECK(swiglu_alpha.value().sizes()[0] == num_experts_on_rank,
-          "swiglu_alpha must have num_experts_on_rank elements.");
+                  "swiglu_alpha must have num_experts_on_rank elements.");
       base_activation_type = ActivationType::SwigluBias;
     }
     if (swiglu_beta.has_value()) {
       CHECK_INPUT_AND_TYPE(swiglu_beta.value(), at::ScalarType::Float);
       TORCH_CHECK(swiglu_beta.value().sizes()[0] == num_experts_on_rank,
-          "swiglu_beta must have num_experts_on_rank elements.");
+                  "swiglu_beta must have num_experts_on_rank elements.");
       base_activation_type = ActivationType::SwigluBias;
     }
     if (swiglu_limit.has_value()) {
       CHECK_INPUT_AND_TYPE(swiglu_limit.value(), at::ScalarType::Float);
       TORCH_CHECK(swiglu_limit.value().sizes()[0] == num_experts_on_rank,
-          "swiglu_limit must have num_experts_on_rank elements.");
+                  "swiglu_limit must have num_experts_on_rank elements.");
       base_activation_type = ActivationType::SwigluBias;
     }
-    auto activation_params = ActivationParams(base_activation_type,
-        reinterpret_cast<float const*>(swiglu_alpha.has_value() ? swiglu_alpha.value().const_data_ptr() : nullptr),
-        reinterpret_cast<float const*>(swiglu_beta.has_value() ? swiglu_beta.value().const_data_ptr() : nullptr),
-        reinterpret_cast<float const*>(swiglu_limit.has_value() ? swiglu_limit.value().const_data_ptr() : nullptr));
+    auto activation_params = ActivationParams(
+        base_activation_type,
+        reinterpret_cast<float const*>(
+            swiglu_alpha.has_value() ? swiglu_alpha.value().const_data_ptr() : nullptr),
+        reinterpret_cast<float const*>(
+            swiglu_beta.has_value() ? swiglu_beta.value().const_data_ptr() : nullptr),
+        reinterpret_cast<float const*>(
+            swiglu_limit.has_value() ? swiglu_limit.value().const_data_ptr() : nullptr));
 
     setRunnerProfiles(profile_ids);
 
@@ -541,10 +547,10 @@ class FusedMoeRunner : public torch::CustomClassHolder {
         activation_params, fc2_expert_weights.const_data_ptr(),
         fc2_expert_biases.has_value() ? fc2_expert_biases.value().const_data_ptr() : nullptr,
         quant_params, num_rows, hidden_size, inter_size, num_experts_total,
-        static_cast<int>(experts_per_token), static_cast<char*>(workspace_info.workspace.data_ptr()),
-        output.data_ptr(), static_cast<int*>(workspace_info.src_to_dest_map), parallelism_config,
-        false, lora_params, mUseDeepSeekFP8BlockScaling, min_latency_mode, min_latency_params,
-        stream);
+        static_cast<int>(experts_per_token),
+        static_cast<char*>(workspace_info.workspace.data_ptr()), output.data_ptr(),
+        static_cast<int*>(workspace_info.src_to_dest_map), parallelism_config, false, lora_params,
+        mUseDeepSeekFP8BlockScaling, min_latency_mode, min_latency_params, stream);
 #endif
 
     return std::make_tuple(output, num_active_experts_per_node, experts_to_token_score,
@@ -574,8 +580,13 @@ class FusedMoeRunner : public torch::CustomClassHolder {
     int64_t const num_rows = input.sizes()[0];
     int64_t const hidden_size = fc2_expert_weights.sizes()[1];
     int64_t const inter_size = fc2_expert_weights.sizes()[2] * mInnerDimMultiplier;
-    int64_t const group_size_ = isInt4Quant() ? TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::int4_group_size : -1;
-    int64_t const group_size = isWFP4A16Quant() ? TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::wfp4a16_group_size : group_size_;
+    int64_t const group_size_ =
+        isInt4Quant() ? TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::int4_group_size
+                      : -1;
+    int64_t const group_size =
+        isWFP4A16Quant()
+            ? TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::wfp4a16_group_size
+            : group_size_;
     int const num_experts = static_cast<int>(fc2_expert_weights.sizes()[0] * ep_size);
 
     // Get specific profile configs according to the profile_id.
@@ -602,8 +613,9 @@ class FusedMoeRunner : public torch::CustomClassHolder {
 
       bool const USE_BIAS = fc1_expert_biases.has_value() || fc2_expert_biases.has_value();
       bool const USE_LORA = false;
-      auto activation_dtype =
-          (mUseW4GroupScaling && !isWFP4A16Quant()) ? at::ScalarType::Float8_e4m3fn : mActivationDtype;
+      auto activation_dtype = (mUseW4GroupScaling && !isWFP4A16Quant())
+                                  ? at::ScalarType::Float8_e4m3fn
+                                  : mActivationDtype;
       activation_dtype = isNvfp4Quant() ? at::ScalarType::Long : activation_dtype;
 #ifdef USING_OSS_CUTLASS_MOE_GEMM
       mProfiler->init(*mKernelRunner.get(), mProfiler->mGemmToProfile,
@@ -960,14 +972,16 @@ class FusedMoeRunner : public torch::CustomClassHolder {
           static_cast<float const*>(fc2_scales.data_ptr()));
     } else if (isWFP4A16Quant()) {
       TORCH_CHECK(quant_scales.has_value(), "Expecting quant scales for W4 quantization");
-      TORCH_CHECK(quant_scales.value().size() == 2, "Expecting 2 quant scales for W4A16 quantization");
+      TORCH_CHECK(quant_scales.value().size() == 2,
+                  "Expecting 2 quant scales for W4A16 quantization");
 
       auto& fc1_weight_scales = quant_scales.value()[0];
       auto& fc2_weight_scales = quant_scales.value()[1];
       int group_size = TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::wfp4a16_group_size;
-      return kernels::QuantParams::GroupWise(group_size, static_cast<void const*>(fc1_weight_scales.data_ptr()),
-          static_cast<void const*>(fc2_weight_scales.data_ptr()), nullptr, nullptr, nullptr, nullptr, nullptr,
-          nullptr);
+      return kernels::QuantParams::GroupWise(group_size,
+                                             static_cast<void const*>(fc1_weight_scales.data_ptr()),
+                                             static_cast<void const*>(fc2_weight_scales.data_ptr()),
+                                             nullptr, nullptr, nullptr, nullptr, nullptr, nullptr);
     } else if (isInt4Quant()) {
       TORCH_CHECK(quant_scales.has_value(), "Expecting quant scales for INT4 quantization");
       TORCH_CHECK(quant_scales.value().size() == 8,
diff --git a/csrc/nv_internal/cpp/kernels/quantization.cu b/csrc/nv_internal/cpp/kernels/quantization.cu
index a335a3f23..a92a14382 100644
--- a/csrc/nv_internal/cpp/kernels/quantization.cu
+++ b/csrc/nv_internal/cpp/kernels/quantization.cu
@@ -74,36 +74,37 @@ template void invokeQuantization<__nv_bfloat16>(int8_t* dst, __nv_bfloat16 const
 // MXFP8 Quantization
 
 template <typename T>
-void invokeMxFP8Quantization(int b, int m, int n, int padded_n, T const* input, int64_t* output, int32_t* SFOuput,
-    QuantizationSFLayout layout, int multiProcessorCount, cudaStream_t stream)
-{
-    // Fixed SF_VEC_SIZE as 32
-    static constexpr int SF_VEC_SIZE = 32;
-
-    // Grid, Block size.
-    // Each thread converts 8 values.
-    dim3 block(std::min(int(padded_n / CVT_ELTS_PER_THREAD), 512));
-    // Get number of blocks per SM (assume we can fully utilize the SM).
-    int const numBlocksPerSM = std::max(1u, 2048u / block.x);
-    dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
-
-    // Launch the cvt kernel.
-    cudaLaunchConfig_t config;
-    config.gridDim = grid;
-    config.blockDim = block;
-    config.dynamicSmemBytes = 0;
-    config.stream = stream;
-    cudaLaunchAttribute attrs[1];
-    attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
-    attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL();
-    config.numAttrs = 1;
-    config.attrs = attrs;
-    cudaLaunchKernelEx(&config,
-        quantize_with_block_size<BlockScaleQuantizationType::FP16_TO_MXFP8, T, SF_VEC_SIZE, true>, b, m, n, padded_n,
-        input, nullptr, reinterpret_cast<uint32_t*>(output), reinterpret_cast<uint32_t*>(SFOuput), layout);
+void invokeMxFP8Quantization(int b, int m, int n, int padded_n, T const* input, int64_t* output,
+                             int32_t* SFOuput, QuantizationSFLayout layout, int multiProcessorCount,
+                             cudaStream_t stream) {
+  // Launches quantize_with_block_size in FP16_TO_MXFP8 mode; SF_VEC_SIZE is fixed at 32.
+  static constexpr int SF_VEC_SIZE = 32;
+
+  // Block size: padded_n / CVT_ELTS_PER_THREAD threads, capped at 512.
+  // Each thread converts 8 values (presumably CVT_ELTS_PER_THREAD == 8 -- TODO confirm).
+  dim3 block(std::min(int(padded_n / CVT_ELTS_PER_THREAD), 512));
+  // Blocks per SM (assume we can fully utilize the SM): 2048 / block.x, but at least 1.
+  int const numBlocksPerSM = std::max(1u, 2048u / block.x);  // NOTE(review): /0 if block.x == 0
+  dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));  // never more than m blocks
+
+  // Launch the cvt kernel through cudaLaunchKernelEx with a single launch attribute.
+  cudaLaunchConfig_t config;
+  config.gridDim = grid;
+  config.blockDim = block;
+  config.dynamicSmemBytes = 0;
+  config.stream = stream;
+  cudaLaunchAttribute attrs[1];
+  attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;  // value: getEnvEnablePDL()
+  attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL();
+  config.numAttrs = 1;
+  config.attrs = attrs;
+  cudaLaunchKernelEx(  // NOTE(review): the returned cudaError_t is not checked.
+      &config,
+      quantize_with_block_size<BlockScaleQuantizationType::FP16_TO_MXFP8, T, SF_VEC_SIZE, true>, b,
+      m, n, padded_n, input, nullptr, reinterpret_cast<uint32_t*>(output),
+      reinterpret_cast<uint32_t*>(SFOuput), layout);  // nullptr: invokeFP4Quantization's SFScale slot
+}
 
-
 // Do per-token (row) quantization from fp16/bf16/fp32 to int8/fp8_e4m3.
 template <typename T, typename QuantT>
 void invokePerTokenQuantization(QuantT* dst, T const* src, int64_t const numRows,
@@ -165,9 +166,10 @@ INSTANTIATE_INVOKE_PER_TOKEN_QUANTIZATION(__nv_bfloat16, __nv_fp8_e4m3);
 // FP4 Quantization
 
 template <typename T, int SF_VEC_SIZE>
-void invokeFP4Quantization(int b, int m, int n, T const* input, float const* SFScale, int64_t* output,
-                           int32_t* SFOuput, bool useUE8M0, QuantizationSFLayout  layout,
-                           int multiProcessorCount, cudaStream_t stream) {
+void invokeFP4Quantization(int b, int m, int n, T const* input, float const* SFScale,
+                           int64_t* output, int32_t* SFOuput, bool useUE8M0,
+                           QuantizationSFLayout layout, int multiProcessorCount,
+                           cudaStream_t stream) {
 #ifdef ENABLE_FP8
   if constexpr (std::is_same_v<T, __nv_fp8_e4m3>) {
     // Grid, Block size.
@@ -179,10 +181,13 @@ void invokeFP4Quantization(int b, int m, int n, T const* input, float const* SFS
 
     // Launch the cvt kernel.
     auto* kernel_instance = useUE8M0
-    ? &quantize_with_block_size<BlockScaleQuantizationType::FP8_TO_FP4, T, SF_VEC_SIZE, true>
-    : &quantize_with_block_size<BlockScaleQuantizationType::FP8_TO_FP4, T, SF_VEC_SIZE, false>;
-kernel_instance<<<grid, block, 0, stream>>>(b, m, n, n, input, SFScale, reinterpret_cast<uint32_t*>(output),
-    reinterpret_cast<uint32_t*>(SFOuput), layout);
+                                ? &quantize_with_block_size<BlockScaleQuantizationType::FP8_TO_FP4,
+                                                            T, SF_VEC_SIZE, true>
+                                : &quantize_with_block_size<BlockScaleQuantizationType::FP8_TO_FP4,
+                                                            T, SF_VEC_SIZE, false>;
+    kernel_instance<<<grid, block, 0, stream>>>(b, m, n, n, input, SFScale,
+                                                reinterpret_cast<uint32_t*>(output),
+                                                reinterpret_cast<uint32_t*>(SFOuput), layout);
 
   } else
 #endif
@@ -196,8 +201,10 @@ kernel_instance<<<grid, block, 0, stream>>>(b, m, n, n, input, SFScale, reinterp
 
     // Launch the cvt kernel.
     auto* kernel_instance = useUE8M0
-    ? &quantize_with_block_size<BlockScaleQuantizationType::FP16_TO_FP4, T, SF_VEC_SIZE, true>
-    : &quantize_with_block_size<BlockScaleQuantizationType::FP16_TO_FP4, T, SF_VEC_SIZE, false>;
+                                ? &quantize_with_block_size<BlockScaleQuantizationType::FP16_TO_FP4,
+                                                            T, SF_VEC_SIZE, true>
+                                : &quantize_with_block_size<BlockScaleQuantizationType::FP16_TO_FP4,
+                                                            T, SF_VEC_SIZE, false>;
 
     cudaLaunchConfig_t config;
     config.gridDim = grid;
@@ -209,14 +216,15 @@ kernel_instance<<<grid, block, 0, stream>>>(b, m, n, n, input, SFScale, reinterp
     attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL();
     config.numAttrs = 1;
     config.attrs = attrs;
-    cudaLaunchKernelEx(&config, kernel_instance, b, m, n, n, input, SFScale, reinterpret_cast<uint32_t*>(output), reinterpret_cast<uint32_t*>(SFOuput),
+    cudaLaunchKernelEx(&config, kernel_instance, b, m, n, n, input, SFScale,
+                       reinterpret_cast<uint32_t*>(output), reinterpret_cast<uint32_t*>(SFOuput),
                        layout);
   }
 }
 
 __global__ void block_scale_interleave_kernel(int numBatches, int numRows, int numRowsPadded,
-                                                    int numCols, int numColsPadded,
-                                                    uint8_t const* SFIn, uint8_t* SFOutput) {
+                                              int numCols, int numColsPadded, uint8_t const* SFIn,
+                                              uint8_t* SFOutput) {
   for (int rowIdx = blockIdx.x; rowIdx < numRowsPadded; rowIdx += gridDim.x) {
     for (int batchIdx = 0; batchIdx < numBatches; batchIdx++) {
       for (int colIdx = threadIdx.x; colIdx < numColsPadded; colIdx += blockDim.x) {
@@ -240,9 +248,8 @@ __global__ void block_scale_interleave_kernel(int numBatches, int numRows, int n
   }
 }
 
-__global__ void block_scale_interleave_reverse_kernel(int numBatches, int numRows,
-                                                            int numCols, uint8_t const* SFIn,
-                                                            uint8_t* SFOutput) {
+__global__ void block_scale_interleave_reverse_kernel(int numBatches, int numRows, int numCols,
+                                                      uint8_t const* SFIn, uint8_t* SFOutput) {
   for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
     for (int batchIdx = 0; batchIdx < numBatches; batchIdx++) {
       for (int colIdx = threadIdx.x; colIdx < numCols; colIdx += blockDim.x) {
@@ -263,21 +270,21 @@ __global__ void block_scale_interleave_reverse_kernel(int numBatches, int numRow
 
 // This is intended for weight loading, so m and n are large, b <= 256
 void invokeBlockScaleInterleave(int b, int m, int m_padded, int n, int n_padded,
-                                     uint8_t const* SFIn, uint8_t* SFOutput,
-                                     int multiProcessorCount, cudaStream_t stream) {
+                                uint8_t const* SFIn, uint8_t* SFOutput, int multiProcessorCount,
+                                cudaStream_t stream) {
   // Each thread reads 1 int8 value
   dim3 block(std::min(n_padded, 1024));
   // Get number of blocks per SM (assume we can fully utilize the SM).
   int const numBlocksPerSM = std::max(1u, 4096u / block.x);
   dim3 grid(std::min(m_padded, multiProcessorCount * numBlocksPerSM));
 
-  block_scale_interleave_kernel<<<grid, block, 0, stream>>>(b, m, m_padded, n, n_padded, SFIn, SFOutput);
+  block_scale_interleave_kernel<<<grid, block, 0, stream>>>(b, m, m_padded, n, n_padded, SFIn,
+                                                            SFOutput);  // NOTE(review): launch status is not checked
 }
 
 // This is intended for weight loading, so m and n are large, b <= 256
-void invokeBlockScaleInterleaveReverse(int b, int m, int n, uint8_t const* SFIn,
-                                            uint8_t* SFOutput, int multiProcessorCount,
-                                            cudaStream_t stream) {
+void invokeBlockScaleInterleaveReverse(int b, int m, int n, uint8_t const* SFIn, uint8_t* SFOutput,
+                                       int multiProcessorCount, cudaStream_t stream) {
   // Each thread reads 1 int8 value
   dim3 block(std::min(n, 1024));
   // Get number of blocks per SM (assume we can fully utilize the SM).
@@ -288,49 +295,46 @@ void invokeBlockScaleInterleaveReverse(int b, int m, int n, uint8_t const* SFIn,
 }
 
 // Instantiate the function.
-template void invokeFP4Quantization<half, 16>(int b, int m, int n, half const* input, float const* SFScale,
-                                              int64_t* output, int32_t* SFOuput, bool useUE8M0,
-                                              QuantizationSFLayout  layout,
-                                              int multiProcessorCount, cudaStream_t stream);
-template void invokeFP4Quantization<half, 32>(int b, int m, int n, half const* input, float const* SFScale,
-                                              int64_t* output, int32_t* SFOuput, bool useUE8M0,
-                                              QuantizationSFLayout  layout,
-                                              int multiProcessorCount, cudaStream_t stream);
-template void invokeMxFP8Quantization<half>(int b, int m, int n, int padded_n, half const* input, int64_t* output,
-                                                int32_t* SFOuput, QuantizationSFLayout layout, int multiProcessorCount, cudaStream_t stream);
-                                            
+template void invokeFP4Quantization<half, 16>(int b, int m, int n, half const* input,
+                                              float const* SFScale, int64_t* output,
+                                              int32_t* SFOuput, bool useUE8M0,
+                                              QuantizationSFLayout layout, int multiProcessorCount,
+                                              cudaStream_t stream);  // T = half, SF_VEC_SIZE = 16
+template void invokeFP4Quantization<half, 32>(int b, int m, int n, half const* input,
+                                              float const* SFScale, int64_t* output,
+                                              int32_t* SFOuput, bool useUE8M0,
+                                              QuantizationSFLayout layout, int multiProcessorCount,
+                                              cudaStream_t stream);  // T = half, SF_VEC_SIZE = 32
+template void invokeMxFP8Quantization<half>(int b, int m, int n, int padded_n, half const* input,
+                                            int64_t* output, int32_t* SFOuput,
+                                            QuantizationSFLayout layout, int multiProcessorCount,
+                                            cudaStream_t stream);  // T = half
+
 #ifdef ENABLE_BF16
-template void invokeFP4Quantization<__nv_bfloat16, 16>(int b, int m, int n, __nv_bfloat16 const* input,
-                                                       float const* SFScale, int64_t* output,
-                                                       int32_t* SFOuput, bool useUE8M0,
-                                                       QuantizationSFLayout  layout,
-                                                       int multiProcessorCount,
-                                                       cudaStream_t stream);
-template void invokeFP4Quantization<__nv_bfloat16, 32>(int b, int m, int n, __nv_bfloat16 const* input,
-                                                       float const* SFScale, int64_t* output,
-                                                       int32_t* SFOuput, bool useUE8M0,
-                                                       QuantizationSFLayout  layout,
-                                                       int multiProcessorCount,
-                                                       cudaStream_t stream);
-template void invokeMxFP8Quantization<__nv_bfloat16>(int b, int m, int n, int padded_n, __nv_bfloat16 const* input,
-                                                        int64_t* output, int32_t* SFOuput, QuantizationSFLayout layout, int multiProcessorCount, cudaStream_t stream);
-                                                    
+template void invokeFP4Quantization<__nv_bfloat16, 16>(
+    int b, int m, int n, __nv_bfloat16 const* input, float const* SFScale, int64_t* output,
+    int32_t* SFOuput, bool useUE8M0, QuantizationSFLayout layout, int multiProcessorCount,
+    cudaStream_t stream);  // T = __nv_bfloat16, SF_VEC_SIZE = 16
+template void invokeFP4Quantization<__nv_bfloat16, 32>(
+    int b, int m, int n, __nv_bfloat16 const* input, float const* SFScale, int64_t* output,
+    int32_t* SFOuput, bool useUE8M0, QuantizationSFLayout layout, int multiProcessorCount,
+    cudaStream_t stream);  // T = __nv_bfloat16, SF_VEC_SIZE = 32
+template void invokeMxFP8Quantization<__nv_bfloat16>(int b, int m, int n, int padded_n,
+                                                     __nv_bfloat16 const* input, int64_t* output,
+                                                     int32_t* SFOuput, QuantizationSFLayout layout,
+                                                     int multiProcessorCount, cudaStream_t stream);  // T = __nv_bfloat16
 
 #endif
 
 #ifdef ENABLE_FP8
-template void invokeFP4Quantization<__nv_fp8_e4m3, 16>(int b, int m, int n, __nv_fp8_e4m3 const* input,
-                                                       float const* SFScale, int64_t* output,
-                                                       int32_t* SFOuput, bool useUE8M0,
-                                                       QuantizationSFLayout  layout,
-                                                       int multiProcessorCount,
-                                                       cudaStream_t stream);
-template void invokeFP4Quantization<__nv_fp8_e4m3, 32>(int b, int m, int n, __nv_fp8_e4m3 const* input,
-                                                       float const* SFScale, int64_t* output,
-                                                       int32_t* SFOuput, bool useUE8M0,
-                                                       QuantizationSFLayout  layout,
-                                                       int multiProcessorCount,
-                                                       cudaStream_t stream);
+template void invokeFP4Quantization<__nv_fp8_e4m3, 16>(
+    int b, int m, int n, __nv_fp8_e4m3 const* input, float const* SFScale, int64_t* output,
+    int32_t* SFOuput, bool useUE8M0, QuantizationSFLayout layout, int multiProcessorCount,
+    cudaStream_t stream);  // T = __nv_fp8_e4m3, SF_VEC_SIZE = 16
+template void invokeFP4Quantization<__nv_fp8_e4m3, 32>(
+    int b, int m, int n, __nv_fp8_e4m3 const* input, float const* SFScale, int64_t* output,
+    int32_t* SFOuput, bool useUE8M0, QuantizationSFLayout layout, int multiProcessorCount,
+    cudaStream_t stream);  // T = __nv_fp8_e4m3, SF_VEC_SIZE = 32
 
 #endif
 
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/include/common.h b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/include/common.h
index 5b90fbe76..2f49cea35 100644
--- a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/include/common.h
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/include/common.h
@@ -19,6 +19,15 @@
 namespace tensorrt_llm::kernels::cutlass_kernels {
 
 // Note update moe.py to match
-enum class ActivationType { Gelu = 0, Relu, Silu, Swiglu, Geglu, SwigluBias, Identity, InvalidType };
+enum class ActivationType {
+  Gelu = 0,
+  Relu,
+  Silu,
+  Swiglu,
+  Geglu,
+  SwigluBias,
+  Identity,
+  InvalidType
+};
 
 }  // namespace tensorrt_llm::kernels::cutlass_kernels
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h
index 7e3f01031..806c0b44c 100644
--- a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h
@@ -246,7 +246,8 @@ struct TmaWarpSpecializedGroupedGemmInput {
 };
 
 constexpr bool isGatedActivation(ActivationType activation_type) {
-  return activation_type == ActivationType::Swiglu || activation_type == ActivationType::Geglu || activation_type == ActivationType::SwigluBias;
+  return activation_type == ActivationType::Swiglu || activation_type == ActivationType::Geglu ||
+         activation_type == ActivationType::SwigluBias;
 }
 
 template <typename T,                         /*The type used for activations/scales/compute*/
@@ -257,12 +258,13 @@ template <typename T,                         /*The type used for activations/sc
 class MoeGemmRunner {
  public:
   MoeGemmRunner();
-  
+
 #if defined(ENABLE_BF16)
-  static constexpr bool use_wfp4a16
-      = std::is_same_v<WeightType, __nv_fp4_e2m1> && (std::is_same_v<T, half> || std::is_same_v<T, __nv_bfloat16>);
+  static constexpr bool use_wfp4a16 = std::is_same_v<WeightType, __nv_fp4_e2m1> &&
+                                      (std::is_same_v<T, half> || std::is_same_v<T, __nv_bfloat16>);
 #else
-  static constexpr bool use_wfp4a16 = std::is_same_v<WeightType, __nv_fp4_e2m1> && std::is_same_v<T, half>;
+  static constexpr bool use_wfp4a16 =
+      std::is_same_v<WeightType, __nv_fp4_e2m1> && std::is_same_v<T, half>;
 #endif
 
 #if defined(ENABLE_FP8)
@@ -306,8 +308,11 @@ class MoeGemmRunner {
 
   [[nodiscard]] bool isTmaWarpSpecialized(cutlass_extensions::CutlassGemmConfig gemm_config) const;
   [[nodiscard]] bool supportsTmaWarpSpecialized() const;
-  [[nodiscard]] bool isFusedGatedActivation(cutlass_extensions::CutlassGemmConfig gemm_config, ActivationType activation_type, int gemm_n, int gemm_k) const;
-  [[nodiscard]] bool supportsFusedGatedActivation(ActivationType activation_type, int gemm_n, int gemm_k) const;
+  [[nodiscard]] bool isFusedGatedActivation(cutlass_extensions::CutlassGemmConfig gemm_config,
+                                            ActivationType activation_type, int gemm_n,
+                                            int gemm_k) const;
+  [[nodiscard]] bool supportsFusedGatedActivation(ActivationType activation_type, int gemm_n,
+                                                  int gemm_k) const;
 
   size_t getMaxWorkspaceSize(int num_experts) const;
 
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h
index 6367120a8..dbd893f16 100644
--- a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h
@@ -24,7 +24,6 @@
 #ifdef ENABLE_FP4
 #include <cuda_fp4.h>
 #endif
-#include "tensorrt_llm/common/NvInferRuntime.h"
 #include <cuda_runtime_api.h>
 
 #include <array>
@@ -32,6 +31,8 @@
 #include <optional>
 #include <random>
 #include <utility>
+
+#include "tensorrt_llm/common/NvInferRuntime.h"
 namespace tensorrt_llm::kernels {
 // Change to following declarations must sync with lora.h in public repo
 class LoraImpl;
@@ -84,62 +85,53 @@ struct LoraParams {
 };
 
 namespace cutlass_kernels {
-  static inline size_t pad_to_multiple_of_16(size_t const& input)
-  {
-      static constexpr int ALIGNMENT = 16;
-      return ALIGNMENT * ((input + ALIGNMENT - 1) / ALIGNMENT);
+static inline size_t pad_to_multiple_of_16(size_t const& input) {
+  static constexpr int ALIGNMENT = 16;
+  return ALIGNMENT * ((input + ALIGNMENT - 1) / ALIGNMENT);
+}
+
+class CubKeyValueSorter {
+ public:
+  CubKeyValueSorter();
+
+  CubKeyValueSorter(int const num_experts_per_node);
+
+  void updateNumExperts(int const num_experts_per_node);
+
+  static size_t getWorkspaceSize(size_t const num_key_value_pairs, int const num_experts_per_node);
+
+  void run(void* workspace, size_t const workspace_size, int const* keys_in, int* keys_out,
+           int const* values_in, int* values_out, size_t const num_key_value_pairs,
+           cudaStream_t stream);
+
+ private:
+  static int expertsToBits(int experts);
+  int num_experts_;
+  int num_bits_;
+};
+
+struct ActivationParams {
+  ActivationType activation_type;
+  float const* swiglu_alpha = nullptr;
+  float const* swiglu_beta = nullptr;
+  float const* swiglu_limit = nullptr;
+
+  explicit ActivationParams(ActivationType activation_type) : activation_type(activation_type) {
+    TLLM_CHECK_WITH_INFO(
+        activation_type != ActivationType::SwigluBias,
+        "SwigluBias is not supported in ActivationParams without swiglu_alpha and swiglu_beta");
   }
-  
-  class CubKeyValueSorter
-  {
-  public:
-      CubKeyValueSorter();
-  
-      CubKeyValueSorter(int const num_experts_per_node);
-  
-      void updateNumExperts(int const num_experts_per_node);
-  
-      static size_t getWorkspaceSize(size_t const num_key_value_pairs, int const num_experts_per_node);
-  
-      void run(void* workspace, size_t const workspace_size, int const* keys_in, int* keys_out, int const* values_in,
-          int* values_out, size_t const num_key_value_pairs, cudaStream_t stream);
-  
-  private:
-      static int expertsToBits(int experts);
-      int num_experts_;
-      int num_bits_;
-  };
-  
-  struct ActivationParams
-  {
-      ActivationType activation_type;
-      float const* swiglu_alpha = nullptr;
-      float const* swiglu_beta = nullptr;
-      float const* swiglu_limit = nullptr;
-  
-      explicit ActivationParams(ActivationType activation_type)
-          : activation_type(activation_type)
-      {
-          TLLM_CHECK_WITH_INFO(activation_type != ActivationType::SwigluBias,
-              "SwigluBias is not supported in ActivationParams without swiglu_alpha and swiglu_beta");
-      }
-  
-      ActivationParams(
-          ActivationType activation_type, float const* swiglu_alpha, float const* swiglu_beta, float const* swiglu_limit)
-          : activation_type(activation_type)
-          , swiglu_alpha(swiglu_alpha)
-          , swiglu_beta(swiglu_beta)
-          , swiglu_limit(swiglu_limit)
-      {
-      }
-  
-      // TODO Port everything properly and get rid of these implicit conversions
-      operator ActivationType() const
-      {
-          return activation_type;
-      }
-  };
-  
+
+  ActivationParams(ActivationType activation_type, float const* swiglu_alpha,
+                   float const* swiglu_beta, float const* swiglu_limit)
+      : activation_type(activation_type),
+        swiglu_alpha(swiglu_alpha),
+        swiglu_beta(swiglu_beta),
+        swiglu_limit(swiglu_limit) {}
+
+  // TODO Port everything properly and get rid of these implicit conversions
+  operator ActivationType() const { return activation_type; }
+};
 
 /**
  * \brief Describes what parallelism mode the MoE is using
@@ -535,16 +527,17 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface {
   using Self = CutlassMoeFCRunner<T, WeightType, OutputType, InputType, BackBoneType>;
 
 #if defined(ENABLE_BF16)
-  static constexpr bool use_wfp4a16
-      = std::is_same_v<WeightType, __nv_fp4_e2m1> && (std::is_same_v<T, half> || std::is_same_v<T, __nv_bfloat16>);
+  static constexpr bool use_wfp4a16 = std::is_same_v<WeightType, __nv_fp4_e2m1> &&
+                                      (std::is_same_v<T, half> || std::is_same_v<T, __nv_bfloat16>);
 #else
-  static constexpr bool use_wfp4a16 = std::is_same_v<WeightType, __nv_fp4_e2m1> && std::is_same_v<T, half>;
+  static constexpr bool use_wfp4a16 =
+      std::is_same_v<WeightType, __nv_fp4_e2m1> && std::is_same_v<T, half>;
 #endif
 
 #if defined(ENABLE_FP8)
   static constexpr bool use_fp8 =
-      (std::is_same_v<T, __nv_fp8_e4m3> ||
-       std::is_same_v<T, __nv_fp8_e5m2>)&&!std::is_same_v<WeightType, cutlass::uint4b_t>;
+      (std::is_same_v<T, __nv_fp8_e4m3> || std::is_same_v<T, __nv_fp8_e5m2>) &&
+      !std::is_same_v<WeightType, cutlass::uint4b_t>;
   static constexpr bool use_w4afp8 =
       std::is_same_v<WeightType, cutlass::uint4b_t> && std::is_same_v<T, __nv_fp8_e4m3>;
   static_assert(!std::is_same_v<BackBoneType, __nv_fp8_e4m3>,
@@ -642,10 +635,10 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface {
       TmaWarpSpecializedGroupedGemmInput::ElementSF const* fc1_fp4_act_flat,
       TmaWarpSpecializedGroupedGemmInput::ElementSF* fc2_fp4_act_flat, QuantParams quant_params,
       int64_t const num_rows, int64_t const expanded_num_rows, int64_t const hidden_size,
-      int64_t const inter_size, int const num_experts_per_node, ActivationParams fc1_activation_type,
-      float const** alpha_scale_ptr_array, bool bias_is_broadcast, cudaStream_t stream,
-      cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode,
-      int* num_active_experts_per, int* active_expert_global_ids);
+      int64_t const inter_size, int const num_experts_per_node,
+      ActivationParams fc1_activation_type, float const** alpha_scale_ptr_array,
+      bool bias_is_broadcast, cudaStream_t stream, cutlass_extensions::CutlassGemmConfig config,
+      bool min_latency_mode, int* num_active_experts_per, int* active_expert_global_ids);
 
   static void gemm2(
       MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>& gemm_runner,
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl
index c2526ae69..a7b5dac5a 100644
--- a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl
@@ -81,7 +81,8 @@ void sm90_generic_mixed_moe_gemm_kernelLauncher(
 
   // B matrix configuration
   using ElementB_ = typename TllmToCutlassTypeAdapter<WeightType>::type;
-  using ElementB = std::conditional_t<std::is_same_v<WeightType, cutlass::uint4b_t>, cutlass::int4b_t, ElementB_>;
+  using ElementB = std::conditional_t<std::is_same_v<WeightType, cutlass::uint4b_t>,
+                                      cutlass::int4b_t, ElementB_>;
   using LayoutB = cutlass::layout::ColumnMajor;  // Layout type for B matrix operand
   constexpr int AlignmentB =
       128 / cutlass::sizeof_bits<ElementB>::value;  // Memory access granularity/alignment of B
@@ -100,8 +101,9 @@ void sm90_generic_mixed_moe_gemm_kernelLauncher(
   constexpr int group_size = use_wfp4a16 ? cutlass::gemm::collective::detail::mxfp4_group_size
                                          : cutlass::gemm::collective::detail::int4_group_size;
   constexpr int PackedScalesNum = get<2>(CTAShape{}) / group_size;
-  using ElementScale = std::conditional_t<use_wfp4a16, cutlass::float_ue8m0_t,
-      TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::SFA>;
+  using ElementScale =
+      std::conditional_t<use_wfp4a16, cutlass::float_ue8m0_t,
+                         TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::SFA>;
   using ElementScalePacked = cutlass::Array<ElementScale, PackedScalesNum>;
   using LayoutScale = cutlass::layout::RowMajor;
 
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_fp4.cu b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_fp4.cu
index c0b9159db..45d4fa64c 100644
--- a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_fp4.cu
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_fp4.cu
@@ -16,9 +16,8 @@
 
 #include "tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h"
 
-namespace tensorrt_llm::kernels::cutlass_kernels
-{
+namespace tensorrt_llm::kernels::cutlass_kernels {
 #ifdef ENABLE_BF16
 template class MoeGemmRunner<__nv_bfloat16, __nv_fp4_e2m1, __nv_bfloat16>;
 #endif
-} // namespace tensorrt_llm::kernels::cutlass_kernels
+}  // namespace tensorrt_llm::kernels::cutlass_kernels
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_fp4.cu b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_fp4.cu
index 1da91c2de..12b79eb4c 100644
--- a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_fp4.cu
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_fp4.cu
@@ -16,7 +16,6 @@
 
 #include "tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h"
 
-namespace tensorrt_llm::kernels::cutlass_kernels
-{
+namespace tensorrt_llm::kernels::cutlass_kernels {
 template class MoeGemmRunner<half, __nv_fp4_e2m1, half>;
 }
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h
index a00b71f8a..7f9c0fe14 100644
--- a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h
@@ -631,9 +631,12 @@ int MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>::getSM() const {
 // currently support sm80 bf16/fp16 gate activation, only set predication tensor for m direction
 template <typename T, typename WeightType, typename OutputType, typename ScaleBiasType>
 bool MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>::supportsFusedGatedActivation(
-  ActivationType activation_type, int gemm_n, int gemm_k) const {
+    ActivationType activation_type, int gemm_n, int gemm_k) const {
   constexpr bool ENABLE_FUSED_GATED_ACTIVATION = true;
-  return (activation_type == ActivationType::Swiglu || activation_type == ActivationType::Geglu) && std::is_same_v<T, WeightType> && !std::is_same_v<T, float> && !use_fp8 && (this->getSM() >= 80) && (gemm_k % 64 == 0) && (gemm_n % 64 == 0) && ENABLE_FUSED_GATED_ACTIVATION;
+  return (activation_type == ActivationType::Swiglu || activation_type == ActivationType::Geglu) &&
+         std::is_same_v<T, WeightType> && !std::is_same_v<T, float> && !use_fp8 &&
+         (this->getSM() >= 80) && (gemm_k % 64 == 0) && (gemm_n % 64 == 0) &&
+         ENABLE_FUSED_GATED_ACTIVATION;
 }
 
 template <typename T, typename WeightType, typename OutputType, typename ScaleBiasType>
@@ -672,34 +675,27 @@ void MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>::dispatchToArch(
                        "Hopper configuration provided for non-Hopper architecture");
 
   if (sm_ >= 75 && sm_ < 80) {
-    if constexpr (!std::is_same_v<WeightType, __nv_fp4_e2m1>)
-    {
-        dispatchMoeGemmToCutlass<T, WeightType, ScaleBiasType, cutlass::arch::Sm75, EpilogueTag>(
-            inputs, multi_processor_count_);
-    }
-    else
-    {
-        TLLM_THROW("FP4 data type is not supported on SM < 90");
+    if constexpr (!std::is_same_v<WeightType, __nv_fp4_e2m1>) {
+      dispatchMoeGemmToCutlass<T, WeightType, ScaleBiasType, cutlass::arch::Sm75, EpilogueTag>(
+          inputs, multi_processor_count_);
+    } else {
+      TLLM_THROW("FP4 data type is not supported on SM < 90");
     }
   } else if (sm_ >= 80 && sm_ < 90) {
-    if constexpr (!std::is_same_v<WeightType, __nv_fp4_e2m1>)
-    {
-        if constexpr (use_fp8 || use_w4afp8)
-        {
-
+    if constexpr (!std::is_same_v<WeightType, __nv_fp4_e2m1>) {
+      if constexpr (use_fp8 || use_w4afp8) {
 #if defined(ENABLE_FP8)
-        static_assert(
-            !std::is_same_v<OutputType, __nv_fp8_e4m3> && !std::is_same_v<OutputType, __nv_fp8_e5m2>,
-            "FP8 GEMM Output not supported");
+        static_assert(!std::is_same_v<OutputType, __nv_fp8_e4m3> &&
+                          !std::is_same_v<OutputType, __nv_fp8_e5m2>,
+                      "FP8 GEMM Output not supported");
 #endif
-      TLLM_CHECK_WITH_INFO(sm_ == 89, "For sm >= 80 and < 90, fp8 is only supported with sm == 89");
-      dispatchMoeGemmToCutlass<T, WeightType, ScaleBiasType, cutlass::arch::Sm89, EpilogueTag>(
-          inputs, multi_processor_count_);
-      }
-      else
-      {
-      dispatchMoeGemmToCutlass<T, WeightType, ScaleBiasType, cutlass::arch::Sm80, EpilogueTag>(
-          inputs, multi_processor_count_);
+        TLLM_CHECK_WITH_INFO(sm_ == 89,
+                             "For sm >= 80 and < 90, fp8 is only supported with sm == 89");
+        dispatchMoeGemmToCutlass<T, WeightType, ScaleBiasType, cutlass::arch::Sm89, EpilogueTag>(
+            inputs, multi_processor_count_);
+      } else {
+        dispatchMoeGemmToCutlass<T, WeightType, ScaleBiasType, cutlass::arch::Sm80, EpilogueTag>(
+            inputs, multi_processor_count_);
       }
     } else {
       TLLM_THROW("FP4 data type is not supported on SM < 90");
@@ -758,40 +754,35 @@ void MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>::dispatchToArch(
 #if defined(ENABLE_FP8)
     // Hopper finegrained INT4 WS grouped GEMM
     if constexpr (use_w4afp8) {
-      TLLM_CHECK_WITH_INFO(
-        inputs.gemm_config.is_tma_warp_specialized, "w4afp8 is only supported for TMA warp specialization");
-    // EpilogueTag is ignored
-    if (inputs.k % 512 == 0)
-    {
-    sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass<T, WeightType, ScaleBiasType,
-    cutlass_extensions::EpilogueOpDefault, 4>(inputs, hopper_inputs, multi_processor_count_, nullptr);
-}
-else if (inputs.k % 256 == 0)
-{
-sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass<T, WeightType, ScaleBiasType,
-    cutlass_extensions::EpilogueOpDefault, 2>(inputs, hopper_inputs, multi_processor_count_, nullptr);
-}
-else if (inputs.k % 128 == 0)
-{
-sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass<T, WeightType, ScaleBiasType,
-    cutlass_extensions::EpilogueOpDefault, 1>(inputs, hopper_inputs, multi_processor_count_, nullptr);
-}
-else
-{
-TLLM_THROW("Invalid GEMM K size %d", (int) inputs.k);
-}
-return;
-}
-
-if constexpr (use_wfp4a16)
-{
-TLLM_CHECK_WITH_INFO(
-inputs.gemm_config.is_tma_warp_specialized, "wfp4a16 is only supported for TMA warp specialization");
-// EpilogueTag is ignored
-sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass<T, WeightType, ScaleBiasType,
-cutlass_extensions::EpilogueOpDefault, 1>(inputs, hopper_inputs, multi_processor_count_, nullptr);
-return;
+      TLLM_CHECK_WITH_INFO(inputs.gemm_config.is_tma_warp_specialized,
+                           "w4afp8 is only supported for TMA warp specialization");
+      // EpilogueTag is ignored
+      if (inputs.k % 512 == 0) {
+        sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass<T, WeightType, ScaleBiasType,
+                                                      cutlass_extensions::EpilogueOpDefault, 4>(
+            inputs, hopper_inputs, multi_processor_count_, nullptr);
+      } else if (inputs.k % 256 == 0) {
+        sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass<T, WeightType, ScaleBiasType,
+                                                      cutlass_extensions::EpilogueOpDefault, 2>(
+            inputs, hopper_inputs, multi_processor_count_, nullptr);
+      } else if (inputs.k % 128 == 0) {
+        sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass<T, WeightType, ScaleBiasType,
+                                                      cutlass_extensions::EpilogueOpDefault, 1>(
+            inputs, hopper_inputs, multi_processor_count_, nullptr);
+      } else {
+        TLLM_THROW("Invalid GEMM K size %d", (int)inputs.k);
+      }
+      return;
+    }
 
+    if constexpr (use_wfp4a16) {
+      TLLM_CHECK_WITH_INFO(inputs.gemm_config.is_tma_warp_specialized,
+                           "wfp4a16 is only supported for TMA warp specialization");
+      // EpilogueTag is ignored
+      sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass<T, WeightType, ScaleBiasType,
+                                                    cutlass_extensions::EpilogueOpDefault, 1>(
+          inputs, hopper_inputs, multi_processor_count_, nullptr);
+      return;
     }
 #endif
 
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws_mixed_dtype.h b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws_mixed_dtype.h
index 722a49292..6b1702d58 100644
--- a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws_mixed_dtype.h
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws_mixed_dtype.h
@@ -160,10 +160,10 @@ void sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass(
   // perform the best for mixed type gemms.
 
   constexpr int Ntile = (std::is_same_v<WeightType, __nv_fp4_e2m1>) ? 64 : 128;
-  constexpr int Ktile = (std::is_same_v<WeightType, __nv_fp4_e2m1>) ? 128 : 128 * PackedScalesNum / sizeof(T);
+  constexpr int Ktile =
+      (std::is_same_v<WeightType, __nv_fp4_e2m1>) ? 128 : 128 * PackedScalesNum / sizeof(T);
   TLLM_CHECK(sizeof(T) == (std::is_same_v<WeightType, __nv_fp4_e2m1>) ? 2 : 1);
 
-
   using _Ntile = Int<Ntile>;
   using _Ktile = Int<Ktile>;
   switch (inputs.gemm_config.tile_config_sm90) {
@@ -249,11 +249,12 @@ size_t calcMaxWorkspaceSizeTmaWarpSpecializedMixedInput(int num_experts, int sm_
 #ifdef COMPILE_HOPPER_TMA_GROUPED_GEMMS
   GroupedGemmInput<T, WeightType, OutputType, OutputType> inputs{};
   inputs.num_experts = num_experts;
-  sm90_generic_mixed_moe_gemm_kernelLauncher<
-      T, WeightType, OutputType, tensorrt_llm::cutlass_extensions::EpilogueOpDefault,
-      Shape<_128, _64, _Ktile>, Shape<_1, _1, _1>, cutlass::gemm::KernelTmaWarpSpecializedCooperative,
-      cutlass::epilogue::TmaWarpSpecializedCooperative,
-      cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY>(
+  sm90_generic_mixed_moe_gemm_kernelLauncher<T, WeightType, OutputType,
+                                             tensorrt_llm::cutlass_extensions::EpilogueOpDefault,
+                                             Shape<_128, _64, _Ktile>, Shape<_1, _1, _1>,
+                                             cutlass::gemm::KernelTmaWarpSpecializedCooperative,
+                                             cutlass::epilogue::TmaWarpSpecializedCooperative,
+                                             cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY>(
       inputs, TmaWarpSpecializedGroupedGemmInput{}, sm_count_, &count);
 #endif
   return count;
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_tma_warp_specialized_traits.h b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_tma_warp_specialized_traits.h
index 76e6659c4..d6b89680e 100644
--- a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_tma_warp_specialized_traits.h
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_tma_warp_specialized_traits.h
@@ -67,9 +67,9 @@ constexpr bool isValidHopperMOESpecialisation() {
 #if defined(CUTLASS_ARCH_MMA_MODIFIABLE_TMA_SM90_SUPPORTED)
   return (cutlass::platform::is_same<T, WeightType>::value ||
           (cutlass::platform::is_same<cutlass::uint4b_t, WeightType>::value &&
-            cutlass::platform::is_same<T, __nv_fp8_e4m3>::value)
-            || (cutlass::platform::is_same<__nv_fp4_e2m1, WeightType>::value
-                && !cutlass::platform::is_same<T, __nv_fp8_e4m3>::value))
+           cutlass::platform::is_same<T, __nv_fp8_e4m3>::value) ||
+          (cutlass::platform::is_same<__nv_fp4_e2m1, WeightType>::value &&
+           !cutlass::platform::is_same<T, __nv_fp8_e4m3>::value))
 
 #ifdef ENABLE_FP4
          && !cutlass::platform::is_same<T, __nv_fp4_e2m1>::value
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/quantization.cuh b/csrc/nv_internal/tensorrt_llm/kernels/quantization.cuh
index f4bc1d39f..9d0796c27 100644
--- a/csrc/nv_internal/tensorrt_llm/kernels/quantization.cuh
+++ b/csrc/nv_internal/tensorrt_llm/kernels/quantization.cuh
@@ -354,24 +354,22 @@ inline __device__ uint64_t fp32_vec_to_e2m1(float2 (&array)[8]) {
 }
 
 // Convert 4 float2 values into 8 e4m3 values (represented as one uint64_t).
-inline __device__ uint64_t fp32_vec_to_e4m3(float2 (&array)[4])
-{
-    union
-    {
-        uint64_t val;
-        __nv_fp8x2_e4m3 elts[4];
-    } u;
-
-    static_assert(sizeof(u.val) == sizeof(u.elts), "Expected to alias uint64_t and __nv_fp8x2_e4m3[4]");
-
-    u.elts[0] = __nv_fp8x2_e4m3(array[0]);
-    u.elts[1] = __nv_fp8x2_e4m3(array[1]);
-    u.elts[2] = __nv_fp8x2_e4m3(array[2]);
-    u.elts[3] = __nv_fp8x2_e4m3(array[3]);
-    return u.val;
+inline __device__ uint64_t fp32_vec_to_e4m3(float2 (&array)[4]) {
+  union {
+    uint64_t val;
+    __nv_fp8x2_e4m3 elts[4];
+  } u;
+
+  static_assert(sizeof(u.val) == sizeof(u.elts),
+                "Expected to alias uint64_t and __nv_fp8x2_e4m3[4]");
+
+  u.elts[0] = __nv_fp8x2_e4m3(array[0]);
+  u.elts[1] = __nv_fp8x2_e4m3(array[1]);
+  u.elts[2] = __nv_fp8x2_e4m3(array[2]);
+  u.elts[3] = __nv_fp8x2_e4m3(array[3]);
+  return u.val;
 }
 
-
 // Fast reciprocal.
 inline __device__ float reciprocal_approximate_ftz(float a) {
   float b;
@@ -379,13 +377,11 @@ inline __device__ float reciprocal_approximate_ftz(float a) {
   return b;
 }
 
-__device__ __forceinline__ float exp2f_rcp(uint8_t exp)
-{
-    constexpr uint32_t FP32_EXPONENT_BIAS = 127;
-    return (exp == 0) ? 1 : exp2f(FP32_EXPONENT_BIAS - static_cast<float>(exp));
+__device__ __forceinline__ float exp2f_rcp(uint8_t exp) {
+  constexpr uint32_t FP32_EXPONENT_BIAS = 127;
+  return (exp == 0) ? 1 : exp2f(FP32_EXPONENT_BIAS - static_cast<float>(exp));
 }
 
-
 // Define a 16 bytes packed data type.
 template <class Type>
 struct PackedVec {
@@ -410,11 +406,11 @@ __device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
 
 // Local maximum value.
 #pragma unroll
-  for (int i = 1; i < CVT_ELTS_PER_THREAD  / 2; i++) {
+  for (int i = 1; i < CVT_ELTS_PER_THREAD / 2; i++) {
     localMax = cuda_max(localMax, cuda_abs(vec.elts[i]));
   }
 
-  constexpr int CVT_NUM_THREADS_PER_SF = SF_VEC_SIZE / CVT_ELTS_PER_THREAD ;
+  constexpr int CVT_NUM_THREADS_PER_SF = SF_VEC_SIZE / CVT_ELTS_PER_THREAD;
   // Get the absolute maximum among all 16 values (two threads for 16, four threads for 32).
   localMax = cuda_max(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax);
   if constexpr (CVT_NUM_THREADS_PER_SF == 4) {
@@ -429,17 +425,17 @@ __device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
   // Write the SF to global memory (STG.8).
   if constexpr (UE8M0_SF) {
     __nv_fp8_e8m0 tmp;
-        // Scale the max value to the range of E2m1.
-        vecMax *= reciprocal_approximate_ftz(6.0f);
-        tmp.__x = __nv_cvt_float_to_e8m0(vecMax, __NV_SATFINITE, cudaRoundPosInf);
+    // Scale the max value to the range of E2m1.
+    vecMax *= reciprocal_approximate_ftz(6.0f);
+    tmp.__x = __nv_cvt_float_to_e8m0(vecMax, __NV_SATFINITE, cudaRoundPosInf);
 
     fp8SFVal = tmp.__x;
     outputScale = vecMax != 0 ? exp2f_rcp(fp8SFVal) : 0.0f;
   } else {
-            // Get the SF (max value of the vector / max value of e2m1).
-        // maximum value of e2m1 = 6.0.
-        // TODO: use half as compute data type.
-        auto SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f));
+    // Get the SF (max value of the vector / max value of e2m1).
+    // maximum value of e2m1 = 6.0.
+    // TODO: use half as compute data type.
+    auto SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f));
 
     // Here SFValue is always positive, so E4M3 is the same as UE4M3.
     __nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue);
@@ -447,11 +443,11 @@ __device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
     SFValue = static_cast<float>(tmp);
     // Get the output scale.
     // Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal)) * reciprocal(SFScaleVal))
-    outputScale = vecMax != 0 ? reciprocal_approximate_ftz(SFValue * reciprocal_approximate_ftz(SFScaleVal)) : 0.0f;
-
+    outputScale = vecMax != 0
+                      ? reciprocal_approximate_ftz(SFValue * reciprocal_approximate_ftz(SFScaleVal))
+                      : 0.0f;
   }
 
-
   if (SFout) {
     // Write the SF to global memory (STG.8).
     *SFout = fp8SFVal;
@@ -564,72 +560,64 @@ __device__ uint64_t cvt_warp_fp8_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
 
 // Quantizes the provided PackedVec into the uint64_t output
 template <class Type, int SF_VEC_SIZE>
-__device__ uint64_t cvt_warp_fp16_to_mxfp8(PackedVec<Type>& vec, uint8_t* SFout)
-{
+__device__ uint64_t cvt_warp_fp16_to_mxfp8(PackedVec<Type>& vec, uint8_t* SFout) {
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-    // Get absolute maximum values among the local 8 values.
-    auto localMax = cuda_abs(vec.elts[0]);
+  // Get absolute maximum values among the local 8 values.
+  auto localMax = cuda_abs(vec.elts[0]);
 
 // Local maximum value.
 #pragma unroll
-    for (int i = 1; i < CVT_ELTS_PER_THREAD / 2; i++)
-    {
-        localMax = cuda_max(localMax, cuda_abs(vec.elts[i]));
-    }
+  for (int i = 1; i < CVT_ELTS_PER_THREAD / 2; i++) {
+    localMax = cuda_max(localMax, cuda_abs(vec.elts[i]));
+  }
 
-    constexpr int CVT_NUM_THREADS_PER_SF = SF_VEC_SIZE / CVT_ELTS_PER_THREAD;
-    // Get the absolute maximum among all 16 values (two threads for 16, four threads for 32).
-    localMax = cuda_max(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax);
-    if constexpr (CVT_NUM_THREADS_PER_SF == 4)
-    {
-        localMax = cuda_max(__shfl_xor_sync(uint32_t(-1), localMax, 2), localMax);
-    }
-    // Get the final absolute maximum values.
-    float vecMax = float(cuda_max(localMax.x, localMax.y));
+  constexpr int CVT_NUM_THREADS_PER_SF = SF_VEC_SIZE / CVT_ELTS_PER_THREAD;
+  // Get the absolute maximum among all 16 values (two threads for 16, four threads for 32).
+  localMax = cuda_max(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax);
+  if constexpr (CVT_NUM_THREADS_PER_SF == 4) {
+    localMax = cuda_max(__shfl_xor_sync(uint32_t(-1), localMax, 2), localMax);
+  }
+  // Get the final absolute maximum values.
+  float vecMax = float(cuda_max(localMax.x, localMax.y));
 
-    // Get the SF (max value of the vector / max value of mxfp8).
-    float SFValue = vecMax * reciprocal_approximate_ftz(448.0f);
-    // 8 bits representation of the SF.
-    uint8_t fp8SFVal;
+  // Get the SF (max value of the vector / max value of mxfp8).
+  float SFValue = vecMax * reciprocal_approximate_ftz(448.0f);
+  // 8 bits representation of the SF.
+  uint8_t fp8SFVal;
+  // Encode the SF as E8M0 (round toward +inf, saturate to finite).
+  __nv_fp8_e8m0 tmpSFVal;
+  tmpSFVal.__x = __nv_cvt_float_to_e8m0(SFValue, __NV_SATFINITE, cudaRoundPosInf);
+  SFValue = static_cast<float>(tmpSFVal);
+  fp8SFVal = tmpSFVal.__x;
+  // Get the output scale (reciprocal of the SFValue).
+  float outputScale = vecMax != 0.f ? reciprocal_approximate_ftz(SFValue) : 0.0f;
+
+  if (SFout) {
     // Write the SF to global memory (STG.8).
-    __nv_fp8_e8m0 tmpSFVal;
-    tmpSFVal.__x = __nv_cvt_float_to_e8m0(SFValue, __NV_SATFINITE, cudaRoundPosInf);
-    SFValue = static_cast<float>(tmpSFVal);
-    fp8SFVal = tmpSFVal.__x;
-    // Get the output scale (reciprocal of the SFValue).
-    float outputScale = vecMax != 0.f ? reciprocal_approximate_ftz(SFValue) : 0.0f;
-
-    if (SFout)
-    {
-        // Write the SF to global memory (STG.8).
-        *SFout = fp8SFVal;
-    }
+    *SFout = fp8SFVal;
+  }
 
-    // Convert the input to float.
-    float2 fp2Vals[CVT_ELTS_PER_THREAD / 2];
+  // Convert the input to float.
+  float2 fp2Vals[CVT_ELTS_PER_THREAD / 2];
 
 #pragma unroll
-    for (int i = 0; i < CVT_ELTS_PER_THREAD / 2; i++)
-    {
-        if constexpr (std::is_same_v<Type, half>)
-        {
-            fp2Vals[i] = __half22float2(vec.elts[i]);
-        }
-        else
-        {
-            fp2Vals[i] = __bfloat1622float2(vec.elts[i]);
-        }
-        fp2Vals[i].x *= outputScale;
-        fp2Vals[i].y *= outputScale;
+  for (int i = 0; i < CVT_ELTS_PER_THREAD / 2; i++) {
+    if constexpr (std::is_same_v<Type, half>) {
+      fp2Vals[i] = __half22float2(vec.elts[i]);
+    } else {
+      fp2Vals[i] = __bfloat1622float2(vec.elts[i]);
     }
+    fp2Vals[i].x *= outputScale;
+    fp2Vals[i].y *= outputScale;
+  }
 
-    // Convert to e4m3 values.
-    uint64_t e4m3Vec = fp32_vec_to_e4m3(fp2Vals);
+  // Convert to e4m3 values.
+  uint64_t e4m3Vec = fp32_vec_to_e4m3(fp2Vals);
 
-    // Write the e4m3 values to global memory.
-    return e4m3Vec;
+  // Return the packed e4m3 values (no store is issued here).
+  return e4m3Vec;
 #else
-    return 0;
+  return 0;
 #endif
 }
 
@@ -656,9 +644,9 @@ inline __device__ __host__ int64_t get_sf_out_offset_128x4(std::optional<int> ba
   int32_t kTileIdx = (kIdx / 4);
   int64_t kTileStride = 32 * outerMStride;  // 512
 
-    // SF vector size 16 or 32. We round the "numCols" up to a multiple of 64 or 128.
-    // It is the same as rounding the "numColVecs" up to a multiple of 4.
-    int32_t numKTiles = (numColVecs + 4 - 1) / 4;
+  // SF vector size 16 or 32. We round the "numCols" up to a multiple of 64 or 128.
+  // It is the same as rounding the "numColVecs" up to a multiple of 4.
+  int32_t numKTiles = (numColVecs + 4 - 1) / 4;
 
   int32_t mTileIdx = mIdx / (32 * 4);
   int64_t mTileStride = numKTiles * kTileStride;
@@ -677,16 +665,17 @@ inline __device__ __host__ int64_t get_sf_out_offset_128x4(std::optional<int> ba
 
 template <class SFType, int CVT_NUM_THREADS_PER_SF>
 __device__ uint8_t* cvt_quant_get_sf_out_offset(std::optional<int> batchIdx, int rowIdx,
-                                                       int colVecIdx, std::optional<int> numRows,
-                                                       int numColVecs, SFType* SFout,
-                                                       QuantizationSFLayout layout) {
+                                                int colVecIdx, std::optional<int> numRows,
+                                                int numColVecs, SFType* SFout,
+                                                QuantizationSFLayout layout) {
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-static_assert(CVT_NUM_THREADS_PER_SF == 1 || CVT_NUM_THREADS_PER_SF == 2 || CVT_NUM_THREADS_PER_SF == 4);
+  static_assert(CVT_NUM_THREADS_PER_SF == 1 || CVT_NUM_THREADS_PER_SF == 2 ||
+                CVT_NUM_THREADS_PER_SF == 4);
 
   // One pair of threads write one SF to global memory.
   // TODO: stage through smem for packed STG.32
   // is it better than STG.8 from 4 threads ?
-  if (threadIdx.x % CVT_NUM_THREADS_PER_SF  == 0) {
+  if (threadIdx.x % CVT_NUM_THREADS_PER_SF == 0) {
     if (layout == QuantizationSFLayout::SWIZZLED) {
       // SF vector index (16 elements share one SF in the K dimension).
       // numRows and numCols are unpadded.
@@ -717,115 +706,101 @@ static_assert(CVT_NUM_THREADS_PER_SF == 1 || CVT_NUM_THREADS_PER_SF == 2 || CVT_
 template <BlockScaleQuantizationType quantization_type, class Type, int SF_VEC_SIZE, bool UE8M0_SF>
 __global__ void
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-    __launch_bounds__(512, 4) quantize_with_block_size(
+__launch_bounds__(512, 4) quantize_with_block_size(
 #else
 quantize_with_block_size(
 #endif
-        int32_t numbatches, int32_t numRows, int32_t numCols, int32_t numPaddedCols, Type const* in,
-        float const* SFScale, uint32_t* out, uint32_t* SFout, QuantizationSFLayout layout)
-{
+    int32_t numbatches, int32_t numRows, int32_t numCols, int32_t numPaddedCols, Type const* in,
+    float const* SFScale, uint32_t* out, uint32_t* SFout, QuantizationSFLayout layout) {
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
 
-    // The elements per thread.
-    static constexpr int ELTS_PER_THREAD = quantization_type == BlockScaleQuantizationType::FP8_TO_FP4
-        ? CVT_FP8_TO_FP4_ELTS_PER_THREAD
-        : CVT_ELTS_PER_THREAD;
-
-    using PackedVec = PackedVec<Type>;
-    static constexpr int CVT_NUM_THREADS_PER_SF = SF_VEC_SIZE / ELTS_PER_THREAD; // 2 or 4
-    static_assert(sizeof(PackedVec) == sizeof(Type) * ELTS_PER_THREAD, "Vec size is not matched.");
-
-    // Get the global scaling factor, which will be applied to the SF.
-    // Note SFScale is the same as next GEMM's alpha, which is (448.f / (Alpha_A / 6.f)).
-    float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0];
-
-    // Is it swizzled layout?
-    bool isSfSwizzledLayout = layout == QuantizationSFLayout::SWIZZLED;
-
-    // The number of padded rows considering 128x4 SF layout.
-    int numPaddedRowsForSf = isSfSwizzledLayout ? PadUpFn(numRows, 128) : numRows;
-    int numColsForSf = isSfSwizzledLayout ? PadUpFn(numPaddedCols, 4 * SF_VEC_SIZE) : numPaddedCols;
-
-    // The number of threads in the column dimension。
-    // Note that numCols/numPaddedCols/numColsForSf are guaranteed to be multiples of ELTS_PER_THREAD.
-    int numColThreads = numCols / ELTS_PER_THREAD;
-    int numPaddedColThreads = numPaddedCols / ELTS_PER_THREAD;
-    int numColThreadsForSf = numColsForSf / ELTS_PER_THREAD;
-
-    asm volatile("griddepcontrol.wait;");
-    // Input tensor batch/row/col loops.
-    for (int rowIdx = blockIdx.x; rowIdx < numPaddedRowsForSf; rowIdx += gridDim.x)
-    {
-        for (int batchIdx = 0; batchIdx < numbatches; batchIdx++)
-        {
-            for (int colIdx = threadIdx.x; colIdx < numColThreadsForSf; colIdx += blockDim.x)
-            {
-                std::optional<int> optionalBatchIdx = batchIdx;
-                std::optional<int> optionalNumRows = numRows;
-
-                // The SF output pointer.
-                auto sf_out = cvt_quant_get_sf_out_offset<uint32_t, CVT_NUM_THREADS_PER_SF>(
-                    optionalBatchIdx, rowIdx, colIdx, optionalNumRows, numPaddedCols / SF_VEC_SIZE, SFout, layout);
-
-                // The input tensor offset.
-                int64_t inOffset = static_cast<int64_t>(batchIdx * numRows + rowIdx) * numColThreads + colIdx;
-                int64_t outOffset = static_cast<int64_t>(batchIdx * numRows + rowIdx) * numPaddedColThreads + colIdx;
-
-                // Set the values to 0 of those are padded columns.
-                if (rowIdx < numRows && colIdx >= numColThreads && colIdx < numPaddedColThreads)
-                {
-                    // Dispatch the quantization kernel.
-                    if constexpr (quantization_type == BlockScaleQuantizationType::FP16_TO_FP4)
-                    {
-                        reinterpret_cast<uint32_t*>(out)[outOffset] = 0u;
-                    }
-                    else if constexpr (quantization_type == BlockScaleQuantizationType::FP8_TO_FP4
-                        || quantization_type == BlockScaleQuantizationType::FP16_TO_MXFP8)
-                    {
-                        reinterpret_cast<uint64_t*>(out)[outOffset] = 0ull;
-                    }
-                }
-
-                // Set the SF padding to 0.
-                if (rowIdx >= numRows || colIdx >= numColThreads)
-                {
-                    // Set the SF padding to 0.
-                    if (sf_out != nullptr)
-                    {
-                        sf_out[0] = 0x00;
-                    }
-                }
-                else
-                {
-                    // Load the input vector.
-                    PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
-
-                    // Dispatch the quantization kernel.
-                    if constexpr (quantization_type == BlockScaleQuantizationType::FP16_TO_FP4)
-                    {
-                        reinterpret_cast<uint32_t*>(out)[outOffset]
-                            = cvt_warp_fp16_to_fp4<Type, SF_VEC_SIZE, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
-                    }
-                    else if constexpr (quantization_type == BlockScaleQuantizationType::FP8_TO_FP4)
-                    {
-                        reinterpret_cast<uint64_t*>(out)[outOffset]
-                            = cvt_warp_fp8_to_fp4<__nv_fp8_e4m3, SF_VEC_SIZE, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
-                    }
-                    else if constexpr (quantization_type == BlockScaleQuantizationType::FP16_TO_MXFP8)
-                    {
-                        reinterpret_cast<uint64_t*>(out)[outOffset]
-                            = cvt_warp_fp16_to_mxfp8<Type, SF_VEC_SIZE>(in_vec, sf_out);
-                    }
-                }
-            }
+  // The elements per thread.
+  static constexpr int ELTS_PER_THREAD = quantization_type == BlockScaleQuantizationType::FP8_TO_FP4
+                                             ? CVT_FP8_TO_FP4_ELTS_PER_THREAD
+                                             : CVT_ELTS_PER_THREAD;
+
+  using PackedVec = PackedVec<Type>;
+  static constexpr int CVT_NUM_THREADS_PER_SF = SF_VEC_SIZE / ELTS_PER_THREAD;  // 2 or 4
+  static_assert(sizeof(PackedVec) == sizeof(Type) * ELTS_PER_THREAD, "Vec size is not matched.");
+
+  // Get the global scaling factor, which will be applied to the SF.
+  // Note SFScale is the same as next GEMM's alpha, which is (448.f / (Alpha_A / 6.f)).
+  float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0];
+
+  // Is it swizzled layout?
+  bool isSfSwizzledLayout = layout == QuantizationSFLayout::SWIZZLED;
+
+  // The number of padded rows considering 128x4 SF layout.
+  int numPaddedRowsForSf = isSfSwizzledLayout ? PadUpFn(numRows, 128) : numRows;
+  int numColsForSf = isSfSwizzledLayout ? PadUpFn(numPaddedCols, 4 * SF_VEC_SIZE) : numPaddedCols;
+
+  // The number of threads in the column dimension.
+  // Note that numCols/numPaddedCols/numColsForSf are guaranteed to be multiples of ELTS_PER_THREAD.
+  int numColThreads = numCols / ELTS_PER_THREAD;
+  int numPaddedColThreads = numPaddedCols / ELTS_PER_THREAD;
+  int numColThreadsForSf = numColsForSf / ELTS_PER_THREAD;
+
+  asm volatile("griddepcontrol.wait;");
+  // Input tensor batch/row/col loops.
+  for (int rowIdx = blockIdx.x; rowIdx < numPaddedRowsForSf; rowIdx += gridDim.x) {
+    for (int batchIdx = 0; batchIdx < numbatches; batchIdx++) {
+      for (int colIdx = threadIdx.x; colIdx < numColThreadsForSf; colIdx += blockDim.x) {
+        std::optional<int> optionalBatchIdx = batchIdx;
+        std::optional<int> optionalNumRows = numRows;
+
+        // The SF output pointer.
+        auto sf_out = cvt_quant_get_sf_out_offset<uint32_t, CVT_NUM_THREADS_PER_SF>(
+            optionalBatchIdx, rowIdx, colIdx, optionalNumRows, numPaddedCols / SF_VEC_SIZE, SFout,
+            layout);
+
+        // The input tensor offset.
+        int64_t inOffset =
+            static_cast<int64_t>(batchIdx * numRows + rowIdx) * numColThreads + colIdx;
+        int64_t outOffset =
+            static_cast<int64_t>(batchIdx * numRows + rowIdx) * numPaddedColThreads + colIdx;
+
+        // Zero the output values that fall in padded columns.
+        if (rowIdx < numRows && colIdx >= numColThreads && colIdx < numPaddedColThreads) {
+          // Select the store width that matches this quantization type's packed output.
+          if constexpr (quantization_type == BlockScaleQuantizationType::FP16_TO_FP4) {
+            reinterpret_cast<uint32_t*>(out)[outOffset] = 0u;
+          } else if constexpr (quantization_type == BlockScaleQuantizationType::FP8_TO_FP4 ||
+                               quantization_type == BlockScaleQuantizationType::FP16_TO_MXFP8) {
+            reinterpret_cast<uint64_t*>(out)[outOffset] = 0ull;
+          }
         }
+
+        // Set the SF padding to 0.
+        if (rowIdx >= numRows || colIdx >= numColThreads) {
+          // Only threads that received a valid SF pointer write the zero.
+          if (sf_out != nullptr) {
+            sf_out[0] = 0x00;
+          }
+        } else {
+          // Load the input vector.
+          PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
+
+          // Dispatch the quantization kernel.
+          if constexpr (quantization_type == BlockScaleQuantizationType::FP16_TO_FP4) {
+            reinterpret_cast<uint32_t*>(out)[outOffset] =
+                cvt_warp_fp16_to_fp4<Type, SF_VEC_SIZE, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
+          } else if constexpr (quantization_type == BlockScaleQuantizationType::FP8_TO_FP4) {
+            reinterpret_cast<uint64_t*>(out)[outOffset] =
+                cvt_warp_fp8_to_fp4<__nv_fp8_e4m3, SF_VEC_SIZE, UE8M0_SF>(in_vec, SFScaleVal,
+                                                                          sf_out);
+          } else if constexpr (quantization_type == BlockScaleQuantizationType::FP16_TO_MXFP8) {
+            reinterpret_cast<uint64_t*>(out)[outOffset] =
+                cvt_warp_fp16_to_mxfp8<Type, SF_VEC_SIZE>(in_vec, sf_out);
+          }
+        }
+      }
     }
-    asm volatile("griddepcontrol.launch_dependents;");
+  }
+  asm volatile("griddepcontrol.launch_dependents;");
 #endif
 }
 
-
 __global__ void block_scale_interleave_kernel(int numbatches, int numRows, int numCols,
-                                                    uint8_t const* SFIn, uint8_t* SFOutput);
+                                              uint8_t const* SFIn, uint8_t* SFOutput);
 }  // namespace kernels
 }  // namespace tensorrt_llm
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/quantization.h b/csrc/nv_internal/tensorrt_llm/kernels/quantization.h
index 1ef3a7982..32e51008f 100644
--- a/csrc/nv_internal/tensorrt_llm/kernels/quantization.h
+++ b/csrc/nv_internal/tensorrt_llm/kernels/quantization.h
@@ -48,7 +48,7 @@ enum class BlockScaleQuantizationType {
 #define PadUpFn(X, Y) ((X + Y - 1) / (Y) * (Y))
 
 // totalCloumn should be in SFMatrix, not activation Matrix, so no sfVecSize needed.
-inline int64_t  computeSwizzledLayoutSFSize(int totalRow, int totalColumn, int rowSize = 128) {
+inline int64_t computeSwizzledLayoutSFSize(int totalRow, int totalColumn, int rowSize = 128) {
   int paddedRow = PadUpFn(totalRow, rowSize);
   int paddedColumn = PadUpFn(totalColumn, 4);
   return static_cast<int64_t>(paddedRow) * paddedColumn;
@@ -71,19 +71,21 @@ void invokePerTokenQuantization(QuantT* dst, T const* src, int64_t const numRows
                                 cudaStream_t stream = 0);
 
 template <typename T, int SF_VEC_SIZE>
-void invokeFP4Quantization(int b, int m, int n, T const* input, float const* globalScale, int64_t* output,
-                           int32_t* SFOuput, bool useUE8M0, QuantizationSFLayout layout,
-                           int multiProcessorCount, cudaStream_t stream = 0);
+void invokeFP4Quantization(int b, int m, int n, T const* input, float const* globalScale,
+                           int64_t* output, int32_t* SFOuput, bool useUE8M0,
+                           QuantizationSFLayout layout, int multiProcessorCount,
+                           cudaStream_t stream = 0);
 
-void invokeBlockScaleInterleave(int b, int m, int m_padded, int n, int n_padded, uint8_t const* SFIn,
-                                uint8_t* SFOutput, int multiProcessorCount, cudaStream_t stream = 0);
+void invokeBlockScaleInterleave(int b, int m, int m_padded, int n, int n_padded,
+                                uint8_t const* SFIn, uint8_t* SFOutput, int multiProcessorCount,
+                                cudaStream_t stream = 0);
 
 void invokeBlockScaleInterleaveReverse(int b, int m, int n, uint8_t const* SFIn, uint8_t* SFOutput,
                                        int multiProcessorCount, cudaStream_t stream = 0);
 
 template <typename T>
-void invokeMxFP8Quantization(int b, int m, int n, int padded_n, T const* input, int64_t* output, int32_t* SFOuput,
-                             QuantizationSFLayout layout, int multiProcessorCount,
+void invokeMxFP8Quantization(int b, int m, int n, int padded_n, T const* input, int64_t* output,
+                             int32_t* SFOuput, QuantizationSFLayout layout, int multiProcessorCount,
                              cudaStream_t stream = 0);
 
 }  // namespace kernels
diff --git a/csrc/nv_internal/tensorrt_llm/thop/fp4Op.cpp b/csrc/nv_internal/tensorrt_llm/thop/fp4Op.cpp
index a47bebc33..fcc7a3487 100644
--- a/csrc/nv_internal/tensorrt_llm/thop/fp4Op.cpp
+++ b/csrc/nv_internal/tensorrt_llm/thop/fp4Op.cpp
@@ -178,8 +178,8 @@ at::Tensor BlockScaleInterleave(at::Tensor const& blockScale) {
           if (rIdx < static_cast<int>(rows) && cIdx < static_cast<int>(cols)) {
             sf_ori = blockScalePtr[cIdx];
           }
-          int sf_index = computeSFIndex(rIdx, cIdx, rows, cols,
-                                        tensorrt_llm::QuantizationSFLayout::SWIZZLED);
+          int sf_index =
+              computeSFIndex(rIdx, cIdx, rows, cols, tensorrt_llm::QuantizationSFLayout::SWIZZLED);
           interleavedBlockScalePtr[sf_index] = sf_ori;
         }
       }
@@ -224,8 +224,8 @@ at::Tensor BlockScaleInterleaveReverse(at::Tensor const& blockScale) {
     for (int eIdx = 0; eIdx < num_experts; eIdx++) {
       for (int rIdx = 0; rIdx < rows; ++rIdx) {
         for (int cIdx = 0; cIdx < cols; ++cIdx) {
-          int sf_index = computeSFIndex(rIdx, cIdx, rows, cols,
-                                        tensorrt_llm::QuantizationSFLayout::SWIZZLED);
+          int sf_index =
+              computeSFIndex(rIdx, cIdx, rows, cols, tensorrt_llm::QuantizationSFLayout::SWIZZLED);
           identity[eIdx * expert_out_size + sf_index] = std::array<int, 3>{eIdx, rIdx, cIdx};
         }
       }
@@ -292,8 +292,8 @@ at::Tensor E2M1AndUFP8SFScaleToFloat(at::Tensor valueE2M1, at::Tensor scaleFP8SF
 
 // Used by the (fp16 -> int4) quant layer + int4 gemm network.
 at::Tensor E2M1AndUFP8SFScaleToFloatV2(at::Tensor valueE2M1, at::Tensor scaleFP8SF,
-  std::optional<at::Tensor> globalScale, int64_t sfVecSize, int64_t sfType,
-                                       bool isSfSwizzledLayout = true) {
+                                       std::optional<at::Tensor> globalScale, int64_t sfVecSize,
+                                       int64_t sfType, bool isSfSwizzledLayout = true) {
   CHECK_CPU_INPUT(valueE2M1, FLOAT4_E2M1X2);
   CHECK_CPU_INPUT(scaleFP8SF, SF_DTYPE);
   auto packedShape = valueE2M1.sizes();
@@ -305,21 +305,19 @@ at::Tensor E2M1AndUFP8SFScaleToFloatV2(at::Tensor valueE2M1, at::Tensor scaleFP8
 
   // CHECK_CPU_INPUT(globalScale, at::ScalarType::Float);
   float globalScaleVal{1.0f};
-  if (sfType == 1)
-  {
-      TORCH_CHECK(globalScale.has_value(), "globalScale is required when sfType is 1.");
-      // CHECK_CPU_INPUT(globalScale.value(), at::kFloat32);
-      globalScaleVal = globalScale->data_ptr<float>()[0];
+  if (sfType == 1) {
+    TORCH_CHECK(globalScale.has_value(), "globalScale is required when sfType is 1.");
+    // CHECK_CPU_INPUT(globalScale.value(), at::kFloat32);
+    globalScaleVal = globalScale->data_ptr<float>()[0];
   }
 
-
   int hiddenDim = packedShape[1] * 2;
   int packedFp4HiddenDim = hiddenDim / 2;
   int groupsPerHiddenDim = hiddenDim / sfVecSize;
 
-  tensorrt_llm::QuantizationSFLayout layout =
-      isSfSwizzledLayout ? tensorrt_llm::QuantizationSFLayout::SWIZZLED
-                         : tensorrt_llm::QuantizationSFLayout::LINEAR;
+  tensorrt_llm::QuantizationSFLayout layout = isSfSwizzledLayout
+                                                  ? tensorrt_llm::QuantizationSFLayout::SWIZZLED
+                                                  : tensorrt_llm::QuantizationSFLayout::LINEAR;
 
   for (size_t vIdx = 0; vIdx < static_cast<size_t>(packedShape[0]); ++vIdx) {
     for (int group = 0; group < groupsPerHiddenDim; ++group) {
diff --git a/csrc/nv_internal/tensorrt_llm/thop/fp4Quantize.cpp b/csrc/nv_internal/tensorrt_llm/thop/fp4Quantize.cpp
index 0ac21710d..22236f9d3 100644
--- a/csrc/nv_internal/tensorrt_llm/thop/fp4Quantize.cpp
+++ b/csrc/nv_internal/tensorrt_llm/thop/fp4Quantize.cpp
@@ -39,26 +39,22 @@ namespace torch_ext {
 // self_fp4, self_block_scale_factors self_fp4: [M, K / 2], FLOAT4_E2M1X2 self_block_scale_factors:
 // ceil(M / 128) * 128 * ceil(K / sfVecSize / 4) * 4, SF_DTYPE (UE4M3 or UE8M0)
 std::tuple<at::Tensor, at::Tensor> fp4_quantize(at::Tensor const& self,
-  std::optional<at::Tensor> const& globalScale, int64_t sfVecSize,
-                                                bool sfUseUE8M0, bool isSfSwizzledLayout,
-                                                bool isSf8x4Layout) {
+                                                std::optional<at::Tensor> const& globalScale,
+                                                int64_t sfVecSize, bool sfUseUE8M0,
+                                                bool isSfSwizzledLayout, bool isSf8x4Layout) {
   CHECK_TH_CUDA(self);
   CHECK_CONTIGUOUS(self);
-  if (sfUseUE8M0)
-  {
-      TORCH_CHECK(sfVecSize == 32, "sfVecSize can only be 32, when sfUseUE8M0 is true");
-  }
-  else
-  {
-      TORCH_CHECK(globalScale.has_value(), "globalScale is required when sfUseUE8M0 is false");
-      // CHECK_INPUT_AND_TYPE(globalScale.value(), torch::kFloat32);
-      TORCH_CHECK(sfVecSize == 16, "sfVecSize can only be 16, when sfUseUE8M0 is false");
+  if (sfUseUE8M0) {
+    TORCH_CHECK(sfVecSize == 32, "sfVecSize can only be 32, when sfUseUE8M0 is true");
+  } else {
+    TORCH_CHECK(globalScale.has_value(), "globalScale is required when sfUseUE8M0 is false");
+    // CHECK_INPUT_AND_TYPE(globalScale.value(), torch::kFloat32);
+    TORCH_CHECK(sfVecSize == 16, "sfVecSize can only be 16, when sfUseUE8M0 is false");
   }
 
   float* globalScalePtr{nullptr};
-  if (globalScale.has_value())
-  {
-      globalScalePtr = globalScale->data_ptr<float>();
+  if (globalScale.has_value()) {
+    globalScalePtr = globalScale->data_ptr<float>();
   }
 
   auto const& inputShape = self.sizes();
@@ -78,10 +74,9 @@ std::tuple<at::Tensor, at::Tensor> fp4_quantize(at::Tensor const& self,
   at::Tensor valueE2M1 =
       at::detail::empty_cuda(outputShape, FLOAT4_E2M1X2, self.device(), /* stride */ std::nullopt);
 
-  int64_t SFSize =
-      isSfSwizzledLayout
-          ? tensorrt_llm::computeSwizzledLayoutSFSize(m, k / sfVecSize, isSf8x4Layout ? 8 : 128)
-          : tensorrt_llm::computeLinearLayoutSFSize(m, k / sfVecSize);
+  int64_t SFSize = isSfSwizzledLayout ? tensorrt_llm::computeSwizzledLayoutSFSize(
+                                            m, k / sfVecSize, isSf8x4Layout ? 8 : 128)
+                                      : tensorrt_llm::computeLinearLayoutSFSize(m, k / sfVecSize);
 
   at::Tensor scaleFP8SF = at::detail::empty_cuda({SFSize}, SF_DTYPE, self.device(),
                                                  /* stride */ std::nullopt);  // 1D tensor
@@ -92,9 +87,10 @@ std::tuple<at::Tensor, at::Tensor> fp4_quantize(at::Tensor const& self,
   layout = isSfSwizzledLayout ? tensorrt_llm::QuantizationSFLayout::SWIZZLED
                               : tensorrt_llm::QuantizationSFLayout::LINEAR;
 
-  #define LAUNCH_FP4_QUANTIZE_KERNEL(T, SF_VEC_SIZE)                                                                     \
-  tensorrt_llm::kernels::invokeFP4Quantization<T, SF_VEC_SIZE>(1, m, k, reinterpret_cast<T*>(self.data_ptr()),       \
-      globalScalePtr, reinterpret_cast<int64_t*>(valueE2M1.data_ptr()),                                              \
+#define LAUNCH_FP4_QUANTIZE_KERNEL(T, SF_VEC_SIZE)                                                 \
+  tensorrt_llm::kernels::invokeFP4Quantization<T, SF_VEC_SIZE>(                                    \
+      1, m, k, reinterpret_cast<T*>(self.data_ptr()), globalScalePtr,                              \
+      reinterpret_cast<int64_t*>(valueE2M1.data_ptr()),                                            \
       reinterpret_cast<int32_t*>(scaleFP8SF.data_ptr()), sfUseUE8M0, layout, mMultiProcessorCount, \
       at::cuda::getCurrentCUDAStream(self.get_device()));
 
diff --git a/csrc/nv_internal/tensorrt_llm/thop/fp4Quantize.h b/csrc/nv_internal/tensorrt_llm/thop/fp4Quantize.h
index 43134f409..16de2991b 100644
--- a/csrc/nv_internal/tensorrt_llm/thop/fp4Quantize.h
+++ b/csrc/nv_internal/tensorrt_llm/thop/fp4Quantize.h
@@ -24,7 +24,7 @@
 
 namespace torch_ext {
 std::tuple<at::Tensor, at::Tensor> fp4_quantize(at::Tensor const& self,
-                                                std::optional<at::Tensor> const& globalScale, int64_t sfVecSize,
-                                                bool sfUseUE8M0, bool isSfSwizzledLayout,
-                                                bool isSf8x4Layout);
+                                                std::optional<at::Tensor> const& globalScale,
+                                                int64_t sfVecSize, bool sfUseUE8M0,
+                                                bool isSfSwizzledLayout, bool isSf8x4Layout);
 }  // namespace torch_ext
diff --git a/csrc/nv_internal/tensorrt_llm/thop/fp8Quantize.cpp b/csrc/nv_internal/tensorrt_llm/thop/fp8Quantize.cpp
index 9ccfdd4cf..3763afffe 100644
--- a/csrc/nv_internal/tensorrt_llm/thop/fp8Quantize.cpp
+++ b/csrc/nv_internal/tensorrt_llm/thop/fp8Quantize.cpp
@@ -28,7 +28,8 @@ namespace torch_ext {
 // isSfSwizzledLayout: bool, if true, the scale factors are stored in swizzled layout, otherwise in
 // linear layout. See QuantizationSFLayout enum for more details about the two layouts.
 // returns
-std::tuple<at::Tensor, at::Tensor> mxfp8_quantize(at::Tensor input, bool isSfSwizzledLayout, int64_t alignment) {
+std::tuple<at::Tensor, at::Tensor> mxfp8_quantize(at::Tensor input, bool isSfSwizzledLayout,
+                                                  int64_t alignment) {
   CHECK_TH_CUDA(input);
   CHECK_CONTIGUOUS(input);
 
@@ -51,38 +52,40 @@ std::tuple<at::Tensor, at::Tensor> mxfp8_quantize(at::Tensor input, bool isSfSwi
   std::vector<int64_t> outputShape(inputShape.begin(), inputShape.end());
   outputShape[rank - 1] = padded_k;
 
-  at::Tensor valMxFP8
-      = at::detail::empty_cuda(outputShape, at::ScalarType::Float8_e4m3fn, input.device(), /* stride */ std::nullopt);
+  at::Tensor valMxFP8 = at::detail::empty_cuda(outputShape, at::ScalarType::Float8_e4m3fn,
+                                               input.device(), /* stride */ std::nullopt);
 
-  int64_t SFSize = isSfSwizzledLayout ? tensorrt_llm::computeSwizzledLayoutSFSize(m, padded_k / SF_VEC_SIZE)
-                                      : tensorrt_llm::computeLinearLayoutSFSize(m, padded_k / SF_VEC_SIZE);
+  int64_t SFSize = isSfSwizzledLayout
+                       ? tensorrt_llm::computeSwizzledLayoutSFSize(m, padded_k / SF_VEC_SIZE)
+                       : tensorrt_llm::computeLinearLayoutSFSize(m, padded_k / SF_VEC_SIZE);
 
-  at::Tensor scaleFP8SF
-      = at::detail::empty_cuda({SFSize}, SF_DTYPE, input.device(), /* stride */ std::nullopt); // 1D tensor
+  at::Tensor scaleFP8SF = at::detail::empty_cuda({SFSize}, SF_DTYPE, input.device(),
+                                                 /* stride */ std::nullopt);  // 1D tensor
 
   const thread_local int mMultiProcessorCount = tensorrt_llm::common::getMultiProcessorCount();
 
   auto const layout = isSfSwizzledLayout ? tensorrt_llm::QuantizationSFLayout::SWIZZLED
                                          : tensorrt_llm::QuantizationSFLayout::LINEAR;
 
-#define LAUNCH_MXFP8_QUANTIZE_KERNEL(T)                                                                                \
-  tensorrt_llm::kernels::invokeMxFP8Quantization(1, m, k, padded_k, reinterpret_cast<T*>(input.data_ptr()),             \
-      reinterpret_cast<int64_t*>(valMxFP8.data_ptr()), reinterpret_cast<int32_t*>(scaleFP8SF.data_ptr()), layout,      \
-      mMultiProcessorCount, at::cuda::getCurrentCUDAStream(input.get_device()));
+#define LAUNCH_MXFP8_QUANTIZE_KERNEL(T)                                                \
+  tensorrt_llm::kernels::invokeMxFP8Quantization(                                      \
+      1, m, k, padded_k, reinterpret_cast<T*>(input.data_ptr()),                       \
+      reinterpret_cast<int64_t*>(valMxFP8.data_ptr()),                                 \
+      reinterpret_cast<int32_t*>(scaleFP8SF.data_ptr()), layout, mMultiProcessorCount, \
+      at::cuda::getCurrentCUDAStream(input.get_device()));
 
-  if (input.scalar_type() == at::ScalarType::Half)
-  {
-      LAUNCH_MXFP8_QUANTIZE_KERNEL(half)
-  }
-  else if (input.scalar_type() == at::ScalarType::BFloat16)
-  {
+  if (input.scalar_type() == at::ScalarType::Half) {
+    LAUNCH_MXFP8_QUANTIZE_KERNEL(half)
+  } else if (input.scalar_type() == at::ScalarType::BFloat16) {
 #ifdef ENABLE_BF16
-      LAUNCH_MXFP8_QUANTIZE_KERNEL(__nv_bfloat16)
+    LAUNCH_MXFP8_QUANTIZE_KERNEL(__nv_bfloat16)
 #else
-    C10_THROW_ERROR(NotImplementedError, "BFloat16 must be enabled to quantize an bf16 tensor to mxfp8.");
+    C10_THROW_ERROR(NotImplementedError,
+                    "BFloat16 must be enabled to quantize an bf16 tensor to mxfp8.");
 #endif
   } else {
-    C10_THROW_ERROR(NotImplementedError, "mxfp8_quantize only supports input tensor with dtypes fp16/bf16.");
+    C10_THROW_ERROR(NotImplementedError,
+                    "mxfp8_quantize only supports input tensor with dtypes fp16/bf16.");
   }
 
 #undef LAUNCH_MXFP8_QUANTIZE_KERNEL
@@ -125,9 +128,9 @@ std::tuple<at::Tensor, at::Tensor> mxfp8_quantize_host(at::Tensor x_fp32,
   at::Tensor scale_tensor =
       at::detail::empty_cpu({sf_size}, SF_DTYPE, /* pinned */ true, at::MemoryFormat::Contiguous);
 
-  tensorrt_llm::QuantizationSFLayout layout =
-      is_sf_swizzled_layout ? tensorrt_llm::QuantizationSFLayout::SWIZZLED
-                            : tensorrt_llm::QuantizationSFLayout::LINEAR;
+  tensorrt_llm::QuantizationSFLayout layout = is_sf_swizzled_layout
+                                                  ? tensorrt_llm::QuantizationSFLayout::SWIZZLED
+                                                  : tensorrt_llm::QuantizationSFLayout::LINEAR;
 
   for (size_t ti = 0; ti < static_cast<size_t>(data_shape[0]); ++ti) {
     for (int group = 0; group < groups_per_hidden_dim; ++group) {
@@ -176,9 +179,9 @@ at::Tensor mxfp8_dequantize_host(at::Tensor value_e4m3, at::Tensor scale_ue8m08s
   int hidden_dim = data_shape[1];
   int groups_per_hidden_dim = hidden_dim / sf_vec_size;
 
-  tensorrt_llm::QuantizationSFLayout layout =
-      is_sf_swizzled_layout ? tensorrt_llm::QuantizationSFLayout::SWIZZLED
-                            : tensorrt_llm::QuantizationSFLayout::LINEAR;
+  tensorrt_llm::QuantizationSFLayout layout = is_sf_swizzled_layout
+                                                  ? tensorrt_llm::QuantizationSFLayout::SWIZZLED
+                                                  : tensorrt_llm::QuantizationSFLayout::LINEAR;
   for (size_t ti = 0; ti < static_cast<size_t>(data_shape[0]); ++ti) {
     for (int group = 0; group < groups_per_hidden_dim; ++group) {
       float* float_ptr = float_tensor.data_ptr<float>() + ti * hidden_dim + group * sf_vec_size;
diff --git a/csrc/trtllm_allreduce_fusion.cu b/csrc/trtllm_allreduce_fusion.cu
index 37672dae3..b9f38a66a 100644
--- a/csrc/trtllm_allreduce_fusion.cu
+++ b/csrc/trtllm_allreduce_fusion.cu
@@ -70,9 +70,8 @@ void trtllm_allreduce_fusion(
                               ? reinterpret_cast<float*>(scale_factor.value().data_ptr())
                               : nullptr;
     params.use_oneshot = use_oneshot;
-    params.layout = layout_code.has_value()
-                        ? static_cast<QuantizationSFLayout>(layout_code.value())
-                        : QuantizationSFLayout::SWIZZLED;
+    params.layout = layout_code.has_value() ? static_cast<QuantizationSFLayout>(layout_code.value())
+                                            : QuantizationSFLayout::SWIZZLED;
     params.pattern = static_cast<AllReduceFusionPattern>(pattern_code);
     params.trigger_completion_at_end = trigger_completion_at_end;
     params.stream = at::cuda::getCurrentCUDAStream();
diff --git a/csrc/trtllm_fused_moe_kernel_launcher.cu b/csrc/trtllm_fused_moe_kernel_launcher.cu
index d4c279b96..85ba3d334 100644
--- a/csrc/trtllm_fused_moe_kernel_launcher.cu
+++ b/csrc/trtllm_fused_moe_kernel_launcher.cu
@@ -817,7 +817,7 @@ std::vector<at::Tensor> trtllm_fp4_block_scale_moe_launcher(
   std::optional<at::Tensor> gemm1_output_scale = std::nullopt;
   if (dtype_act == btg::Dtype::E2m1 || dtype_act == btg::Dtype::MxE4m3) {
     int64_t sf_size = tensorrt_llm::computeSwizzledLayoutSFSize(max_num_padded_tokens,
-                                                                   intermediate_size / sf_vec_size);
+                                                                intermediate_size / sf_vec_size);
     gemm1_output_scale = at::detail::empty_cuda({sf_size}, at::ScalarType::Float8_e4m3fn,
                                                 hidden_states.device(), std::nullopt);
   }
@@ -877,7 +877,7 @@ std::vector<at::Tensor> trtllm_fp4_block_scale_moe_launcher(
     TORCH_CHECK(hidden_states_scale.value().dim() == 1, "hidden_states_scale must be 1D.");
     TORCH_CHECK(hidden_states_scale.value().sizes()[0] ==
                     tensorrt_llm::computeLinearLayoutSFSize(args.num_tokens,
-                                                               args.hidden_size / sf_vec_size),
+                                                            args.hidden_size / sf_vec_size),
                 "hidden_states_scale has incorrect size");
   }
 
diff --git a/flashinfer/__init__.py b/flashinfer/__init__.py
index b96828110..5249a0206 100644
--- a/flashinfer/__init__.py
+++ b/flashinfer/__init__.py
@@ -50,14 +50,14 @@
 from .decode import single_decode_with_kv_cache as single_decode_with_kv_cache
 from .fp4_quantization import (
     SfLayout,
+    block_scale_interleave,
     e2m1_and_ufp8sf_scale_to_float,
     fp4_quantize,
-    block_scale_interleave,
+    mxfp4_dequantize,
+    mxfp4_quantize,
     nvfp4_quantize,
     shuffle_matrix_a,
     shuffle_matrix_sf_a,
-    mxfp4_quantize,
-    mxfp4_dequantize,
 )
 from .fp8_quantization import mxfp8_dequantize_host, mxfp8_quantize
 from .fused_moe import (
diff --git a/flashinfer/fp4_quantization.py b/flashinfer/fp4_quantization.py
index 6c5e2dcfe..0e7ed9176 100644
--- a/flashinfer/fp4_quantization.py
+++ b/flashinfer/fp4_quantization.py
@@ -454,4 +454,4 @@ def mxfp4_dequantize(a_fp4, a_sf):
         32,
         0,
         True,
-    )
\ No newline at end of file
+    )
diff --git a/tests/test_fp4_quantize.py b/tests/test_fp4_quantize.py
index 4ad9a33e6..c44390f46 100644
--- a/tests/test_fp4_quantize.py
+++ b/tests/test_fp4_quantize.py
@@ -5,9 +5,9 @@
 from utils_fp4 import cast_from_fp4, recover_swizzled_scales, ref_nvfp4_quant
 
 from flashinfer import (
+    block_scale_interleave,
     e2m1_and_ufp8sf_scale_to_float,
     fp4_quantize,
-    block_scale_interleave,
 )
 from flashinfer.utils import is_sm100a_supported
 
diff --git a/tests/test_trtllm_cutlass_fused_moe.py b/tests/test_trtllm_cutlass_fused_moe.py
index c0b082b94..1091e6306 100644
--- a/tests/test_trtllm_cutlass_fused_moe.py
+++ b/tests/test_trtllm_cutlass_fused_moe.py
@@ -19,10 +19,17 @@
 from torch.nn import functional as F
 
 import flashinfer.fused_moe as fused_moe
-from flashinfer import fp4_quantize, mxfp4_quantize, mxfp8_quantize, mxfp8_dequantize_host, e2m1_and_ufp8sf_scale_to_float, mxfp4_dequantize
+from flashinfer import (
+    e2m1_and_ufp8sf_scale_to_float,
+    fp4_quantize,
+    mxfp4_dequantize,
+    mxfp4_quantize,
+    mxfp8_dequantize_host,
+    mxfp8_quantize,
+)
 
 FLOAT4_E2M1_MAX = 6.0
-FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max  
+FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
 FP8_DTYPE = torch.float8_e4m3fn
 
 
@@ -160,8 +167,17 @@ def torch_moe_nvfp4(a, w1, w2, topk, topk_weight, topk_ids):
         out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype)
     ).sum(dim=1)
 
+
 def compute_with_experts(
-    num_experts, x, w31_weight, w2_weight, selected_experts, routing_weights, alpha=None, beta=None, limit=None
+    num_experts,
+    x,
+    w31_weight,
+    w2_weight,
+    selected_experts,
+    routing_weights,
+    alpha=None,
+    beta=None,
+    limit=None,
 ):
     results = torch.zeros_like(x)
     for expert_id in range(num_experts):
@@ -183,10 +199,12 @@ def compute_with_experts(
             x1_scaled = x1 * torch.sigmoid(alpha * x1)
             x2 = expert_inputs @ w3_expert.t()
             x2 = x2.clamp_(min=-limit, max=limit) + beta
-            
+
             inter = x1_scaled * x2
         else:
-            inter = F.silu(expert_inputs @ w1_expert.t()) * (expert_inputs @ w3_expert.t())
+            inter = F.silu(expert_inputs @ w1_expert.t()) * (
+                expert_inputs @ w3_expert.t()
+            )
         output = inter @ w2_expert.t()
         results[batch_idx] += routing_weights[batch_idx, nth_expert, None] * output
     return results.view_as(x)
@@ -489,6 +507,7 @@ def test_moe_nvfp4(
     )
     torch.testing.assert_close(ref_output, flash_output, rtol=2e-1, atol=2e-1)
 
+
 @pytest.mark.parametrize("batch_size", BATCH_SIZES)
 @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
 @pytest.mark.parametrize("num_experts", EP_NUM_EXPERTS)
@@ -1056,10 +1075,12 @@ def dequant_mxfp4_batches(
 
     scale_tensor = scale_tensor.view(num_batches, -1)
 
-    return torch.stack([
-        mxfp4_dequantize(mat_fp4[b, :, :], scale_tensor[b, :])
-        for b in range(num_batches)
-    ])
+    return torch.stack(
+        [
+            mxfp4_dequantize(mat_fp4[b, :, :], scale_tensor[b, :])
+            for b in range(num_batches)
+        ]
+    )
 
 
 @pytest.mark.parametrize("batch_size", BATCH_SIZES)
@@ -1068,7 +1089,9 @@ def dequant_mxfp4_batches(
 @pytest.mark.parametrize("top_k", TOP_K_VALUES)
 @pytest.mark.parametrize("intermediate_size", INTERMEDIATE_SIZES)
 @pytest.mark.parametrize("otype", [torch.float16, torch.bfloat16])
-@pytest.mark.parametrize(("alpha", "beta", "limit"), [(None, None, None), (0.5, 0.0, 7.0), (1.702, 1.0, 7.0)])
+@pytest.mark.parametrize(
+    ("alpha", "beta", "limit"), [(None, None, None), (0.5, 0.0, 7.0), (1.702, 1.0, 7.0)]
+)
 def test_moe_mxfp8_mxfp4(
     batch_size,
     hidden_size,
@@ -1105,9 +1128,9 @@ def test_moe_mxfp8_mxfp4(
     mxfp4_w1, mxfp4_w1_scale = quant_mxfp4_batches(w1, e)
     mxfp4_w2, mxfp4_w2_scale = quant_mxfp4_batches(w2, e)
 
-    router_logits = torch.randn(m, e, dtype=otype).cuda() 
+    router_logits = torch.randn(m, e, dtype=otype).cuda()
     routing_weights, selected_experts = compute_routing(router_logits, top_k)
-    
+
     fake_input_scale = torch.ones(e, device=x.device)
 
     quant_scales = [
@@ -1116,9 +1139,9 @@ def test_moe_mxfp8_mxfp4(
         mxfp4_w2_scale.view(torch.int32),
         fake_input_scale,
     ]
-    
+
     flash_output = torch.zeros_like(x)
-    
+
     if alpha is not None and limit is not None and beta is not None:
         alpha_t = torch.ones(e, device=x.device) * alpha
         limit_t = torch.ones(e, device=x.device) * limit
@@ -1144,26 +1167,46 @@ def test_moe_mxfp8_mxfp4(
         use_mxfp8_act_scaling=True,
         output=flash_output,
     )
-    
-    dq_mxfp8_x = mxfp8_dequantize_host(
-        mxfp8_x.cpu().view(torch.uint8), 
-        mxfp8_x_sf.cpu().view(torch.uint8).reshape(-1), 
-        True
-    ).cuda().to(otype)
-
-    dq_mfxp4_w1 = dequant_mxfp4_batches(
-        mxfp4_w1.cpu().view(torch.uint8),
-        mxfp4_w1_scale.cpu().view(torch.uint8).reshape(-1),
-    ).cuda().to(otype)
-
-    dq_mfxp4_w2 = dequant_mxfp4_batches(
-        mxfp4_w2.cpu().view(torch.uint8),
-        mxfp4_w2_scale.cpu().view(torch.uint8).reshape(-1),
-    ).cuda().to(otype)
+
+    dq_mxfp8_x = (
+        mxfp8_dequantize_host(
+            mxfp8_x.cpu().view(torch.uint8),
+            mxfp8_x_sf.cpu().view(torch.uint8).reshape(-1),
+            True,
+        )
+        .cuda()
+        .to(otype)
+    )
+
+    dq_mfxp4_w1 = (
+        dequant_mxfp4_batches(
+            mxfp4_w1.cpu().view(torch.uint8),
+            mxfp4_w1_scale.cpu().view(torch.uint8).reshape(-1),
+        )
+        .cuda()
+        .to(otype)
+    )
+
+    dq_mfxp4_w2 = (
+        dequant_mxfp4_batches(
+            mxfp4_w2.cpu().view(torch.uint8),
+            mxfp4_w2_scale.cpu().view(torch.uint8).reshape(-1),
+        )
+        .cuda()
+        .to(otype)
+    )
 
     # Use original weights for reference computation
     ref_output = compute_with_experts(
-        e, dq_mxfp8_x, dq_mfxp4_w1, dq_mfxp4_w2, selected_experts, routing_weights, alpha, beta, limit
+        e,
+        dq_mxfp8_x,
+        dq_mfxp4_w1,
+        dq_mfxp4_w2,
+        selected_experts,
+        routing_weights,
+        alpha,
+        beta,
+        limit,
     )
 
     torch.testing.assert_close(ref_output, flash_output, rtol=1e-1, atol=1e-1)

From e13b3b291109cec4c9bed582eacbb23820cddfb1 Mon Sep 17 00:00:00 2001
From: Duncan Moss <djm.moss@gmail.com>
Date: Mon, 11 Aug 2025 12:27:17 -0700
Subject: [PATCH 04/12] wip

---
 flashinfer/aot.py                      |   8 +-
 flashinfer/fp4_quantization.py         |  60 +++++++------
 flashinfer/fused_moe/__init__.py       |   4 +-
 flashinfer/fused_moe/core.py           |  36 +++++---
 tests/test_trtllm_cutlass_fused_moe.py | 113 +++++++++++++++++++++++++
 5 files changed, 179 insertions(+), 42 deletions(-)

diff --git a/flashinfer/aot.py b/flashinfer/aot.py
index 2e2885a7c..c4e0e6fb7 100644
--- a/flashinfer/aot.py
+++ b/flashinfer/aot.py
@@ -12,8 +12,8 @@
 from .activation import act_func_def_str, gen_act_and_mul_module
 from .cascade import gen_cascade_module
 from .comm.nvshmem import gen_nvshmem_module
-from .fp4_quantization import gen_fp4_quantization_sm100_module
-from .fused_moe import gen_cutlass_fused_moe_sm100_module
+from .fp4_quantization import gen_fp4_quantization_module
+from .fused_moe import gen_cutlass_fused_moe_module
 from .gemm import gen_gemm_module, gen_gemm_sm90_module, gen_gemm_sm100_module
 from .jit import JitSpec, build_jit_specs
 from .jit import env as jit_env
@@ -366,8 +366,8 @@ def gen_all_modules(
         if has_sm90:
             jit_specs.append(gen_gemm_sm90_module())
         if has_sm100:
-            jit_specs.append(gen_cutlass_fused_moe_sm100_module())
-            jit_specs.append(gen_fp4_quantization_sm100_module())
+            jit_specs.append(gen_cutlass_fused_moe_module())
+            jit_specs.append(gen_fp4_quantization_module())
             jit_specs.append(gen_gemm_sm100_module())
 
     if add_comm:
diff --git a/flashinfer/fp4_quantization.py b/flashinfer/fp4_quantization.py
index 15622db6b..3cb590338 100644
--- a/flashinfer/fp4_quantization.py
+++ b/flashinfer/fp4_quantization.py
@@ -23,7 +23,7 @@
 
 from .jit import JitSpec
 from .jit import env as jit_env
-from .jit import gen_jit_spec, sm100a_nvcc_flags
+from .jit import gen_jit_spec, sm100a_nvcc_flags, sm90a_nvcc_flags
 from .utils import (
     get_shuffle_matrix_a_row_indices,
     get_shuffle_matrix_sf_a_row_indices,
@@ -61,9 +61,21 @@ def _pad_scale_factors(
         ).contiguous()
 
 
-def gen_fp4_quantization_sm100_module() -> JitSpec:
+@functools.cache
+def get_device_arch():
+    major, minor = torch.cuda.get_device_capability()
+    suffix = "a" if major >= 9 else ""
+    return f"{major * 10 + minor}{suffix}"
+
+
+def gen_fp4_quantization_module() -> JitSpec:
+    if get_device_arch() == "100a":
+        nvcc_flags = sm100a_nvcc_flags
+    else:
+        nvcc_flags = sm90a_nvcc_flags
+
     return gen_jit_spec(
-        "fp4_quantization_sm100",
+        "fp4_quantization",
         [
             jit_env.FLASHINFER_CSRC_DIR
             / "nv_internal/tensorrt_llm/thop/fp4Quantize.cpp",
@@ -74,7 +86,7 @@ def gen_fp4_quantization_sm100_module() -> JitSpec:
             jit_env.FLASHINFER_CSRC_DIR / "nv_internal/cpp/common/stringUtils.cpp",
             jit_env.FLASHINFER_CSRC_DIR / "nv_internal/cpp/common/tllmException.cpp",
         ],
-        extra_cuda_cflags=sm100a_nvcc_flags
+        extra_cuda_cflags=nvcc_flags
         + [
             "-DENABLE_BF16",
             "-DENABLE_FP8",
@@ -91,14 +103,14 @@ def gen_fp4_quantization_sm100_module() -> JitSpec:
 
 
 @functools.cache
-def get_fp4_quantization_sm100_module():
-    module = gen_fp4_quantization_sm100_module().build_and_load()
+def get_fp4_quantization_module():
+    module = gen_fp4_quantization_module().build_and_load()
 
     @register_custom_op(
-        "flashinfer::fp4_quantize_sm100",
+        "flashinfer::fp4_quantize",
         mutates_args=(""),
     )
-    def fp4_quantize_sm100(
+    def fp4_quantize(
         input: torch.Tensor,
         global_scale: Optional[torch.Tensor] = None,
         sf_vec_size: int = 16,
@@ -130,8 +142,8 @@ def fp4_quantize_sm100(
             is_sf_8x4_layout,
         )
 
-    @register_fake_op("flashinfer::fp4_quantize_sm100")
-    def _fake_fp4_quantize_sm100(
+    @register_fake_op("flashinfer::fp4_quantize")
+    def _fake_fp4_quantize(
         input: torch.Tensor,
         global_scale: Optional[torch.Tensor] = None,
         sf_vec_size: int = 16,
@@ -145,10 +157,10 @@ def _fake_fp4_quantize_sm100(
         )
 
     @register_custom_op(
-        "flashinfer::block_scale_interleave_sm100",
+        "flashinfer::block_scale_interleave",
         mutates_args=("",),
     )
-    def block_scale_interleave_sm100(
+    def block_scale_interleave(
         unswizzled_sf: torch.Tensor,
     ) -> torch.Tensor:
         """Swizzle block scale tensor for FP4 format.
@@ -163,8 +175,8 @@ def block_scale_interleave_sm100(
             unswizzled_sf,
         )
 
-    @register_fake_op("flashinfer::block_scale_interleave_sm100")
-    def _fake_block_scale_interleave_sm100(
+    @register_fake_op("flashinfer::block_scale_interleave")
+    def _fake_block_scale_interleave(
         unswizzled_sf: torch.Tensor,
     ) -> torch.Tensor:
         return unswizzled_sf.new_empty(
@@ -172,10 +184,10 @@ def _fake_block_scale_interleave_sm100(
         )
 
     @register_custom_op(
-        "flashinfer::e2m1_and_ufp8sf_scale_to_float_sm100",
+        "flashinfer::e2m1_and_ufp8sf_scale_to_float",
         mutates_args=(""),
     )
-    def e2m1_and_ufp8sf_scale_to_float_sm100(
+    def e2m1_and_ufp8sf_scale_to_float(
         e2m1_tensor: torch.Tensor,
         ufp8_scale_tensor: torch.Tensor,
         global_scale_tensor: Optional[torch.Tensor] = None,
@@ -208,8 +220,8 @@ def e2m1_and_ufp8sf_scale_to_float_sm100(
             is_sf_swizzled_layout,
         )
 
-    @register_fake_op("flashinfer::e2m1_and_ufp8sf_scale_to_float_sm100")
-    def _fake_e2m1_and_ufp8sf_scale_to_float_sm100(
+    @register_fake_op("flashinfer::e2m1_and_ufp8sf_scale_to_float")
+    def _fake_e2m1_and_ufp8sf_scale_to_float(
         e2m1_tensor: torch.Tensor,
         ufp8_scale_tensor: torch.Tensor,
         global_scale_tensor: Optional[torch.Tensor] = None,
@@ -223,9 +235,9 @@ def _fake_e2m1_and_ufp8sf_scale_to_float_sm100(
 
     # Register the module
     return SimpleNamespace(
-        fp4_quantize_sm100=fp4_quantize_sm100,
-        block_scale_interleave_sm100=block_scale_interleave_sm100,
-        e2m1_and_ufp8sf_scale_to_float_sm100=e2m1_and_ufp8sf_scale_to_float_sm100,
+        fp4_quantize=fp4_quantize,
+        block_scale_interleave=block_scale_interleave,
+        e2m1_and_ufp8sf_scale_to_float=e2m1_and_ufp8sf_scale_to_float,
     )
 
 
@@ -270,7 +282,7 @@ def fp4_quantize(
         input = input.transpose(-2, -1)
 
     assert input.shape[-1] % sf_vec_size == 0
-    x_q, sf = get_fp4_quantization_sm100_module().fp4_quantize_sm100(
+    x_q, sf = get_fp4_quantization_module().fp4_quantize(
         input,
         global_scale,
         sf_vec_size,
@@ -305,7 +317,7 @@ def block_scale_interleave(unswizzled_sf: torch.Tensor) -> torch.Tensor:
     assert unswizzled_sf.dtype == torch.uint8, (
         f"Input dtype must be uint8, got {unswizzled_sf.dtype}"
     )
-    return get_fp4_quantization_sm100_module().block_scale_interleave_sm100(
+    return get_fp4_quantization_module().block_scale_interleave(
         unswizzled_sf,
     )
 
@@ -336,7 +348,7 @@ def e2m1_and_ufp8sf_scale_to_float(
 
     """
 
-    return get_fp4_quantization_sm100_module().e2m1_and_ufp8sf_scale_to_float_sm100(
+    return get_fp4_quantization_module().e2m1_and_ufp8sf_scale_to_float(
         e2m1_tensor,
         ufp8_scale_tensor,
         global_scale_tensor,
diff --git a/flashinfer/fused_moe/__init__.py b/flashinfer/fused_moe/__init__.py
index 47bf098ac..1bd23326b 100644
--- a/flashinfer/fused_moe/__init__.py
+++ b/flashinfer/fused_moe/__init__.py
@@ -19,7 +19,7 @@
     WeightLayout,
     convert_to_block_layout,
     cutlass_fused_moe,
-    gen_cutlass_fused_moe_sm100_module,
+    gen_cutlass_fused_moe_module,
     reorder_rows_for_gated_act_gemm,
     trtllm_fp4_block_scale_moe,
     trtllm_fp4_block_scale_routed_moe,
@@ -32,7 +32,7 @@
     "WeightLayout",
     "convert_to_block_layout",
     "cutlass_fused_moe",
-    "gen_cutlass_fused_moe_sm100_module",
+    "gen_cutlass_fused_moe_module",
     "reorder_rows_for_gated_act_gemm",
     "trtllm_fp4_block_scale_moe",
     "trtllm_fp8_block_scale_moe",
diff --git a/flashinfer/fused_moe/core.py b/flashinfer/fused_moe/core.py
index 7042274b6..02a9359ae 100644
--- a/flashinfer/fused_moe/core.py
+++ b/flashinfer/fused_moe/core.py
@@ -32,7 +32,7 @@
 )
 from ..jit import JitSpec
 from ..jit import env as jit_env
-from ..jit import gen_jit_spec, setup_cubin_loader, sm100a_nvcc_flags
+from ..jit import gen_jit_spec, setup_cubin_loader, sm100a_nvcc_flags, sm90a_nvcc_flags
 from ..jit.cutlass_gemm.generate_kernels import generate_gemm_operations
 from ..utils import _check_shape_dtype_device, register_custom_op, register_fake_op
 from .utils import (
@@ -114,11 +114,23 @@ def convert_to_block_layout(input_tensor: torch.Tensor, blockK: int) -> torch.Te
     return input_tensor.view(M, K // blockK, blockK).permute(1, 0, 2).contiguous()
 
 
-def gen_cutlass_fused_moe_sm100_module(use_fast_build: bool = False) -> JitSpec:
+def get_device_arch():
+    major, minor = torch.cuda.get_device_capability()
+    suffix = "a" if major >= 9 else ""
+    return f"{major * 10 + minor}{suffix}"
+
+
+def gen_cutlass_fused_moe_module(use_fast_build: bool = False) -> JitSpec:
     output_dir = (
         jit_env.FLASHINFER_CSRC_DIR / "nv_internal/tensorrt_llm/cutlass_instantiations/"
     )
 
+    print(f"get_device_arch(): {get_device_arch()}")
+    if get_device_arch() == "100a":
+        nvcc_flags = sm100a_nvcc_flags
+    else:
+        nvcc_flags = sm90a_nvcc_flags
+
     required_kernels_sm100 = [
         # M128 kernels
         "cutlass_kernel_file_gemm_grouped_sm100_M128_BS_group0.generated.cu",
@@ -180,7 +192,7 @@ def gen_cutlass_fused_moe_sm100_module(use_fast_build: bool = False) -> JitSpec:
         raise RuntimeError(f"Failed to generate Cutlass kernels: {e}") from e
 
     return gen_jit_spec(
-        "fused_moe_sm100",
+        "fused_moe",
         [
             jit_env.FLASHINFER_CSRC_DIR
             / "nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_tma_warp_specialized_input.cu",
@@ -233,7 +245,7 @@ def gen_cutlass_fused_moe_sm100_module(use_fast_build: bool = False) -> JitSpec:
             jit_env.FLASHINFER_CSRC_DIR
             / "nv_internal/tensorrt_llm/kernels/lora/lora.cpp",
         ],
-        extra_cuda_cflags=sm100a_nvcc_flags
+        extra_cuda_cflags=nvcc_flags
         + [
             "-DENABLE_BF16",
             "-DENABLE_FP8",
@@ -269,8 +281,8 @@ def gen_cutlass_fused_moe_sm100_module(use_fast_build: bool = False) -> JitSpec:
 
 
 @functools.cache
-def get_cutlass_fused_moe_sm100_module(use_fast_build: bool = False):
-    module = gen_cutlass_fused_moe_sm100_module(use_fast_build).build_and_load(
+def get_cutlass_fused_moe_module(use_fast_build: bool = False):
+    module = gen_cutlass_fused_moe_module(use_fast_build).build_and_load(
         class_name="FusedMoeRunner"
     )
 
@@ -399,10 +411,10 @@ def refine_tuning_config(cls, tune_max_num_tokens: int):
             )
 
     @register_custom_op(
-        "flashinfer::cutlass_fused_moe_sm100",
+        "flashinfer::cutlass_fused_moe",
         mutates_args=(""),
     )
-    def cutlass_fused_moe_sm100(
+    def cutlass_fused_moe(
         output: torch.Tensor,
         input: torch.Tensor,
         token_selected_experts: torch.Tensor,
@@ -512,8 +524,8 @@ def cutlass_fused_moe_sm100(
 
         return result if min_latency_mode else [result]
 
-    @register_fake_op("flashinfer::cutlass_fused_moe_sm100")
-    def _fake_cutlass_fused_moe_sm100(
+    @register_fake_op("flashinfer::cutlass_fused_moe")
+    def _fake_cutlass_fused_moe(
         output: torch.Tensor,
         input: torch.Tensor,
         token_selected_experts: torch.Tensor,
@@ -560,7 +572,7 @@ def _fake_cutlass_fused_moe_sm100(
 
     # Register the module
     return SimpleNamespace(
-        cutlass_fused_moe_sm100=cutlass_fused_moe_sm100,
+        cutlass_fused_moe=cutlass_fused_moe,
     )
 
 
@@ -732,7 +744,7 @@ def cutlass_fused_moe(
             output, output_shape, output_dtype, input.device, "output"
         )
 
-    return get_cutlass_fused_moe_sm100_module().cutlass_fused_moe_sm100(
+    return get_cutlass_fused_moe_module().cutlass_fused_moe(
         output,
         input,
         token_selected_experts,
diff --git a/tests/test_trtllm_cutlass_fused_moe.py b/tests/test_trtllm_cutlass_fused_moe.py
index c1ed6adb6..72f593105 100644
--- a/tests/test_trtllm_cutlass_fused_moe.py
+++ b/tests/test_trtllm_cutlass_fused_moe.py
@@ -1203,5 +1203,118 @@ def test_moe_mxfp8_mxfp4(
     torch.testing.assert_close(ref_output, flash_output, rtol=1e-1, atol=1e-1)
 
 
+@pytest.mark.parametrize("batch_size", BATCH_SIZES)
+@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
+@pytest.mark.parametrize("num_experts", NUM_EXPERTS)
+@pytest.mark.parametrize("top_k", TOP_K_VALUES)
+@pytest.mark.parametrize("intermediate_size", INTERMEDIATE_SIZES)
+@pytest.mark.parametrize(
+    ("alpha", "beta", "limit"), [(None, None, None), (0.5, 0.0, 7.0), (1.702, 1.0, 7.0)]
+)
+def test_moe_bf16_mxfp4(
+    batch_size,
+    hidden_size,
+    num_experts,
+    top_k,
+    intermediate_size,
+    alpha,
+    beta,
+    limit,
+):
+    """
+    Test MoE with bf16 activations and MXFP4 weights.
+    Uses bf16 for activations and fp4_quantize for weights.
+    """
+    # Skip invalid configurations
+    if top_k > num_experts:
+        pytest.skip(
+            f"top_k ({top_k}) cannot be greater than num_experts ({num_experts})"
+        )
+
+    torch.manual_seed(42)
+    e = num_experts
+    m = batch_size
+    n = intermediate_size
+    k = hidden_size
+
+    x = torch.randn(m, k, dtype=torch.bfloat16).cuda()
+    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=torch.bfloat16) / 10
+    w2 = torch.randn((e, k, n), device="cuda", dtype=torch.bfloat16) / 10
+
+    mxfp4_w1, mxfp4_w1_scale = quant_mxfp4_batches(w1, e)
+    mxfp4_w2, mxfp4_w2_scale = quant_mxfp4_batches(w2, e)
+
+    router_logits = torch.randn(m, e, dtype=torch.bfloat16).cuda()
+    routing_weights, selected_experts = compute_routing(router_logits, top_k)
+
+    fake_input_scale = torch.ones(e, device=x.device)
+
+    quant_scales = [
+        mxfp4_w1_scale.view(torch.int32),
+        fake_input_scale,
+        mxfp4_w2_scale.view(torch.int32),
+        fake_input_scale,
+    ]
+
+    flash_output = torch.zeros_like(x)
+
+    if alpha is not None and limit is not None and beta is not None:
+        alpha_t = torch.ones(e, device=x.device) * alpha
+        limit_t = torch.ones(e, device=x.device) * limit
+        beta_t = torch.ones(e, device=x.device) * beta
+    else:
+        alpha_t = None
+        limit_t = None
+        beta_t = None
+
+    # Call cutlass_fused_moe with BF16 activations and MXFP4 weights
+    _ = fused_moe.cutlass_fused_moe(
+        x,
+        selected_experts.to(torch.int),
+        routing_weights,
+        mxfp4_w1.contiguous().view(torch.long),
+        mxfp4_w2.contiguous().view(torch.long),
+        torch.bfloat16,
+        swiglu_alpha=alpha_t,
+        swiglu_limit=limit_t,
+        swiglu_beta=beta_t,
+        quant_scales=quant_scales,
+        output=flash_output,
+    )
+
+    dq_mfxp4_w1 = (
+        dequant_mxfp4_batches(
+            mxfp4_w1.cpu().view(torch.uint8),
+            mxfp4_w1_scale.cpu().view(torch.uint8).reshape(-1),
+        )
+        .cuda()
+        .to(torch.bfloat16)
+    )
+
+    dq_mfxp4_w2 = (
+        dequant_mxfp4_batches(
+            mxfp4_w2.cpu().view(torch.uint8),
+            mxfp4_w2_scale.cpu().view(torch.uint8).reshape(-1),
+        )
+        .cuda()
+        .to(torch.bfloat16)
+    )
+
+    # Use the dequantized MXFP4 weights for the reference computation
+    ref_output = compute_with_experts(
+        e,
+        x,
+        dq_mfxp4_w1,
+        dq_mfxp4_w2,
+        selected_experts,
+        routing_weights,
+        alpha,
+        beta,
+        limit,
+    )
+
+    torch.testing.assert_close(ref_output, flash_output, rtol=1e-1, atol=1e-1)
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])

From fb6b9e28225f3a1584dbf420a4a4a97341c504ca Mon Sep 17 00:00:00 2001
From: Duncan Moss <djm.moss@gmail.com>
Date: Wed, 13 Aug 2025 13:25:25 -0700
Subject: [PATCH 05/12] final changes to enable bf16xmxfp4 moe for hopper

---
 .../detail/collective/mixed_input_utils.hpp   |  599 +++++++
 .../builders/sm90_gmma_builder_gated.inl      |  244 +++
 .../sm90_gmma_builder_interleaved.inl         |  156 ++
 .../sm90_gmma_builder_mixed_input.inl         |  240 +++
 .../collective/collective_builder_gated.hpp   |   42 +
 .../collective_builder_interleaved.hpp        |   41 +
 .../collective_builder_mixed_input.hpp        |   41 +
 .../collective_mma_array_mixed_input.hpp      |   42 +
 .../gemm/collective/collective_mma_gated.hpp  |   44 +
 .../collective/collective_mma_interleaved.hpp |   42 +
 ...a_gmma_rs_warpspecialized_mixed_input_.hpp | 1474 ++++++++++++++++
 ..._mma_gated_tma_gmma_ss_warpspecialized.hpp |  630 +++++++
 ..._gated_tma_gmma_ss_warpspecialized_fp8.hpp |  644 +++++++
 ...ma_gmma_rs_warpspecialized_mixed_input.hpp | 1528 +++++++++++++++++
 .../include/cutlass_extensions/gemm_configs.h |   41 +-
 .../bf16_int4_gemm_fg_scalebias.cu            |   28 +
 .../bf16_int4_gemm_fg_scaleonly.cu            |   28 +
 .../fpA_intB_gemm/bf16_int4_gemm_per_col.cu   |   28 +
 .../bf16_int8_gemm_fg_scalebias.cu            |   28 +
 .../bf16_int8_gemm_fg_scaleonly.cu            |   28 +
 .../fpA_intB_gemm/bf16_int8_gemm_per_col.cu   |   28 +
 ...m3_int4_gemm_fg_scalebias_bf16_out_bf16.cu |   33 +
 ...e4m3_int4_gemm_fg_scalebias_f16_out_f16.cu |   33 +
 ...m3_int4_gemm_fg_scaleonly_bf16_out_bf16.cu |   33 +
 ...e4m3_int4_gemm_fg_scaleonly_f16_out_f16.cu |   33 +
 .../e4m3_int4_gemm_per_col_f16_out_f16.cu     |   33 +
 .../fp16_int4_gemm_fg_scalebias.cu            |   26 +
 .../fp16_int4_gemm_fg_scaleonly.cu            |   26 +
 .../fpA_intB_gemm/fp16_int4_gemm_per_col.cu   |   26 +
 .../fp16_int8_gemm_fg_scalebias.cu            |   26 +
 .../fp16_int8_gemm_fg_scaleonly.cu            |   26 +
 .../fpA_intB_gemm/fp16_int8_gemm_per_col.cu   |   26 +
 .../fpA_intB_gemm/fpA_intB_gemm.h             |  138 ++
 .../fpA_intB_gemm/fpA_intB_gemm_template.h    |  592 +++++++
 .../fpA_intB_gemm_template_sm90.h             |  287 ++++
 .../launchers/fpA_intB_launcher_sm90.h        |   39 +
 .../launchers/fpA_intB_launcher_sm90.inl      |  294 ++++
 .../moe_gemm_tma_ws_mixed_input_launcher.inl  |    4 +-
 csrc/nv_internal/tensorrt_llm/thop/fp4Op.cpp  |   52 +
 flashinfer/__init__.py                        |    1 +
 flashinfer/fp4_quantization.py                |   89 +-
 flashinfer/fp8_quantization.py                |    9 +-
 flashinfer/fused_moe/core.py                  |  119 +-
 flashinfer/jit/core.py                        |    6 +-
 .../jit/cutlass_gemm/generate_kernels.py      |   33 +-
 flashinfer/utils.py                           |    7 +
 tests/test_fp4_quantize.py                    |   16 +
 tests/test_groupwise_scaled_gemm_mxfp4.py     |    4 +-
 tests/test_trtllm_cutlass_fused_moe.py        |   76 +-
 tests/test_trtllm_gen_fused_moe.py            |    6 +-
 50 files changed, 7905 insertions(+), 164 deletions(-)
 create mode 100644 csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/detail/collective/mixed_input_utils.hpp
 create mode 100644 csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/builders/sm90_gmma_builder_gated.inl
 create mode 100644 csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/builders/sm90_gmma_builder_interleaved.inl
 create mode 100644 csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/builders/sm90_gmma_builder_mixed_input.inl
 create mode 100644 csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/collective_builder_gated.hpp
 create mode 100644 csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/collective_builder_interleaved.hpp
 create mode 100644 csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/collective_builder_mixed_input.hpp
 create mode 100644 csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/collective_mma_array_mixed_input.hpp
 create mode 100644 csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/collective_mma_gated.hpp
 create mode 100644 csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/collective_mma_interleaved.hpp
 create mode 100644 csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp
 create mode 100644 csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_gated_tma_gmma_ss_warpspecialized.hpp
 create mode 100644 csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_gated_tma_gmma_ss_warpspecialized_fp8.hpp
 create mode 100644 csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_interleaved_tma_gmma_rs_warpspecialized_mixed_input.hpp
 create mode 100644 csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_fg_scalebias.cu
 create mode 100644 csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_fg_scaleonly.cu
 create mode 100644 csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_per_col.cu
 create mode 100644 csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_fg_scalebias.cu
 create mode 100644 csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_fg_scaleonly.cu
 create mode 100644 csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_per_col.cu
 create mode 100644 csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scalebias_bf16_out_bf16.cu
 create mode 100644 csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scalebias_f16_out_f16.cu
 create mode 100644 csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scaleonly_bf16_out_bf16.cu
 create mode 100644 csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scaleonly_f16_out_f16.cu
 create mode 100644 csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_per_col_f16_out_f16.cu
 create mode 100644 csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_fg_scalebias.cu
 create mode 100644 csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_fg_scaleonly.cu
 create mode 100644 csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_per_col.cu
 create mode 100644 csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_fg_scalebias.cu
 create mode 100644 csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_fg_scaleonly.cu
 create mode 100644 csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_per_col.cu
 create mode 100644 csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h
 create mode 100644 csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h
 create mode 100644 csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template_sm90.h
 create mode 100644 csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.h
 create mode 100644 csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl

diff --git a/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/detail/collective/mixed_input_utils.hpp b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/detail/collective/mixed_input_utils.hpp
new file mode 100644
index 000000000..9059f7a52
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/detail/collective/mixed_input_utils.hpp
@@ -0,0 +1,599 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "cute/arch/copy_sm90.hpp"
+#include "cute/numeric/arithmetic_tuple.hpp"
+#include "cute/util/type_traits.hpp"
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_conversion.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective::detail {
+
+using namespace cute;
+
+typedef uint32_t __nv_fp4x8_storage_t;             // raw bits of 8 packed 4-bit values
+typedef uint32_t __nv_bf16x2_storage_t;            // raw bits of 2 packed 16-bit values
+typedef cutlass::uint128_t __nv_bf16x8_storage_t;  // raw bits of 8 packed 16-bit values
+
+constexpr int int4_group_size = 128;   // NOTE(review): presumably elements per int4 scale group
+constexpr int mxfp4_group_size = 32;   // NOTE(review): presumably elements per mxfp4 scale group
+
+inline __device__ unsigned prmt(unsigned hi, unsigned lo, unsigned select_code) {
+  unsigned res = 0;
+  // PTX byte permute: each selector nibble picks one byte out of the pair {hi, lo} for the result.
+  asm volatile(
+      "{\n"
+      "prmt.b32 %0, %1, %2, %3;\n"
+      "}\n"
+      : "=r"(res)
+      : "r"(lo), "r"(hi), "r"(select_code));  // %1 = lo (first source), %2 = hi (second source)
+
+  return res;
+}
+
+__device__ __inline__ __nv_fp8x4_storage_t cvt_lut_bf16(unsigned const index) {
+  // Eight one-byte table entries packed into two words. `index` is handed to prmt as the
+  // selector, so each of its nibbles chooses the table byte for one output byte.
+  constexpr __nv_fp8x4_storage_t kTableEntries7to4 = 0x03020100U;
+  constexpr __nv_fp8x4_storage_t kTableEntries3to0 = 0xFFFEFC00U;
+
+  return prmt(kTableEntries7to4, kTableEntries3to0, index);
+}
+
+__device__ __inline__ __nv_bf16x8_storage_t psx_cvt_lut_prmt_fp4x8_to_bf16x8(
+    const __nv_fp4x8_storage_t fp4x8) {
+  __nv_bf16x8_storage_t bf16x8_raw = {0, 0};
+  __nv_bf16x2_storage_t* bf16x2_raw = reinterpret_cast<__nv_bf16x2_storage_t*>(&bf16x8_raw);
+  // The 128-bit result is filled through four 32-bit words, each holding two 16-bit outputs.
+  unsigned zero_padding = 0x00000000U;
+
+  unsigned h4b_em_fp4x4 = (fp4x8 & 0x77770000U) >> 16U;
+  unsigned l4b_em_fp4x4 = (fp4x8 & 0x00007777U);
+  // The low 3 bits of every 4-bit input act as a prmt selector nibble into the lookup table.
+  __nv_fp8x4_storage_t h4b_2to9_bits = cvt_lut_bf16(h4b_em_fp4x4);  // table bytes for inputs 7654
+  __nv_fp8x4_storage_t l4b_2to9_bits = cvt_lut_bf16(l4b_em_fp4x4);  // table bytes for inputs 3210
+  // Each table byte goes to the upper byte of its 16-bit lane; >> 2 then moves it to bits [13:6].
+  bf16x2_raw[0] = prmt(zero_padding, l4b_2to9_bits, 0x1707U) >> 2U;  // 1 0
+  bf16x2_raw[1] = prmt(zero_padding, l4b_2to9_bits, 0x3727U) >> 2U;  // 3 2
+  bf16x2_raw[2] = prmt(h4b_2to9_bits, zero_padding, 0x5040U) >> 2U;  // 5 4
+  bf16x2_raw[3] = prmt(h4b_2to9_bits, zero_padding, 0x7060U) >> 2U;  // 7 6
+  // Bits [3:2] of every 4-bit input are OR-ed into bits [15:14] of the matching 16-bit lane.
+  __nv_bf16x2_storage_t bf16x2_0to1_bits;
+
+  __nv_fp8x4_storage_t h_fp8x2_0to1_bits = (fp4x8 & 0x0000C0C0U);        // 3 1
+  __nv_fp8x4_storage_t l_fp8x2_0to1_bits = (fp4x8 & 0x00000C0CU) << 4U;  // 2 0
+
+  bf16x2_0to1_bits = prmt(h_fp8x2_0to1_bits, l_fp8x2_0to1_bits, 0x4707U);  // 1 0
+  bf16x2_raw[0] = bf16x2_raw[0] | bf16x2_0to1_bits;
+  bf16x2_0to1_bits = prmt(h_fp8x2_0to1_bits, l_fp8x2_0to1_bits, 0x5717U);  // 3 2
+  bf16x2_raw[1] = bf16x2_raw[1] | bf16x2_0to1_bits;
+
+  h_fp8x2_0to1_bits = (fp4x8 & 0xC0C00000U);        // 7 5
+  l_fp8x2_0to1_bits = (fp4x8 & 0x0C0C0000U) << 4U;  // 6 4
+
+  bf16x2_0to1_bits = prmt(h_fp8x2_0to1_bits, l_fp8x2_0to1_bits, 0x6020U);  // 5 4
+  bf16x2_raw[2] = bf16x2_raw[2] | bf16x2_0to1_bits;
+  bf16x2_0to1_bits = prmt(h_fp8x2_0to1_bits, l_fp8x2_0to1_bits, 0x7030U);  // 7 6
+  bf16x2_raw[3] = bf16x2_raw[3] | bf16x2_0to1_bits;
+  // NOTE(review): placement matches e2m1 -> bf16 (sign, top exponent bit, table bits); confirm.
+  return bf16x8_raw;
+}
+
+template <class Collective>
+struct MixedGroupedGemmInputUtils {
+ private:
+  using KernelSchedule = typename Collective::KernelSchedule;
+  using ConversionMode = typename Collective::ConversionMode;
+  using SmemLayoutA = typename Collective::SmemLayoutA;
+  using SmemLayoutB = typename Collective::SmemLayoutB;
+  using SmemLayoutScale = typename Collective::SmemLayoutScale;
+  using SwappedElementA = typename Collective::SwappedElementA;
+  using SwappedElementB = typename Collective::SwappedElementB;
+  using RealSwappedElementA = typename Collective::RealSwappedElementA;
+  using RealSwappedElementB = typename Collective::RealSwappedElementB;
+  using ElementScale = typename Collective::ElementScale;
+  using ElementZero = typename Collective::ElementZero;
+  using SmemCopyAtomScale = typename Collective::SmemCopyAtomScale;
+  static constexpr auto KernelConversionMode = Collective::KernelConversionMode;
+  static constexpr auto ModeHasScales = Collective::ModeHasScales;
+  static constexpr auto UseScaleLookupTable = Collective::UseScaleLookupTable;
+  static constexpr auto UseFP4ToBF16LookupTable = Collective::UseFP4ToBF16LookupTable;
+
+ public:
+  static constexpr auto elements_per_smem_scale() {
+    // Number of scale elements kept in smem: the cosize of the scale layout, or zero when the
+    // kernel converts directly. Any other mode without scales is rejected at compile time.
+    if constexpr (KernelConversionMode != ConversionMode::DirectConvert) {
+      static_assert(ModeHasScales, "Type not handled in scale smem allocation.");
+      return cute::cosize_v<SmemLayoutScale>;
+    } else {
+      return 0;
+    }
+  }
+
+  static constexpr auto elements_per_smem_zero() {
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert ||
+                  KernelConversionMode == ConversionMode::ConvertAndScale) {
+      return 0;  // these modes keep no zero-points in smem
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+      return cute::cosize_v<SmemLayoutScale>;  // zeros are sized with the scale smem layout
+    } else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                    "Type not handled in zero smem allocation.");
+    }
+  }
+
+  // These methods use some of the public members of the class. For that reason, we define them
+  // after the public section.
+  static constexpr uint32_t compute_tma_transaction_bytes_mk() {
+    constexpr uint32_t kBitsPerElementA = static_cast<uint32_t>(cute::sizeof_bits_v<SwappedElementA>);
+    return cutlass::bits_to_bytes(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * kBitsPerElementA);
+  }
+
+  static constexpr uint32_t compute_tma_transaction_bytes_nk() {
+    constexpr uint32_t kBitsPerElementB = static_cast<uint32_t>(cute::sizeof_bits_v<SwappedElementB>);
+    return cutlass::bits_to_bytes(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * kBitsPerElementB);
+  }
+
+  static constexpr uint32_t compute_tma_transaction_bytes_extra() {
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      return 0;  // no scale/zero bytes are counted in this mode
+    } else if constexpr (ModeHasScales) {
+      constexpr uint32_t scale_tx_bytes =  // modes 0 and 1 of the scale layout times element bits
+          cutlass::bits_to_bytes(size<0>(SmemLayoutScale{}) * size<1>(SmemLayoutScale{}) *
+                                 static_cast<uint32_t>(cute::sizeof_bits_v<ElementScale>));
+      static_assert(scale_tx_bytes % 128 == 0,
+                    "Each scale stage must be 128B aligned.");  // required by TMA
+      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+        return scale_tx_bytes;
+      } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+        // Zeros reuse SmemLayoutScale, so only the element width differs from the scale bytes
+        constexpr uint32_t zero_tx_bytes =
+            cutlass::bits_to_bytes(size<0>(SmemLayoutScale{}) * size<1>(SmemLayoutScale{}) *
+                                   static_cast<uint32_t>(cute::sizeof_bits_v<ElementZero>));
+        static_assert(zero_tx_bytes % 128 == 0,
+                      "Each zero stage must be 128B aligned.");  // required by TMA
+        return scale_tx_bytes + zero_tx_bytes;  // scales and zeros are both counted
+      } else {
+        static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                      "Type not handled in tma transaction bytes computation.");
+      }
+    } else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                    "Type not handled in tma transaction bytes computation.");
+    }
+  }
+
+  /// Copies the `k_block` slice of A (and, when k_block == 0, the scales/zeros) from smem to RF.
+  template <class SmemTiledCopyA, class TensorASmemView, class TensorACopyView, class... Ts,
+            class... Us>
+  CUTLASS_DEVICE static void copy_tensors_MK(SmemTiledCopyA const& smem_tiled_copy_A,
+                                             TensorASmemView const& tCsA,
+                                             TensorACopyView& tCrA_copy_view,
+                                             cute::tuple<Ts...> const& partitioned_mma_extra_info,
+                                             cute::tuple<Us...> const& tiled_copy_and_views,
+                                             int k_block, int read_stage) {
+    copy(smem_tiled_copy_A, tCsA(_, _, k_block, read_stage), tCrA_copy_view(_, _, k_block));
+
+    if (k_block == 0) {
+      // We are starting a new k-tile, so also copy the scales (and zeros) of stage `read_stage`
+      if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+        // nothing to do: this mode has no extra inputs
+      } else if constexpr (ModeHasScales) {
+        auto smem_tiled_copy_S = cute::get<0>(tiled_copy_and_views);  // also used for the zeros
+        auto tCrS_copy_view = cute::get<1>(tiled_copy_and_views);     // register-side scale view
+        auto tCsS = cute::get<0>(partitioned_mma_extra_info);         // smem-side scale partition
+        copy(smem_tiled_copy_S, tCsS(_, _, k_block, read_stage), tCrS_copy_view(_, _, k_block));
+        if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+          // Nothing extra to do
+        } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+          auto tCsZ = cute::get<2>(partitioned_mma_extra_info);
+          auto tCrZ_copy_view = cute::get<2>(tiled_copy_and_views);
+          copy(smem_tiled_copy_S, tCsZ(_, _, k_block, read_stage), tCrZ_copy_view(_, _, k_block));
+        } else {
+          static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                        "Conversion mode not handled in A -> RF path.");
+        }
+      } else {
+        static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                      "Conversion mode not handled in A -> RF path.");
+      }
+    }
+  }
+
+  // Lookup-table converter, i4 -> 8-bit values: overload taking a temporary `dst`.
+  template <class EngineIn, class LayoutIn, class EngineOut, class LayoutOut, class EngineScale,
+            class LayoutScale>
+  CUTLASS_DEVICE static void lookup_table_convert(  // Accept mutable temporaries
+      Tensor<EngineIn, LayoutIn> const& src, Tensor<EngineOut, LayoutOut>&& dst,
+      Tensor<EngineScale, LayoutScale> const& scales_neg,
+      Tensor<EngineScale, LayoutScale> const& scales_pos) {
+    lookup_table_convert(src, dst, scales_neg, scales_pos);  // named `dst` binds to the `&` overload
+  }
+
+  template <class EngineIn, class LayoutIn, class EngineOut, class LayoutOut, class EngineScale,
+            class LayoutScale>
+  CUTLASS_DEVICE static void lookup_table_convert(
+      Tensor<EngineIn, LayoutIn> const& src, Tensor<EngineOut, LayoutOut>& dst,
+      Tensor<EngineScale, LayoutScale> const& scales_neg,
+      Tensor<EngineScale, LayoutScale> const& scales_pos) {
+    constexpr int N = cute::cosize(LayoutIn{});
+    static_assert(N == 4 || N == 8);
+    static_assert(cosize(LayoutScale{}) <= N / 4,
+                  "at least 4 consecutive weights must share the same scale.");
+    using SrcArray = cutlass::Array<cutlass::int4b_t, 8>;
+    using DstArray = cutlass::Array<RealSwappedElementB, 8>;
+    using RegArray = cutlass::AlignedArray<uint32_t, N / 4, sizeof(DstArray)>;
+    // One 32-bit register of packed inputs is read; N / 4 32-bit output registers are written.
+    // View the packed input and the output as 32-bit registers
+    auto&& src_reg = cute::recast<uint32_t>(src)(0);
+    auto&& r = cute::recast<RegArray>(dst)(0);
+    // Build the selector for the final prmt: after the shift each nibble is 4 * sign_bit + i % 4,
+    // so a set sign bit picks the byte from the negative candidates, otherwise from the positive.
+    static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa;  // lop3 truth table for (a & b) | c
+    uint32_t sign;  // ((reg & 0x88888888) | 0x64206420) >> 1
+    asm volatile(
+        "{\n"
+        "  lop3.b32 %0, %1, %2, %3, %4;\n"
+        "}\n"
+        : "=r"(sign)
+        : "r"(src_reg), "n"(0x88888888), "n"(0x64206420), "n"(immLut));
+    sign = sign >> 1;
+
+    // Ignore sign bit when indexing into LUT: the low 3 bits of a nibble select one of 8 bytes
+    uint32_t lut_idx = src_reg & 0x77777777;
+    Tensor scales_neg_ = cute::filter(scales_neg);
+    Tensor scales_pos_ = cute::filter(scales_pos);
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N / 4; ++i, lut_idx >>= 16, sign >>= 16) {
+      auto&& scale_neg_ = reinterpret_cast<cutlass::Array<uint32_t, 2> const&>(scales_neg_(i));
+      auto&& scale_pos_ = reinterpret_cast<cutlass::Array<uint32_t, 2> const&>(scales_pos_(i));
+      asm volatile(
+          "{\n"
+          "  .reg .b32 pos, neg                    ;\n"
+          "  prmt .b32 neg, %3, %4, %1             ;\n"
+          "  prmt .b32 pos, %5, %6, %1             ;\n"
+          "  prmt .b32 %0, pos, neg, %2            ;\n"
+          "}\n"
+          : "=r"(r[i])
+          : "r"(lut_idx), "r"(sign), "r"(scale_neg_[0]), "r"(scale_neg_[1]), "r"(scale_pos_[0]),
+            "r"(scale_pos_[1]));  // %1 = lut_idx, %2 = sign, %3/%4 = neg table, %5/%6 = pos table
+    }
+  }
+
+  // Rvalue-`dst` overload of the fp4 -> bf16 lookup-table conversion; forwards to the lvalue one.
+  template <class EngineIn, class LayoutIn, class EngineOut,
+            class LayoutOut>
+  CUTLASS_DEVICE static void fp4tobf16_lookup_table_convert(  // Accept mutable temporaries
+      Tensor<EngineIn, LayoutIn> const& src, Tensor<EngineOut, LayoutOut>&& dst) {
+    fp4tobf16_lookup_table_convert(src, dst);  // named `dst` binds to the `&` overload
+  }
+
+  template <class EngineIn, class LayoutIn, class EngineOut, class LayoutOut>
+  CUTLASS_DEVICE static void fp4tobf16_lookup_table_convert(Tensor<EngineIn, LayoutIn> const& src,
+                                                            Tensor<EngineOut, LayoutOut>& dst) {
+    // Reinterpret element 0 of each tensor as one packed register: a 32-bit fp4x8 word on the
+    // input side and a 128-bit bf16x8 word on the output side.
+    auto&& packed_fp4 = cute::recast<__nv_fp4x8_storage_t>(src)(0);
+    auto&& packed_bf16 = cute::recast<__nv_bf16x8_storage_t>(dst)(0);
+    packed_bf16 = psx_cvt_lut_prmt_fp4x8_to_bf16x8(packed_fp4);
+  }
+
+  /// Utilities to dequantize A. Compile-time check that mode 0 of a scale layout is a broadcast.
+  template <class Layout>
+  CUTLASS_DEVICE static void static_check_scale(Layout const& tensor) {
+    static_assert(shape<0>(Layout{}) >= 4 && stride<0>(Layout{}) == 0,  // stride 0 => shared
+                  "At least 4 adjacent weights in a thread must share the same scale.");
+  }
+
+  template <class Engine, class Layout>
+  CUTLASS_DEVICE static void static_check_scale(Tensor<Engine, Layout> const& tensor) {
+    static_check_scale(flatten(Layout{}));  // the check runs on the flattened layout's mode 0
+  }
+
+  template <class EngineIn, class EngineOut, class LayoutIn, class LayoutOut, class... Ts>
+  CUTLASS_DEVICE static void dequantize_A_kblock(Tensor<EngineIn, LayoutIn> const& tCrA_load,
+                                                 Tensor<EngineOut, LayoutOut>& tCrA_mma,
+                                                 cute::tuple<Ts...>& partitioned_extra_info,
+                                                 int const k_block) {
+    static_assert(is_rmem<EngineIn>::value,
+                  "Input tensor for A conversion must come from registers");
+    static_assert(is_rmem<EngineOut>::value,
+                  "Output tensor for A conversion must come from registers");
+    static_assert(cosize_v<LayoutIn> == cosize_v<LayoutOut>);  // same footprint in and out
+    static_assert(size_v<LayoutIn> == cosize_v<LayoutIn>);     // input layout has no gaps
+    static_assert(size_v<LayoutOut> == cosize_v<LayoutOut>);   // output layout has no gaps
+    using SrcType = typename EngineIn::value_type;
+    using DstType = typename EngineOut::value_type;
+    // Converts the k_block slice of tCrA_load into tCrA_mma, applying scales/zeros per the mode.
+    Tensor src = tCrA_load(_, _, k_block);
+    Tensor dst = tCrA_mma(_, _, k_block);
+
+    CUTE_STATIC_ASSERT_V(size(src(_, 0)) == cosize(src(_, 0).layout()),
+                         "The first mode of tensor src must be contiguous in memory");
+    // Chunk the values so one chunk spans at most 32 bits: min(first-mode size, 32 / value bits)
+    int constexpr NumValPerSrcReg =
+        cute::min(decltype(size(src(_, 0)))::value, ceil_div(32, sizeof_bits_v<SrcType>));
+    Tensor src_vm = cute::group_modes<1, -1>(cute::zipped_divide(src, Int<NumValPerSrcReg>{}));
+    Tensor dst_vm = cute::group_modes<1, -1>(cute::zipped_divide(dst, Int<NumValPerSrcReg>{}));
+    // src_vm / dst_vm: mode 0 indexes the values inside a chunk, mode 1 indexes the chunks.
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < size<1>(dst_vm); ++i) {
+        LayoutAwareConvert(src_vm(_, i), dst_vm(_, i));
+      }
+    } else if constexpr (UseScaleLookupTable) {
+      constexpr int num_elements = decltype(size(src))::value;
+      static_assert(is_same_v<RealSwappedElementA, cutlass::int4b_t>,
+                    "Lookup table only supports int4 being the quant type now.");
+      static_assert(sizeof_bits_v<ElementScale> == 64,
+                    "Lookup table only supports 8 8bit scale values now.");
+      static_assert(num_elements % 4 == 0 && num_elements >= 4,
+                    "Lookup table requires a vector size of 4x when converting.");
+      // Slot 1 is read as the negative-candidate table; slot 2 is a buffer rewritten below.
+      Tensor tCrS_neg = cute::get<1>(partitioned_extra_info);
+      auto&& tCrS_pos =
+          cute::get<2>(partitioned_extra_info);  // modification to its value is needed
+      Tensor scales_neg = tCrS_neg(_, _, k_block);
+      Tensor scales_pos = tCrS_pos(_, _, k_block);
+      CUTE_STATIC_ASSERT_V(cute::size(src) == cute::size(scales_neg));
+
+      static_check_scale(scales_neg);
+      static_check_scale(scales_pos);
+      Tensor scales_neg_vm =
+          cute::group_modes<1, -1>(cute::zipped_divide(scales_neg, Int<NumValPerSrcReg>{}));
+      Tensor scales_pos_vm =
+          cute::group_modes<1, -1>(cute::zipped_divide(scales_pos, Int<NumValPerSrcReg>{}));
+      // First k-block only: pos[0] = (neg[0] & 0xFFFFFF00) ^ 0x80808080, pos[1] = neg[1] ^ 0x80808080.
+      if (k_block == 0) {
+        Tensor scales_neg_vm_ = filter(scales_neg_vm);
+        Tensor scales_pos_vm_ = filter(scales_pos_vm);
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < size(scales_neg_vm_.layout()); ++i) {
+          auto&& scale_neg_ =
+              reinterpret_cast<cutlass::Array<uint32_t, 2> const&>(scales_neg_vm_(i));
+          auto&& scale_pos_ = reinterpret_cast<cutlass::Array<uint32_t, 2>&>(scales_pos_vm_(i));
+          constexpr uint32_t immLut = (0xf0 & 0xcc) ^ 0xaa;  // lop3 truth table for (a & b) ^ c
+          asm volatile(
+              "{\n"
+              "  lop3 .b32 %0, %2, %4, %5, %6;\n"
+              "  xor  .b32 %1, %3, %5;        \n"
+              "}\n"
+              : "=r"(scale_pos_[0]), "=r"(scale_pos_[1])
+              : "r"(scale_neg_[0]), "r"(scale_neg_[1]), "n"(0xFFFFFF00), "n"(0x80808080),
+                "n"(immLut));
+        }
+      }
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < size<1>(dst_vm); ++i) {
+        lookup_table_convert(src_vm(_, i), dst_vm(_, i), scales_neg_vm(_, i), scales_pos_vm(_, i));
+      }
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+      Tensor scales = cute::get<1>(partitioned_extra_info)(_, _, k_block);
+      CUTE_STATIC_ASSERT_V(size(src) == size(scales));
+      Tensor scales_vm =
+          cute::group_modes<1, -1>(cute::zipped_divide(scales, Int<NumValPerSrcReg>{}));
+      // Scale in place when dst has the scale type; otherwise go through an ElementScale stage.
+      if constexpr (is_same_v<DstType, ElementScale>) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < size<1>(dst_vm); ++i) {
+          LayoutAwareConvert(src_vm(_, i), dst_vm(_, i));
+          CUTLASS_PRAGMA_UNROLL
+          for (int j = 0; j < size<0>(dst_vm); ++j) {
+            dst_vm(j, i) *= scales_vm(j, i);
+          }
+        }
+      } else {
+        auto stage = make_tensor_like<ElementScale>(src_vm(_, 0));
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < size<1>(dst_vm); ++i) {
+          LayoutAwareConvert(src_vm(_, i), stage);
+          CUTLASS_PRAGMA_UNROLL
+          for (int j = 0; j < size<0>(dst_vm); ++j) {
+            stage(j) *= scales_vm(j, i);
+          }
+          LayoutAwareConvert(stage, dst_vm(_, i));
+        }
+      }
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+      static_assert(is_same_v<ElementScale, ElementZero>,
+                    "ElementScale and ElementZero must be the same.");
+      Tensor scales = cute::get<1>(partitioned_extra_info)(_, _, k_block);
+      Tensor zeros = cute::get<3>(partitioned_extra_info)(_, _, k_block);
+      CUTE_STATIC_ASSERT_V(size(src) == size(scales));
+      CUTE_STATIC_ASSERT_V(size(src) == size(zeros));
+      Tensor scales_vm =
+          cute::group_modes<1, -1>(cute::zipped_divide(scales, Int<NumValPerSrcReg>{}));
+      Tensor zeros_vm =
+          cute::group_modes<1, -1>(cute::zipped_divide(zeros, Int<NumValPerSrcReg>{}));
+      // Same two paths as ConvertAndScale, with the zero-point added after scaling.
+      if constexpr (is_same_v<DstType, ElementScale>) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < size<1>(dst_vm); ++i) {
+          LayoutAwareConvert(src_vm(_, i), dst_vm(_, i));
+          CUTLASS_PRAGMA_UNROLL
+          for (int j = 0; j < size<0>(dst_vm); ++j) {
+            dst_vm(j, i) = dst_vm(j, i) * scales_vm(j, i) + zeros_vm(j, i);
+          }
+        }
+      } else {
+        auto stage = make_tensor_like<ElementScale>(src_vm(_, 0));
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < size<1>(dst_vm); ++i) {
+          LayoutAwareConvert(src_vm(_, i), stage);
+          CUTLASS_PRAGMA_UNROLL
+          for (int j = 0; j < size<0>(dst_vm); ++j) {
+            stage(j) = stage(j) * scales_vm(j, i) + zeros_vm(j, i);
+          }
+          LayoutAwareConvert(stage, dst_vm(_, i));
+        }
+      }
+    } else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>, "No A data is loaded.");
+    }
+  }
+
+  template <class EngineIn, class EngineOut, class LayoutIn, class LayoutOut, class... Ts>
+  CUTLASS_DEVICE static void convert_A_kblock(Tensor<EngineIn, LayoutIn> const& tCrA_load,
+                                              Tensor<EngineOut, LayoutOut>& tCrA_mma,
+                                              int const k_block) {
+    static_assert(is_rmem<EngineIn>::value,
+                  "Input tensor for A conversion must come from registers");
+    static_assert(is_rmem<EngineOut>::value,
+                  "Output tensor for A conversion must come from registers");
+    static_assert(cosize_v<LayoutIn> == cosize_v<LayoutOut>);  // same footprint in and out
+    static_assert(size_v<LayoutIn> == cosize_v<LayoutIn>);     // input layout has no gaps
+    static_assert(size_v<LayoutOut> == cosize_v<LayoutOut>);   // output layout has no gaps
+    using SrcType = typename EngineIn::value_type;
+    // Converts the k_block slice of tCrA_load into tCrA_mma without applying scales or zeros.
+    Tensor src = tCrA_load(_, _, k_block);
+    Tensor dst = tCrA_mma(_, _, k_block);
+
+    CUTE_STATIC_ASSERT_V(size(src(_, 0)) == cosize(src(_, 0).layout()),
+                         "The first mode of tensor src must be contiguous in memory");
+    // Chunk the values so one chunk spans at most 32 bits: min(first-mode size, 32 / value bits)
+    int constexpr NumValPerSrcReg =
+        cute::min(decltype(size(src(_, 0)))::value, ceil_div(32, sizeof_bits_v<SrcType>));
+    Tensor src_vm = cute::group_modes<1, -1>(cute::zipped_divide(src, Int<NumValPerSrcReg>{}));
+    Tensor dst_vm = cute::group_modes<1, -1>(cute::zipped_divide(dst, Int<NumValPerSrcReg>{}));
+
+    // KernelConversionMode == ConversionMode::DirectConvert (the mode is not checked in this body)
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < size<1>(dst_vm); ++i) {
+      if constexpr (UseFP4ToBF16LookupTable) {
+        fp4tobf16_lookup_table_convert(src_vm(_, i), dst_vm(_, i));  // prmt lookup-table path
+      } else {
+        LayoutAwareConvert(src_vm(_, i), dst_vm(_, i));  // generic converter
+      }
+    }
+  }
+
+  /// Partitions the scale (and zero) tensors for the TMA load; empty tuple for DirectConvert.
+  template <class Params, class TensorStorage, class... Ts>
+  CUTLASS_DEVICE static auto partition_extra_tma_inputs(Params const& mainloop_params,
+                                                        cute::tuple<Ts...> const& load_inputs,
+                                                        TensorStorage& shared_tensors,
+                                                        uint2 const& cluster_local_block_id,
+                                                        int const m_coord, int const l_coord) {
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      return cute::make_tuple();  // no extra inputs in this mode
+    } else if constexpr (ModeHasScales) {
+      Tensor sS = make_tensor(make_smem_ptr(shared_tensors.smem_scale.begin()),
+                              SmemLayoutScale{});  // (BLK_M,BLK_K,PIPE)
+      Tensor gS_mkl = get<2>(load_inputs);  // slot 2 of load_inputs is used as the scale tensor
+      auto block_tma_s = mainloop_params.tma_load_scale.get_slice(cluster_local_block_id.y);
+      Tensor gS = gS_mkl(_, _, m_coord, _, l_coord);  // (BLK_M,BLK_K,k)
+
+      Tensor tSgS = block_tma_s.partition_S(gS);  // (TMA,TMA_M,TMA_K,k)
+      Tensor tSsS = block_tma_s.partition_D(sS);  // (TMA,TMA_M,TMA_K,PIPE)
+      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+        return cute::make_tuple(tSgS, tSsS);  // (source partition, smem destination partition)
+      } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+        Tensor sZ = make_tensor(make_smem_ptr(shared_tensors.smem_zero.begin()),
+                                SmemLayoutScale{});  // (BLK_M,BLK_K,PIPE)
+        Tensor gZ_mkl = get<3>(load_inputs);  // slot 3 of load_inputs is used as the zero tensor
+        auto block_tma_z = mainloop_params.tma_load_zero.get_slice(cluster_local_block_id.y);
+        Tensor gZ = gZ_mkl(_, _, m_coord, _, l_coord);  // (BLK_M,BLK_K,k)
+
+        Tensor tZgZ = block_tma_z.partition_S(gZ);  // (TMA,TMA_M,TMA_K,k)
+        Tensor tZsZ = block_tma_z.partition_D(sZ);  // (TMA,TMA_M,TMA_K,PIPE)
+        return cute::make_tuple(tSgS, tSsS, tZgZ, tZsZ);  // scale pair followed by zero pair
+      } else {
+        static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                      "Conversion mode not handled for input partitioning.");
+      }
+    } else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                    "Conversion mode not handled for input partitioning.");
+    }
+  }
+
+  /// Partitions the smem scale (and zero) tensors for one MMA thread and allocates RF fragments.
+  template <class ThreadMma, class TensorStorage>
+  CUTLASS_DEVICE static auto partition_extra_mma_info(ThreadMma const& mma_thread_slice,
+                                                      TensorStorage& shared_tensors) {
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      // nothing to do
+      return cute::make_tuple();
+    } else if constexpr (UseScaleLookupTable) {
+      Tensor sS = make_tensor(make_smem_ptr(shared_tensors.smem_scale.begin()),
+                              SmemLayoutScale{});  // (BLK_M,BLK_SCALE_K,PIPE)
+      Tensor tCsS = mma_thread_slice.partition_A(sS);
+      Tensor tCrS_neg = make_tensor<ElementScale>(
+          mma_thread_slice.partition_fragment_A(sS(_, _, Int<0>{})).layout());
+      Tensor tCrS_pos = make_tensor<ElementScale>(tCrS_neg.layout());  // dequantize_A_kblock get<2>
+      return cute::make_tuple(tCsS, tCrS_neg, tCrS_pos);  // slots 1 and 2 are both read there
+    } else if constexpr (ModeHasScales) {
+      Tensor sS = make_tensor(make_smem_ptr(shared_tensors.smem_scale.begin()),
+                              SmemLayoutScale{});  // (BLK_M,BLK_SCALE_K,PIPE)
+      Tensor tCsS = mma_thread_slice.partition_A(sS);
+      Tensor tCrS = make_tensor<ElementScale>(
+          mma_thread_slice.partition_fragment_A(sS(_, _, Int<0>{})).layout());
+
+      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+        return cute::make_tuple(tCsS, tCrS);  // (smem scale partition, register scale fragment)
+      } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+        Tensor sZ = make_tensor(make_smem_ptr(shared_tensors.smem_zero.begin()),
+                                SmemLayoutScale{});  // (BLK_M,BLK_SCALE_K,PIPE)
+        Tensor tCsZ = mma_thread_slice.partition_A(sZ);
+        Tensor tCrZ = make_tensor<ElementZero>(
+            mma_thread_slice.partition_fragment_A(sZ(_, _, Int<0>{})).layout());
+        return cute::make_tuple(tCsS, tCrS, tCsZ, tCrZ);  // scale pair followed by zero pair
+      } else {
+        static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                      "Conversion mode not handled in A -> RF path.");
+      }
+    } else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                    "Conversion mode not handled in A -> RF path.");
+    }
+  }
+
+  /// Returns the scale tiled copy plus retiled copy views of the scale (and zero) RF fragments.
+  template <class TiledMma, class... Ts>
+  CUTLASS_DEVICE static auto retile_extra_mma_info(TiledMma const& tiled_mma,
+                                                   cute::tuple<Ts...>& partitioned_extra_info,
+                                                   int const warp_group_thread_idx) {
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      // nothing to do
+      return cute::make_tuple();
+    } else if constexpr (ModeHasScales) {
+      auto smem_tiled_copy_S = make_tiled_copy_A(SmemCopyAtomScale{}, tiled_mma);
+      auto smem_thr_copy_S = smem_tiled_copy_S.get_thread_slice(warp_group_thread_idx);
+      Tensor tCrS_copy_view =
+          smem_thr_copy_S.retile_D(cute::get<1>(partitioned_extra_info));  // (CPY,CPY_M,CPY_K)
+
+      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+        return cute::make_tuple(smem_tiled_copy_S, tCrS_copy_view);  // (tiled copy, scale view)
+      } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+        Tensor tCrZ_copy_view =
+            smem_thr_copy_S.retile_D(cute::get<3>(partitioned_extra_info));  // (CPY,CPY_M,CPY_K)
+        return cute::make_tuple(smem_tiled_copy_S, tCrS_copy_view, tCrZ_copy_view);
+      } else {
+        static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                      "Conversion mode not handled in A -> RF path.");
+      }
+    } else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                    "Conversion mode not handled in A -> RF path.");
+    }
+  }
+};
+
+}  // namespace cutlass::gemm::collective::detail
diff --git a/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/builders/sm90_gmma_builder_gated.inl b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/builders/sm90_gmma_builder_gated.inl
new file mode 100644
index 000000000..a11451764
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/builders/sm90_gmma_builder_gated.inl
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "cutlass/arch/mma.h"
+#include "cutlass/gemm/collective/builders/sm90_common.inl"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/gemm.h"
+
+// SM90 Collective Builders should be used only starting CUDA 12.0
+#if (__CUDACC_VER_MAJOR__ >= 12)
+#define CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+// Derives how many mainloop smem stages fit in CapacityBytes minus the requested carveout.
+// The operand selected by SwapAB (A when true, B when false) is counted twice per stage.
+template <int CapacityBytes, class ElementA, class ElementB, class TileShapeMNK, bool SwapAB,
+          int carveout_bytes>
+constexpr int compute_stage_count_or_override_gated(
+    StageCountAutoCarveout<carveout_bytes> stage_count) {
+  // Per-stage allowance for barriers etc.
+  constexpr int kBarrierBytes = 32;
+  // Tile extents taken from modes 0, 1 and 2 of the tile shape.
+  constexpr int kTileM = size<0>(TileShapeMNK{});
+  constexpr int kTileN = size<1>(TileShapeMNK{});
+  constexpr int kTileK = size<2>(TileShapeMNK{});
+  // Bits occupied by a single copy of each operand tile.
+  constexpr int kTileBitsA = static_cast<int>(sizeof_bits<ElementA>::value) * kTileM * kTileK;
+  constexpr int kTileBitsB = static_cast<int>(sizeof_bits<ElementB>::value) * kTileN * kTileK;
+  // The doubled operand is A under SwapAB and B otherwise.
+  constexpr int kCopiesA = SwapAB ? 2 : 1;
+  constexpr int kCopiesB = SwapAB ? 1 : 2;
+  constexpr int kStageBytes =
+      (kTileBitsA * kCopiesA) / 8 + (kTileBitsB * kCopiesB) / 8 + kBarrierBytes;
+
+  return (CapacityBytes - carveout_bytes) / kStageBytes;
+}
+
+}  // namespace detail
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA_TMA_WS_SS: SM90 gated builder for TMA warp-specialized schedules that do not use rmem A.
+template <class ElementA, class GmemLayoutA, int AlignmentA, class ElementB, class GmemLayoutB,
+          int AlignmentB, class ElementAccumulator, class TileShape_MNK, class ClusterShape_MNK,
+          class StageCountType, class KernelScheduleType,
+          template <class /* ElementCompute */> class Activation, bool SwapAB>
+struct CollectiveBuilderGated<
+    arch::Sm90, arch::OpClassTensorOp, ElementA, GmemLayoutA, AlignmentA, ElementB, GmemLayoutB,
+    AlignmentB, ElementAccumulator, TileShape_MNK, ClusterShape_MNK, StageCountType,
+    KernelScheduleType, Activation, SwapAB,
+    cute::enable_if_t<
+        (cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecialized> ||
+         cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedPingpong> ||
+         cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedCooperative> ||
+         cute::is_same_v<KernelScheduleType, KernelPtrArrayTmaWarpSpecializedCooperative>) &&
+        not detail::is_use_rmem_A<ElementA, GmemLayoutA, ElementB, GmemLayoutB>()>> {
+  static_assert(is_static<TileShape_MNK>::value);
+  static_assert(is_static<ClusterShape_MNK>::value);
+#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
+  static_assert(cutlass::detail::dependent_false<ElementA>,
+                "Unsupported Toolkit for SM90 Collective Builder\n");
+#endif
+  static_assert(
+      detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::tma_alignment_bytes>(),
+      "Should meet TMA alignment requirement\n");
+  // FP8 inputs may not be combined with the pointer-array schedule (see the assert below).
+  static constexpr bool IsArrayOfPointersGemm =
+      (cute::is_same_v<KernelScheduleType, KernelPtrArrayTmaWarpSpecializedCooperative>);
+  static constexpr bool IsFP8Input = detail::is_input_fp8<ElementA, ElementB>();
+  static_assert(!IsFP8Input || (IsFP8Input && !IsArrayOfPointersGemm),
+                "Kernel[Array/Group]TmaWarpSpecializedCooperative is only compatible with FP8 "
+                "FastAccum version right now\n");
+
+  // fp32 operands are mapped to tfloat32_t for the MMA; other element types are used as-is.
+  using MmaElementA = cute::conditional_t<cute::is_same_v<ElementA, float>, tfloat32_t, ElementA>;
+  using MmaElementB = cute::conditional_t<cute::is_same_v<ElementB, float>, tfloat32_t, ElementB>;
+  // GMMA major mode of each operand, derived from its MMA element type and gmem layout tag.
+  static constexpr cute::GMMA::Major GmmaMajorA =
+      detail::gmma_ss_tag_to_major_A<MmaElementA, GmemLayoutA>();
+  static constexpr cute::GMMA::Major GmmaMajorB =
+      detail::gmma_ss_tag_to_major_B<MmaElementB, GmemLayoutB>();
+  // Atom layout is 2x1x1 for the cooperative and pointer-array schedules, 1x1x1 otherwise.
+  using AtomLayoutMNK = cute::conditional_t<
+      cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedCooperative> ||
+          IsArrayOfPointersGemm,
+      Layout<Shape<_2, _1, _1>>, Layout<Shape<_1, _1, _1>>>;
+  // Tiled MMA built from the op that ss_op_selector returns for these types, tile and majors.
+  using TiledMma = decltype(cute::make_tiled_mma(
+      cute::GMMA::ss_op_selector<MmaElementA, MmaElementB, ElementAccumulator, TileShape_MNK,
+                                 GmmaMajorA, GmmaMajorB>(),
+      AtomLayoutMNK{}));
+  // TMA copy atoms: A's is selected from cluster mode 1, B's from cluster mode 0.
+  using GmemTiledCopyA =
+      decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{})));
+  using GmemTiledCopyB =
+      decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{})));
+  // Smem layout atoms: A is selected from tile modes 0 and 2, B from tile modes 1 and 2.
+  using SmemLayoutAtomA =
+      decltype(detail::ss_smem_selector<GmmaMajorA, MmaElementA,
+                                        decltype(cute::get<0>(TileShape_MNK{})),
+                                        decltype(cute::get<2>(TileShape_MNK{}))>());
+  using SmemLayoutAtomB =
+      decltype(detail::ss_smem_selector<GmmaMajorB, MmaElementB,
+                                        decltype(cute::get<1>(TileShape_MNK{})),
+                                        decltype(cute::get<2>(TileShape_MNK{}))>());
+  // Stage count comes from the gated helper, which also receives SwapAB.
+  static constexpr int PipelineStages =
+      detail::compute_stage_count_or_override_gated<detail::sm90_smem_capacity_bytes, MmaElementA,
+                                                    MmaElementB, TileShape_MNK, SwapAB>(
+          StageCountType{});
+  using DispatchPolicy = cute::conditional_t<
+      IsArrayOfPointersGemm,
+      MainloopSm90ArrayTmaGmmaWarpSpecialized<PipelineStages, ClusterShape_MNK, KernelScheduleType>,
+      /* For FP8 use a separate mainloop compared to other datatypes */
+      cute::conditional_t<IsFP8Input,
+                          MainloopSm90TmaGmmaWarpSpecializedFP8<PipelineStages, ClusterShape_MNK,
+                                                                KernelScheduleType>,
+                          MainloopSm90TmaGmmaWarpSpecialized<PipelineStages, ClusterShape_MNK,
+                                                             KernelScheduleType>>>;
+  // No smem copy atom is supplied for either operand.
+  using SmemCopyAtomA = void;
+  using SmemCopyAtomB = void;
+  // Resulting gated collective; Activation and SwapAB are forwarded as its last two arguments.
+  using CollectiveOp =
+      CollectiveMmaGated<DispatchPolicy, TileShape_MNK, ElementA, TagToStrideA_t<GmemLayoutA>,
+                         ElementB, TagToStrideB_t<GmemLayoutB>, TiledMma, GmemTiledCopyA,
+                         SmemLayoutAtomA, SmemCopyAtomA, cute::identity, GmemTiledCopyB,
+                         SmemLayoutAtomB, SmemCopyAtomB, cute::identity, Activation, SwapAB>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA_TMA_WS_FP8_FAST_ACCUM_SS: SM90 gated builder for the *FP8FastAccum kernel schedules.
+template <class ElementA, class GmemLayoutA, int AlignmentA, class ElementB, class GmemLayoutB,
+          int AlignmentB, class ElementAccumulator, class TileShape_MNK, class ClusterShape_MNK,
+          class StageCountType, class KernelScheduleType,
+          template <class /* ElementCompute */> class Activation, bool SwapAB>
+struct CollectiveBuilderGated<
+    arch::Sm90, arch::OpClassTensorOp, ElementA, GmemLayoutA, AlignmentA, ElementB, GmemLayoutB,
+    AlignmentB, ElementAccumulator, TileShape_MNK, ClusterShape_MNK, StageCountType,
+    KernelScheduleType, Activation, SwapAB,
+    cute::enable_if_t<
+        cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedFP8FastAccum> ||
+        cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedPingpongFP8FastAccum> ||
+        cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedCooperativeFP8FastAccum> ||
+        cute::is_same_v<KernelScheduleType,
+                        KernelPtrArrayTmaWarpSpecializedCooperativeFP8FastAccum>>> {
+  static_assert(is_static<TileShape_MNK>::value);
+  static_assert(is_static<ClusterShape_MNK>::value);
+  static_assert(
+      detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::tma_alignment_bytes>(),
+      "Not meet TMA alignment requirement yet\n");
+  static_assert(detail::is_input_fp8<ElementA, ElementB>(),
+                "Only FP8 datatypes are compatible with these kernel schedules\n");
+  // Dispatch TN fp8 kernels only to TMA warp specialized FP8 builder
+  static_assert(!detail::is_use_rmem_A<ElementA, GmemLayoutA, ElementB, GmemLayoutB>(),
+                "Not supported for fp8 non-TN warp specialized kernels yet\n");
+#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
+  static_assert(cutlass::detail::dependent_false<ElementA>,
+                "Unsupported Toolkit for SM90 Collective Builder\n");
+#endif
+  // GMMA major mode of each operand, derived from its element type and gmem layout tag.
+  static constexpr cute::GMMA::Major GmmaMajorA =
+      detail::gmma_ss_tag_to_major_A<ElementA, GmemLayoutA>();
+  static constexpr cute::GMMA::Major GmmaMajorB =
+      detail::gmma_ss_tag_to_major_B<ElementB, GmemLayoutB>();
+  // Atom layout is 2x1x1 for the cooperative and pointer-array schedules, 1x1x1 otherwise.
+  static constexpr bool IsArrayOfPointersGemm =
+      (cute::is_same_v<KernelScheduleType,
+                       KernelPtrArrayTmaWarpSpecializedCooperativeFP8FastAccum>);
+  using AtomLayoutMNK = cute::conditional_t<
+      cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedCooperativeFP8FastAccum> ||
+          IsArrayOfPointersGemm,
+      Layout<Shape<_2, _1, _1>>, Layout<Shape<_1, _1, _1>>>;
+  // Tiled MMA built from the op that ss_op_selector returns for these types, tile and majors.
+  using TiledMma = decltype(cute::make_tiled_mma(
+      cute::GMMA::ss_op_selector<ElementA, ElementB, ElementAccumulator, TileShape_MNK, GmmaMajorA,
+                                 GmmaMajorB>(),
+      AtomLayoutMNK{}));
+  // TMA copy atoms: A's is selected from cluster mode 1, B's from cluster mode 0.
+  using GmemTiledCopyA =
+      decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{})));
+  using GmemTiledCopyB =
+      decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{})));
+  // Smem layout atoms: A is selected from tile modes 0 and 2, B from tile modes 1 and 2.
+  using SmemLayoutAtomA =
+      decltype(detail::ss_smem_selector<GmmaMajorA, ElementA,
+                                        decltype(cute::get<0>(TileShape_MNK{})),
+                                        decltype(cute::get<2>(TileShape_MNK{}))>());
+  using SmemLayoutAtomB =
+      decltype(detail::ss_smem_selector<GmmaMajorB, ElementB,
+                                        decltype(cute::get<1>(TileShape_MNK{})),
+                                        decltype(cute::get<2>(TileShape_MNK{}))>());
+  // Stage count comes from the gated helper; the pointer-array schedule gets the array mainloop.
+  static constexpr int PipelineStages =
+      detail::compute_stage_count_or_override_gated<detail::sm90_smem_capacity_bytes, ElementA,
+                                                    ElementB, TileShape_MNK, SwapAB>(
+          StageCountType{});
+  using DispatchPolicy = cute::conditional_t<
+      IsArrayOfPointersGemm,
+      MainloopSm90ArrayTmaGmmaWarpSpecialized<PipelineStages, ClusterShape_MNK, KernelScheduleType>,
+      MainloopSm90TmaGmmaWarpSpecialized<PipelineStages, ClusterShape_MNK, KernelScheduleType>>;
+  // No smem copy atom is supplied for either operand.
+  using SmemCopyAtomA = void;
+  using SmemCopyAtomB = void;
+  // Resulting gated collective; Activation and SwapAB are forwarded as its last two arguments.
+  using CollectiveOp =
+      CollectiveMmaGated<DispatchPolicy, TileShape_MNK, ElementA, TagToStrideA_t<GmemLayoutA>,
+                         ElementB, TagToStrideB_t<GmemLayoutB>, TiledMma, GmemTiledCopyA,
+                         SmemLayoutAtomA, SmemCopyAtomA, cute::identity, GmemTiledCopyB,
+                         SmemLayoutAtomB, SmemCopyAtomB, cute::identity, Activation, SwapAB>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/builders/sm90_gmma_builder_interleaved.inl b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/builders/sm90_gmma_builder_interleaved.inl
new file mode 100644
index 000000000..8d4710fdd
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/builders/sm90_gmma_builder_interleaved.inl
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "cutlass/arch/mma.h"
+#include "cutlass/gemm/collective/builders/sm90_common.inl"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/gemm.h"
+
+// SM90 Collective Builders should be used only starting CUDA 12.0
+#if (__CUDACC_VER_MAJOR__ >= 12)
+#define CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA_TMA_WS_RS mixed scaled GEMM: SM90 builder that produces a CollectiveMmaInterleaved.
+template <class ElementPairA_, class GmemLayoutATag_, int AlignmentA, class ElementPairB_,
+          class GmemLayoutBTag_, int AlignmentB, class ElementAccumulator, class TileShape_MNK,
+          class ClusterShape_MNK, class StageCountType, class KernelScheduleType>
+struct CollectiveBuilderInterleaved<
+    arch::Sm90, arch::OpClassTensorOp, ElementPairA_, GmemLayoutATag_, AlignmentA, ElementPairB_,
+    GmemLayoutBTag_, AlignmentB, ElementAccumulator, TileShape_MNK, ClusterShape_MNK,
+    StageCountType, KernelScheduleType,
+    cute::enable_if_t<(cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecialized> ||
+                       cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedPingpong> ||
+                       cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedCooperative>)>> {
+ private:
+  using ScaleA = detail::deduce_mixed_width_dtype_t<1, ElementPairA_>;
+  using ScaleB = detail::deduce_mixed_width_dtype_t<1, ElementPairB_>;
+  using ZeroA = detail::deduce_mixed_width_dtype_t<2, ElementPairA_>;
+  using ZeroB = detail::deduce_mixed_width_dtype_t<2, ElementPairB_>;
+  static constexpr bool NeitherIsTuple =
+      !cute::is_tuple<ElementPairA_>::value && !cute::is_tuple<ElementPairB_>::value;
+  // deduce_mixed_width_dtype_t: index 0 names the element, 1 the scale and 2 the zero type.
+ public:
+  using ElementA = detail::deduce_mixed_width_dtype_t<0, ElementPairA_>;
+  using ElementB = detail::deduce_mixed_width_dtype_t<0, ElementPairB_>;
+  static_assert(cute::is_tuple<ElementPairA_>::value ^ cute::is_tuple<ElementPairB_>::value ||
+                    (NeitherIsTuple &&
+                     (sizeof_bits<ElementA>::value != sizeof_bits<ElementB>::value)),
+                "Either A OR B must be a tuple or the widths of A and B must be different.");
+  // A counts as the narrow operand when its element has fewer bits than B's.
+  static constexpr bool IsANarrow = sizeof_bits<ElementA>::value < sizeof_bits<ElementB>::value;
+
+  using GmemLayoutATag = GmemLayoutATag_;
+  using GmemLayoutBTag = GmemLayoutBTag_;
+  // When neither input is a tuple, the narrower element is wrapped in a one-element tuple.
+  using ElementPairA =
+      cute::conditional_t<IsANarrow && NeitherIsTuple, cute::tuple<ElementA>, ElementPairA_>;
+  using ElementPairB =
+      cute::conditional_t<!IsANarrow && NeitherIsTuple, cute::tuple<ElementB>, ElementPairB_>;
+  // The tuple-typed operand is the transformed one; scale/zero types are taken from its pair.
+  static constexpr bool IsATransformed = cute::is_tuple<ElementPairA>::value;
+  using ElementScale = cute::conditional_t<IsATransformed, ScaleA, ScaleB>;
+  using ElementZero = cute::conditional_t<IsATransformed, ZeroA, ZeroB>;
+
+  static_assert(is_static<TileShape_MNK>::value);
+  static_assert(is_static<ClusterShape_MNK>::value);
+  static_assert(
+      detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::tma_alignment_bytes>(),
+      "Should meet TMA alignment requirement\n");
+#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
+  static_assert(cutlass::detail::dependent_false<ElementA>,
+                "Unsupported Toolkit for SM90 Collective Builder\n");
+#endif
+  static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_rs_tag_to_major_A<GmemLayoutATag>();
+  static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_rs_tag_to_major_B<GmemLayoutBTag>();
+  static constexpr bool IsWarpSpecializedTransposeB =
+      detail::is_warpspecialized_transpose_B<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag,
+                                             KernelScheduleType>();
+  static_assert(!IsWarpSpecializedTransposeB, "Mixed input GEMM does not support WS transpose B.");
+
+  // If A is scaled, then we don't need to swap. Otherwise, we must ensure B goes to RF and we must
+  // swap the operands.
+  static constexpr bool SwapAB = !IsATransformed;
+
+  // When we relax the above assertion, we must handle setting the tile mma GmmaMajorB correctly.
+  static constexpr cute::GMMA::Major TiledMmaGmmaMajorB = SwapAB ? GmmaMajorA : GmmaMajorB;
+  // Both MMA operands use the element type of the operand that is not transformed.
+  using ElementMma = cute::conditional_t<IsATransformed, ElementB, ElementA>;
+  using AtomLayoutMNK =
+      cute::conditional_t<cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedCooperative>,
+                          Layout<Shape<_2, _1, _1>>, Layout<Shape<_1, _1, _1>>>;
+  // rs_op_selector is given K-major for A and TiledMmaGmmaMajorB for B.
+  using TiledMma = decltype(cute::make_tiled_mma(
+      cute::GMMA::rs_op_selector<ElementMma, ElementMma, ElementAccumulator, TileShape_MNK,
+                                 GMMA::Major::K, TiledMmaGmmaMajorB>(),
+      AtomLayoutMNK{}));
+  // TMA copy atoms: A's is selected from cluster mode 1, B's from cluster mode 0.
+  using GmemTiledCopyA =
+      decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{})));
+  using GmemTiledCopyB =
+      decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{})));
+  // Smem layout atoms: A is selected from tile modes 0 and 2, B from tile modes 1 and 2.
+  using SmemLayoutAtomA =
+      decltype(detail::rs_smem_selector<
+               GmmaMajorA, ElementA, decltype(cute::get<0>(TileShape_MNK{})),
+               decltype(cute::get<2>(TileShape_MNK{})), IsWarpSpecializedTransposeB>());
+  using SmemLayoutAtomB =
+      decltype(detail::rs_smem_selector<
+               GmmaMajorB, ElementB, decltype(cute::get<1>(TileShape_MNK{})),
+               decltype(cute::get<2>(TileShape_MNK{})), IsWarpSpecializedTransposeB>());
+  // Operand element types in post-swap order, as handed to the stage-count helper.
+  using RealElementA = cute::conditional_t<SwapAB, ElementB, ElementA>;
+  using RealElementB = cute::conditional_t<SwapAB, ElementA, ElementB>;
+  static constexpr int PipelineStages =
+      detail::compute_stage_count_or_override_single_affine_transformed_input<
+          detail::sm90_smem_capacity_bytes, RealElementA, RealElementB, ElementScale, ElementZero,
+          TileShape_MNK>(StageCountType{});
+  // Only the transformed operand (A if not swapped, B if swapped) gets a smem copy atom.
+  using SmemCopyAtomA =
+      cute::conditional_t<SwapAB, void, Copy_Atom<cute::AutoVectorizingCopy, ElementA>>;
+  using SmemCopyAtomB =
+      cute::conditional_t<SwapAB, Copy_Atom<cute::AutoVectorizingCopy, ElementB>, void>;
+  // This builder always dispatches to the RmemA mixed-input warp-specialized mainloop policy.
+  using DispatchPolicy =
+      MainloopSm90TmaGmmaRmemAWarpSpecializedMixedInput<PipelineStages, ClusterShape_MNK,
+                                                        KernelScheduleType>;
+
+  // We pack the scale data with the operand that will be optionally scaled and converted before
+  // MMA.
+  using StrideA = TagToStrideA_t<GmemLayoutATag>;
+  using StrideB = TagToStrideB_t<GmemLayoutBTag>;
+
+  using CollectiveOp =
+      CollectiveMmaInterleaved<DispatchPolicy, TileShape_MNK, ElementPairA, StrideA, ElementPairB,
+                               StrideB, TiledMma, GmemTiledCopyA, SmemLayoutAtomA, SmemCopyAtomA,
+                               cute::identity, GmemTiledCopyB, SmemLayoutAtomB, SmemCopyAtomB,
+                               cute::identity>;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/builders/sm90_gmma_builder_mixed_input.inl b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/builders/sm90_gmma_builder_mixed_input.inl
new file mode 100644
index 000000000..b5acd12e2
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/builders/sm90_gmma_builder_mixed_input.inl
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "cute/arch/cluster_sm90.hpp"
+#include "cute/tensor.hpp"
+#include "cutlass/gemm/collective/builders/sm90_common.inl"
+#include "cutlass/gemm/collective/collective_builder_decl.hpp"
+#include "cutlass/gemm/collective/collective_mma_decl.hpp"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/pipeline/sm90_pipeline.hpp"
+
+// SM90 Collective Builders should be used only starting CUDA 12.0
+#if (__CUDACC_VER_MAJOR__ >= 12)
+#define CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
+#endif
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// GMMA_TMA_WS_RS: SM90 builder for is_use_rmem_A cases, tuple inputs, or differing bit widths.
+template <class ElementA_, class GmemLayoutATag_, int AlignmentA, class ElementB_,
+          class GmemLayoutBTag_, int AlignmentB, class ElementAccumulator, class TileShape_MNK,
+          class ClusterShape_MNK, class StageCountType, class KernelScheduleType>
+struct CollectiveBuilderMixedInput<
+    arch::Sm90, arch::OpClassTensorOp, ElementA_, GmemLayoutATag_, AlignmentA, ElementB_,
+    GmemLayoutBTag_, AlignmentB, ElementAccumulator, TileShape_MNK, ClusterShape_MNK,
+    StageCountType, KernelScheduleType,
+    cute::enable_if_t<
+        (cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecialized> ||
+         cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedPingpong> ||
+         cute::is_same_v<KernelScheduleType, KernelTmaWarpSpecializedCooperative> ||
+         cute::is_same_v<KernelScheduleType, KernelPtrArrayTmaWarpSpecializedCooperative> ||
+         cute::is_same_v<KernelScheduleType, KernelPtrArrayTmaWarpSpecializedPingpong>) &&
+        (detail::is_use_rmem_A<ElementA_, GmemLayoutATag_, ElementB_, GmemLayoutBTag_>() ||
+         // ConvertAndScale and ConvertAndScaleWithZero
+         cute::is_tuple<ElementA_>::value || cute::is_tuple<ElementB_>::value ||
+         // DirectConvert
+         sizeof_bits<ElementA_>::value != sizeof_bits<ElementB_>::value)>> {
+ private:
+  using ScaleA = detail::deduce_mixed_width_dtype_t<1, ElementA_>;
+  using ScaleB = detail::deduce_mixed_width_dtype_t<1, ElementB_>;
+  using ZeroA = detail::deduce_mixed_width_dtype_t<2, ElementA_>;
+  using ZeroB = detail::deduce_mixed_width_dtype_t<2, ElementB_>;
+  static constexpr bool NeitherIsTuple =
+      !cute::is_tuple<ElementA_>::value && !cute::is_tuple<ElementB_>::value;
+  // Determine if mixed input types.
+  static constexpr bool IsMixedInput =
+      cute::sizeof_bits_v<detail::deduce_mixed_width_dtype_t<0, ElementA_>> !=
+      cute::sizeof_bits_v<detail::deduce_mixed_width_dtype_t<0, ElementB_>>;
+  static constexpr bool IsArrayOfPointersGemm =
+      cute::is_any_of_v<KernelScheduleType, KernelPtrArrayTmaWarpSpecializedCooperative,
+                        KernelPtrArrayTmaWarpSpecializedPingpong>;
+  static_assert(IsMixedInput || !IsArrayOfPointersGemm,
+                "Only mixed input grouped RS GEMM is supported.");
+  // Pointer-array schedules are accepted only together with mixed-width inputs (assert above).
+ public:
+  using ElementA = detail::deduce_mixed_width_dtype_t<0, ElementA_>;
+  using ElementB = detail::deduce_mixed_width_dtype_t<0, ElementB_>;
+
+  static_assert(!IsMixedInput ||
+                    (cute::is_tuple<ElementA_>::value ^ cute::is_tuple<ElementB_>::value ||
+                     (NeitherIsTuple &&
+                      (sizeof_bits<ElementA>::value != sizeof_bits<ElementB>::value))),
+                "Either A OR B must be a tuple or the widths of A and B must be different.");
+  // A counts as the narrow operand when its element has fewer bits than B's.
+  static constexpr bool IsANarrow = sizeof_bits<ElementA>::value < sizeof_bits<ElementB>::value;
+  // Maps a layout (or layout pointer) to its stride (or stride pointer); else returns t as-is.
+  template <class T>
+  static auto get_stride(T const& t) {
+    if constexpr (not cute::is_layout<cute::remove_pointer_t<T>>::value) {
+      return t;
+    } else {
+      if constexpr (cute::is_pointer_v<T>) {
+        return &cute::stride(*t);
+      } else {
+        return cute::stride(t);
+      }
+    }
+  }
+  // Layout tags after get_stride, so a cute layout argument is reduced to its stride type.
+  using GmemLayoutATag = decltype(get_stride(GmemLayoutATag_{}));
+  using GmemLayoutBTag = decltype(get_stride(GmemLayoutBTag_{}));
+  // For mixed input with no tuple given, the narrower element is wrapped in a one-element tuple.
+  using ElementPairA = cute::conditional_t<IsMixedInput && IsANarrow && NeitherIsTuple,
+                                           cute::tuple<ElementA>, ElementA_>;
+  using ElementPairB = cute::conditional_t<IsMixedInput && !IsANarrow && NeitherIsTuple,
+                                           cute::tuple<ElementB>, ElementB_>;
+  // The tuple-typed operand is the transformed one; scale/zero types are taken from its pair.
+  static constexpr bool IsATransformed = cute::is_tuple<ElementPairA>::value;
+  using ElementScale = cute::conditional_t<IsATransformed, ScaleA, ScaleB>;
+  using ElementZero = cute::conditional_t<IsATransformed, ZeroA, ZeroB>;
+
+  static_assert(is_static<TileShape_MNK>::value);
+  static_assert(is_static<ClusterShape_MNK>::value);
+  static_assert(
+      detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::tma_alignment_bytes>(),
+      "Should meet TMA alignment requirement\n");
+#ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
+  static_assert(cutlass::detail::dependent_false<ElementA>,
+                "Unsupported Toolkit for SM90 Collective Builder\n");
+#endif
+  static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_rs_tag_to_major_A<GmemLayoutATag>();
+  static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_rs_tag_to_major_B<GmemLayoutBTag>();
+  // If A is scaled, then we don't need to swap. Otherwise, we must ensure B goes to rmem and we
+  // must swap the operands.
+  static constexpr bool SwapAB =
+      IsMixedInput ? !IsATransformed
+                   : detail::is_swapAB<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>();
+  static constexpr bool IsWarpSpecializedTransposeB =
+      detail::is_warpspecialized_transpose_B<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag,
+                                             KernelScheduleType>();
+  static_assert(!IsMixedInput || !IsWarpSpecializedTransposeB,
+                "Mixed input GEMM does not support WS transpose B.");
+
+  // When we relax the above assertion, we must handle setting the tile mma GmmaMajorB correctly.
+  static constexpr cute::GMMA::Major TiledMmaGmmaMajorB = SwapAB ? GmmaMajorA : GmmaMajorB;
+
+  // For fp32 types, map to tf32 MMA value type.
+  using ElementAMma = cute::conditional_t<cute::is_same_v<ElementA, float>, tfloat32_t, ElementA>;
+  using ElementBMma = cute::conditional_t<cute::is_same_v<ElementB, float>, tfloat32_t, ElementB>;
+
+  // Handle mixed dtypes and MMA.
+  using RealElementA = cute::conditional_t<SwapAB, ElementBMma, ElementAMma>;
+  using RealElementB = cute::conditional_t<SwapAB, ElementAMma, ElementBMma>;
+  using RealElementAMma = cute::conditional_t<IsMixedInput, RealElementB, RealElementA>;
+  // Always the same for element B.
+  using RealElementBMma = RealElementB;
+
+  static_assert(!IsMixedInput || TiledMmaGmmaMajorB == GMMA::Major::K ||
+                    sizeof_bits<RealElementB>::value == 16,
+                "Mixed input GEMM does not support MN major layout except for 16bit");
+  // Atom layout is 2x1x1 for the cooperative schedules (regular and ptr-array), else 1x1x1.
+  using AtomLayoutMNK =
+      cute::conditional_t<cute::is_any_of_v<KernelScheduleType, KernelTmaWarpSpecializedCooperative,
+                                            KernelPtrArrayTmaWarpSpecializedCooperative>,
+                          Layout<Shape<_2, _1, _1>>, Layout<Shape<_1, _1, _1>>>;
+  // Both operands are passed to rs_op_selector as K-major.
+  using TiledMma = decltype(cute::make_tiled_mma(
+      cute::GMMA::rs_op_selector<RealElementAMma, RealElementBMma, ElementAccumulator,
+                                 TileShape_MNK, GMMA::Major::K, GMMA::Major::K>(),
+      AtomLayoutMNK{}));
+  // TMA copy atoms: A's is selected from cluster mode 1, B's from cluster mode 0.
+  using GmemTiledCopyA =
+      decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{})));
+  using GmemTiledCopyB =
+      decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{})));
+  // Smem layout atoms: A is selected from tile modes 0 and 2, B from tile modes 1 and 2.
+  using SmemLayoutAtomA =
+      decltype(detail::rs_smem_selector<
+               GmmaMajorA, ElementAMma, decltype(cute::get<0>(TileShape_MNK{})),
+               decltype(cute::get<2>(TileShape_MNK{})), IsWarpSpecializedTransposeB>());
+  using SmemLayoutAtomB =
+      decltype(detail::rs_smem_selector<
+               GmmaMajorB, ElementBMma, decltype(cute::get<1>(TileShape_MNK{})),
+               decltype(cute::get<2>(TileShape_MNK{})), IsWarpSpecializedTransposeB>());
+  // Smem alignment is the larger of the two swizzle-derived alignments.
+  static constexpr size_t SmemAlignmentA =
+      cutlass::detail::alignment_for_swizzle(SmemLayoutAtomA{});
+  static constexpr size_t SmemAlignmentB =
+      cutlass::detail::alignment_for_swizzle(SmemLayoutAtomB{});
+  static constexpr int SmemAlignment = static_cast<int>(cute::max(SmemAlignmentA, SmemAlignmentB));
+
+  // Carve-out of four TmaDescriptors (zero when not mixed input); used for ptr-array GEMMs only.
+  static constexpr size_t TensorMapStorage = sizeof(cute::TmaDescriptor) * size_t(IsMixedInput) * 4;
+  static constexpr int KernelSmemCarveout = static_cast<int>(TensorMapStorage);
+  static constexpr int Sm90ReducedSmemCapacityBytes =
+      detail::sm90_smem_capacity_bytes - KernelSmemCarveout;
+
+  static constexpr int PipelineStages =
+      IsMixedInput
+          ? (IsArrayOfPointersGemm
+                 ? detail::compute_stage_count_or_override_single_affine_transformed_input<
+                       Sm90ReducedSmemCapacityBytes, RealElementA, RealElementB, ElementScale,
+                       ElementZero, TileShape_MNK, StageCountType::bytes, SmemAlignment>(
+                       StageCountType{})
+                 : detail::compute_stage_count_or_override_single_affine_transformed_input<
+                       detail::sm90_smem_capacity_bytes, RealElementA, RealElementB, ElementScale,
+                       ElementZero, TileShape_MNK, StageCountType::bytes, SmemAlignment>(
+                       StageCountType{}))
+          : detail::compute_stage_count_or_override<detail::sm90_smem_capacity_bytes, ElementAMma,
+                                                    ElementBMma, TileShape_MNK,
+                                                    StageCountType::bytes, SmemAlignment>(
+                StageCountType{});
+  // Mainloop policy: array mixed-input, RmemA mixed-input, or plain RmemA when not mixed.
+  using DispatchPolicy = cute::conditional_t<
+      IsMixedInput,
+      cute::conditional_t<IsArrayOfPointersGemm,
+                          MainloopSm90ArrayTmaGmmaWarpSpecializedMixedInput<
+                              PipelineStages, ClusterShape_MNK, KernelScheduleType>,
+                          MainloopSm90TmaGmmaRmemAWarpSpecializedMixedInput<
+                              PipelineStages, ClusterShape_MNK, KernelScheduleType>>,
+      MainloopSm90TmaGmmaRmemAWarpSpecialized<PipelineStages, ClusterShape_MNK,
+                                              KernelScheduleType>>;
+  // The smem copy atom goes to A when not swapped and to B when swapped; the other is void.
+  using SmemCopyAtomA =
+      cute::conditional_t<SwapAB, void, Copy_Atom<cute::AutoVectorizingCopy, ElementA>>;
+  using SmemCopyAtomB =
+      cute::conditional_t<SwapAB, Copy_Atom<cute::AutoVectorizingCopy, ElementB>, void>;
+
+  // We pack the scale data with the operand that will be optionally scaled and converted before
+  // MMA.
+  using StrideA =
+      cute::conditional_t<cute::is_layout<cute::remove_pointer_t<GmemLayoutATag_>>::value,
+                          GmemLayoutATag_, TagToStrideA_t<GmemLayoutATag>>;
+  using StrideB =
+      cute::conditional_t<cute::is_layout<cute::remove_pointer_t<GmemLayoutBTag_>>::value,
+                          GmemLayoutBTag_, TagToStrideB_t<GmemLayoutBTag>>;
+  // Note: CollectiveMmaArrayMixedInput is used for every dispatch policy selected above.
+  using CollectiveOp =
+      CollectiveMmaArrayMixedInput<DispatchPolicy, TileShape_MNK, ElementPairA, StrideA,
+                                   ElementPairB, StrideB, TiledMma, GmemTiledCopyA, SmemLayoutAtomA,
+                                   SmemCopyAtomA, cute::identity, GmemTiledCopyB, SmemLayoutAtomB,
+                                   SmemCopyAtomB, cute::identity>;
+  // The alignment used for stage counting must equal what the collective itself reports.
+  static_assert(SmemAlignment == static_cast<int>(cute::max(CollectiveOp::SmemAlignmentA,
+                                                            CollectiveOp::SmemAlignmentB)));
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/collective_builder_gated.hpp b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/collective_builder_gated.hpp
new file mode 100644
index 000000000..61c9ddbce
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/collective_builder_gated.hpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass_extensions/gemm/collective/collective_mma_gated.hpp"
+
+namespace cutlass::gemm::collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <class ArchTag, class OpClass, class ElementA, class GmemLayoutA, int AlignmentA,
+          class ElementB, class GmemLayoutB, int AlignmentB, class ElementAccumulator,
+          class TileShape_MNK, class ClusterShape_MNK, class StageCountType,
+          class KernelScheduleType, template <class /* ElementCompute */> class Activation,
+          bool SwapAB = false, class Enable = void>
+struct CollectiveBuilderGated {  // Fallback when no specialization matches; always fails.
+  static_assert(sizeof(ElementA) == 0, "Could not build a collective for given parameters.");
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "cutlass_extensions/gemm/collective/builders/sm90_gmma_builder_gated.inl"
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/collective_builder_interleaved.hpp b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/collective_builder_interleaved.hpp
new file mode 100644
index 000000000..23c9f5bed
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/collective_builder_interleaved.hpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass_extensions/gemm/collective/collective_mma_interleaved.hpp"
+
+namespace cutlass::gemm::collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <class ArchTag, class OpClass, class ElementA, class GmemLayoutA, int AlignmentA,
+          class ElementB, class GmemLayoutB, int AlignmentB, class ElementAccumulator,
+          class TileShape_MNK, class ClusterShape_MNK, class StageCountType,
+          class KernelScheduleType, class Enable = void>
+struct CollectiveBuilderInterleaved {  // Fallback when no specialization matches; always fails.
+  static_assert(sizeof(ElementA) == 0, "Could not build a collective for given parameters.");
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "cutlass_extensions/gemm/collective/builders/sm90_gmma_builder_interleaved.inl"
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/collective_builder_mixed_input.hpp b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/collective_builder_mixed_input.hpp
new file mode 100644
index 000000000..e3e3f3459
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/collective_builder_mixed_input.hpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass_extensions/gemm/collective/collective_mma_array_mixed_input.hpp"
+
+namespace cutlass::gemm::collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <class ArchTag, class OpClass, class ElementA, class GmemLayoutA, int AlignmentA,
+          class ElementB, class GmemLayoutB, int AlignmentB, class ElementAccumulator,
+          class TileShape_MNK, class ClusterShape_MNK, class StageCountType,
+          class KernelScheduleType, class Enable = void>
+struct CollectiveBuilderMixedInput {  // Fallback when no specialization matches; always fails.
+  static_assert(sizeof(ElementA) == 0, "Could not build a collective for given parameters.");
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "cutlass_extensions/gemm/collective/builders/sm90_gmma_builder_mixed_input.inl"
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/collective_mma_array_mixed_input.hpp b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/collective_mma_array_mixed_input.hpp
new file mode 100644
index 000000000..7ca25def0
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/collective_mma_array_mixed_input.hpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "cutlass/detail/dependent_false.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <class DispatchPolicy, class TileShape, class ElementA, class StrideA, class ElementB,
+          class StrideB, class TiledMma, class GmemTiledCopyA, class SmemLayoutAtomA,
+          class SmemCopyAtomA, class TransformA, class GmemTiledCopyB, class SmemLayoutAtomB,
+          class SmemCopyAtomB, class TransformB>
+struct CollectiveMmaArrayMixedInput {  // Primary template; the assert below rejects its use.
+  static_assert(cutlass::detail::dependent_false<ElementA>,
+                "Could not find a mainloop specialization.");
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp"
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/collective_mma_gated.hpp b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/collective_mma_gated.hpp
new file mode 100644
index 000000000..94e82be03
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/collective_mma_gated.hpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "cutlass/detail/dependent_false.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <class DispatchPolicy, class TileShape, class ElementA, class StrideA, class ElementB,
+          class StrideB, class TiledMma, class GmemTiledCopyA, class SmemLayoutAtomA,
+          class SmemCopyAtomA, class TransformA, class GmemTiledCopyB, class SmemLayoutAtomB,
+          class SmemCopyAtomB, class TransformB,
+          template <class /* ElementCompute */> class Activation, bool SwapAB = false>
+struct CollectiveMmaGated {  // Primary template; the assert below rejects its use.
+  static_assert(cutlass::detail::dependent_false<ElementA>,
+                "Could not find a mainloop specialization.");
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "cutlass_extensions/gemm/collective/sm90_mma_gated_tma_gmma_ss_warpspecialized.hpp"
+#include "cutlass_extensions/gemm/collective/sm90_mma_gated_tma_gmma_ss_warpspecialized_fp8.hpp"
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/collective_mma_interleaved.hpp b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/collective_mma_interleaved.hpp
new file mode 100644
index 000000000..3f3266c8e
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/collective_mma_interleaved.hpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "cutlass/detail/dependent_false.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <class DispatchPolicy, class TileShape, class ElementA, class StrideA, class ElementB,
+          class StrideB, class TiledMma, class GmemTiledCopyA, class SmemLayoutAtomA,
+          class SmemCopyAtomA, class TransformA, class GmemTiledCopyB, class SmemLayoutAtomB,
+          class SmemCopyAtomB, class TransformB>
+struct CollectiveMmaInterleaved {  // Primary template; the assert below rejects its use.
+  static_assert(cutlass::detail::dependent_false<ElementA>,
+                "Could not find a mainloop specialization.");
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "cutlass_extensions/gemm/collective/sm90_mma_interleaved_tma_gmma_rs_warpspecialized_mixed_input.hpp"
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp
new file mode 100644
index 000000000..6964886a8
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input_.hpp
@@ -0,0 +1,1474 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "cute/algorithm/functional.hpp"
+#include "cute/algorithm/gemm.hpp"
+#include "cute/arch/cluster_sm90.hpp"
+#include "cute/arch/copy_sm90.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "cute/numeric/arithmetic_tuple.hpp"
+#include "cutlass/cuda_host_adapter.hpp"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/numeric_types.h"
+#include "cutlass/pipeline/pipeline.hpp"
+#include "cutlass/trace.h"
+#include "cutlass_extensions/detail/collective/mixed_input_utils.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+using namespace cute;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// WarpSpecialized Mainloop
+template <int Stages, class ClusterShape, class KernelSchedule_, class TileShape_,
+          class ElementAOptionalTuple, class StrideA_, class ElementBOptionalTuple, class StrideB_,
+          class TiledMma_, class GmemTiledCopyA_, class SmemLayoutAtomA_, class SmemCopyAtomA_,
+          class TransformA_, class GmemTiledCopyB_, class SmemLayoutAtomB_, class SmemCopyAtomB_,
+          class TransformB_>
+struct CollectiveMmaArrayMixedInput<
+    MainloopSm90ArrayTmaGmmaWarpSpecializedMixedInput<Stages, ClusterShape, KernelSchedule_>,
+    TileShape_, ElementAOptionalTuple, StrideA_, ElementBOptionalTuple, StrideB_, TiledMma_,
+    GmemTiledCopyA_, SmemLayoutAtomA_, SmemCopyAtomA_, TransformA_, GmemTiledCopyB_,
+    SmemLayoutAtomB_, SmemCopyAtomB_, TransformB_> {
+ public:
+  enum class ConversionMode { DirectConvert, ConvertAndScale, ConvertAndScaleWithZero };
+
+  //
+  // Type Aliases
+  //
+  using DispatchPolicy =
+      MainloopSm90ArrayTmaGmmaWarpSpecializedMixedInput<Stages, ClusterShape, KernelSchedule_>;
+  using TileShape = TileShape_;
+  using KernelSchedule = KernelSchedule_;
+
+ private:
+  template <class T>
+  friend struct detail::MixedGroupedGemmInputUtils;
+  using CollectiveType =
+      CollectiveMmaArrayMixedInput<DispatchPolicy, TileShape_, ElementAOptionalTuple, StrideA_,
+                                   ElementBOptionalTuple, StrideB_, TiledMma_, GmemTiledCopyA_,
+                                   SmemLayoutAtomA_, SmemCopyAtomA_, TransformA_, GmemTiledCopyB_,
+                                   SmemLayoutAtomB_, SmemCopyAtomB_, TransformB_>;
+  using Utils = detail::MixedGroupedGemmInputUtils<CollectiveType>;
+
+  //
+  // Type Aliases
+  //
+  using ScaleA = detail::deduce_mixed_width_dtype_t<1, ElementAOptionalTuple>;
+  using ScaleB = detail::deduce_mixed_width_dtype_t<1, ElementBOptionalTuple>;
+  using ZeroA = detail::deduce_mixed_width_dtype_t<2, ElementAOptionalTuple>;
+  using ZeroB = detail::deduce_mixed_width_dtype_t<2, ElementBOptionalTuple>;
+
+ public:
+  static_assert(cute::is_tuple<ElementAOptionalTuple>::value ^  // XOR: exactly one is a tuple
+                    cute::is_tuple<ElementBOptionalTuple>::value,
+                "Either A OR B must be a tuple. It must take the form {ElementOperand, "
+                "[ElementScale], [ElementZero]}. Inputs "
+                "in [] are optional.");
+
+  using ElementA = detail::deduce_mixed_width_dtype_t<0, ElementAOptionalTuple>;
+  using ElementB = detail::deduce_mixed_width_dtype_t<0, ElementBOptionalTuple>;
+  static constexpr bool IsATransformed = cute::is_tuple<ElementAOptionalTuple>::value;
+  using ElementScale = cute::conditional_t<IsATransformed, ScaleA, ScaleB>;
+  using ElementZero = cute::conditional_t<IsATransformed, ZeroA, ZeroB>;
+  // For cases where we can't have a void type, we can use this to allow the code to compile when
+  // the scale / zero is void.
+  using NonVoidElementScale =
+      cute::conditional_t<cute::is_void_v<ElementScale>, float, ElementScale>;
+  using NonVoidElementZero = cute::conditional_t<cute::is_void_v<ElementZero>, float, ElementZero>;
+
+  using StrideA = StrideA_;
+  using InternalStrideA = cute::remove_pointer_t<StrideA>;
+  using StrideB = StrideB_;
+  using InternalStrideB = cute::remove_pointer_t<StrideB>;
+
+  using StrideScale = cute::Stride<cute::Int<1>, int64_t, int64_t>;
+  using NonVoidStrideScale = cute::conditional_t<cute::is_void_v<StrideScale>,
+                                                 cute::Stride<_1, int64_t, int64_t>, StrideScale>;
+
+  static_assert(
+      (IsATransformed && (cutlass::gemm::detail::is_k_major<StrideA>() ||
+                          is_layout<StrideA>::value || is_layout<InternalStrideA>::value)) ||
+          (!IsATransformed && (cutlass::gemm::detail::is_k_major<StrideB>() ||
+                               is_layout<StrideB>::value || is_layout<InternalStrideB>::value)),
+      "The transformed type must be K-major.");
+
+  static_assert((IsATransformed && (sizeof(ElementB) == 2)) ||
+                    (!IsATransformed && (sizeof(ElementA) == 2)) ||
+                    ((cutlass::gemm::detail::is_k_major<StrideA>() || is_layout<StrideA>::value ||
+                      is_layout<InternalStrideA>::value) &&
+                     (cutlass::gemm::detail::is_k_major<StrideB>() || is_layout<StrideB>::value ||
+                      is_layout<InternalStrideB>::value)),
+                "The unscaled element must be 2 bytes OR both inputs must be K-major");
+
+  static_assert(cutlass::gemm::detail::is_mn_major<NonVoidStrideScale>(),
+                "Scale must be MN major [Col Major if A is scaled, Row Major if B is scaled].");
+
+  static constexpr bool IsMXFP4 = cute::is_same_v<ElementA, cutlass::float_e2m1_t>;
+  // Group size 128 for int4 weights
+  // Group size 32 for mxfp4 weights
+  static constexpr int ScalingGroupSize =
+      IsMXFP4 ? detail::mxfp4_group_size : detail::int4_group_size;
+
+  using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{}));
+  using TiledMma = TiledMma_;
+  using ElementAccumulator = typename TiledMma::ValTypeC;
+  using GmemTiledCopyA = GmemTiledCopyA_;
+  using GmemTiledCopyB = GmemTiledCopyB_;
+  using GmemTiledCopyScale = cute::SM90_TMA_LOAD;
+  using SmemLayoutAtomA = SmemLayoutAtomA_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  using SmemCopyAtomA = SmemCopyAtomA_;
+  using SmemCopyAtomB = SmemCopyAtomB_;
+  using SmemCopyAtomScale = Copy_Atom<cute::AutoVectorizingCopy, NonVoidElementScale>;
+
+  // We must ensure the type to be scaled goes to RF
+  static constexpr bool SwapAB = !IsATransformed;
+  using SwappedStrideA = cute::conditional_t<!SwapAB, StrideA, StrideB>;
+  using SwappedStrideB = cute::conditional_t<!SwapAB, StrideB, StrideA>;
+  using InternalSwappedStrideA = cute::conditional_t<!SwapAB, InternalStrideA, InternalStrideB>;
+  using InternalSwappedStrideB = cute::conditional_t<!SwapAB, InternalStrideB, InternalStrideA>;
+  using SwappedSmemLayoutAtomA = cute::conditional_t<!SwapAB, SmemLayoutAtomA, SmemLayoutAtomB>;
+  using SwappedSmemLayoutAtomB = cute::conditional_t<!SwapAB, SmemLayoutAtomB, SmemLayoutAtomA>;
+  using SwappedSmemCopyAtomA = cute::conditional_t<!SwapAB, SmemCopyAtomA, SmemCopyAtomB>;
+  using SwappedSmemCopyAtomB = cute::conditional_t<!SwapAB, SmemCopyAtomB, SmemCopyAtomA>;
+  // TMA converts f32 input to tf32 when copying from GMEM to SMEM
+  // For all other types, cast to size equivalent uint type to avoid any rounding by TMA.
+  static constexpr bool ConvertF32toTF32A = cute::is_same_v<float, ElementA>;
+  static constexpr bool ConvertF32toTF32B = cute::is_same_v<float, ElementB>;
+  using ConvertedElementA =
+      cute::conditional_t<ConvertF32toTF32A, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementA>>>;
+  using ConvertedElementB =
+      cute::conditional_t<ConvertF32toTF32B, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementB>>>;
+  using RealSwappedElementA = cute::conditional_t<!SwapAB, ElementA, ElementB>;
+  using RealSwappedElementB = cute::conditional_t<!SwapAB, ElementB, ElementA>;
+  using SwappedElementA = cute::conditional_t<!SwapAB, ConvertedElementA, ConvertedElementB>;
+  using SwappedElementB = cute::conditional_t<!SwapAB, ConvertedElementB, ConvertedElementA>;
+
+  using TransformA = TransformA_;
+  using TransformB = TransformB_;
+  using SwappedTransformA = cute::conditional_t<!SwapAB, TransformA, TransformB>;
+  using SwappedTransformB = cute::conditional_t<!SwapAB, TransformB, TransformA>;
+  using ArchTag = typename DispatchPolicy::ArchTag;
+
+  static constexpr int IsSubbyteA = cute::sizeof_bits_v<SwappedElementA> < 8;
+  using TmaElementA = cute::conditional_t<IsSubbyteA, uint8_t, SwappedElementA>;
+  using TmaElementScale =
+      uint_bit_t<sizeof_bits_v<NonVoidElementScale>>;  // in case we have array. translating to uint
+                                                       // to satisfy tma descriptor's specialization
+
+  using MainloopPipeline = cutlass::PipelineTmaAsync<DispatchPolicy::Stages>;
+  using PipelineState = cutlass::PipelineState<DispatchPolicy::Stages>;
+  using PipelineParams = typename MainloopPipeline::Params;
+
+  static constexpr int NumProducerThreadEvents = 1;
+
+  using SmemLayoutAtomScale =
+      Layout<Shape<decltype(cute::shape<0>(SwappedSmemLayoutAtomA{})), cute::Int<1>>>;
+  using ScaleTileShape =
+      decltype(make_shape(shape<0>(TileShape{}), shape<1>(SmemLayoutAtomScale{})));
+
+  static_assert(cute::rank(SwappedSmemLayoutAtomA{}) == 2,
+                "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<0>(TileShape{}) % size<0>(SwappedSmemLayoutAtomA{})) == 0,
+                "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SwappedSmemLayoutAtomA{})) == 0,
+                "SmemLayoutAtom must evenly divide tile shape.");
+
+  static_assert(cute::rank(SwappedSmemLayoutAtomB{}) == 2,
+                "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<1>(TileShape{}) % size<0>(SwappedSmemLayoutAtomB{})) == 0,
+                "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SwappedSmemLayoutAtomB{})) == 0,
+                "SmemLayoutAtom must evenly divide tile shape.");
+
+  static_assert(rank(SmemLayoutAtomScale{}) == 2, "SmemLayoutAtomScale must be rank 2");
+  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomScale{})) == 0,
+                "SmemLayoutAtomScale must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomScale{})) == 0,
+                "SmemLayoutAtomScale must evenly divide tile k shape.");
+
+  /// Tile along modes in a way that maximizes the TMA box size.
+  using SmemLayoutA = decltype(detail::get_smem_layout<DispatchPolicy::Stages>(
+      SwappedSmemLayoutAtomA{}, select<0, 2>(TileShape{}), InternalSwappedStrideA{}));
+  using SmemLayoutB = decltype(detail::get_smem_layout<DispatchPolicy::Stages>(
+      SwappedSmemLayoutAtomB{}, select<1, 2>(TileShape{}), InternalSwappedStrideB{}));
+
+  // It is assumed that the scales and zero-points share the same smem layout
+  using SmemLayoutScale = decltype(tile_to_shape(
+      SmemLayoutAtomScale{},
+      make_shape(shape<0>(ScaleTileShape{}), shape<1>(ScaleTileShape{}), Int<Stages>{}),
+      cute::conditional_t<::cutlass::gemm::detail::is_major<0, NonVoidStrideScale>(),
+                          Step<_2, _1, _3>, Step<_1, _2, _3>>{}));
+
+  static_assert(DispatchPolicy::Stages >= 2,
+                "Specialization requires Stages set to value 2 or more.");
+  static_assert(
+      not cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
+          cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
+      "MMA atom must source A from rmem and B operand from smem_desc for this mainloop.");
+  static_assert(cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD> ||
+                    cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>,
+                "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
+  static_assert(cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD> ||
+                    cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>,
+                "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
+
+  // To relax the constraint below, we need to handle loading more than 1 row of scales for every
+  // main loop iteration. We must also handle updating the pipeline transaction bytes on the fly.
+  static_assert(size<1>(SmemLayoutAtomScale{}) == 1, "size<1>(SmemLayoutAtomScale) must be 1.");
+
+ private:
+  // Maps the (scale, zero) element types onto a ConversionMode: a void scale means plain
+  // conversion, a void zero means scale-only, otherwise scale with zero-point.
+  static constexpr ConversionMode get_conversion_mode() {
+    constexpr bool ScaleIsVoid = cute::is_void_v<ElementScale>;
+    constexpr bool ZeroIsVoid = cute::is_void_v<ElementZero>;
+    return ScaleIsVoid  ? ConversionMode::DirectConvert
+           : ZeroIsVoid ? ConversionMode::ConvertAndScale
+                        : ConversionMode::ConvertAndScaleWithZero;
+  }
+
+ public:
+  static constexpr ConversionMode KernelConversionMode = get_conversion_mode();
+  static constexpr bool ModeHasScales =
+      KernelConversionMode == ConversionMode::ConvertAndScale ||
+      KernelConversionMode == ConversionMode::ConvertAndScaleWithZero;
+  static constexpr bool UseScaleLookupTable =
+      KernelConversionMode == ConversionMode::ConvertAndScale &&
+      cutlass::detail::is_Array_v<ElementScale>;
+  static constexpr bool UseFP4ToBF16LookupTable =
+      KernelConversionMode == ConversionMode::ConvertAndScale &&
+      cute::is_same_v<ElementA, cutlass::float_e2m1_t> &&
+      cute::is_same_v<ElementB, cutlass::bfloat16_t>;
+  static constexpr size_t SmemAlignmentA = cutlass::detail::alignment_for_swizzle(SmemLayoutA{});
+  static constexpr size_t SmemAlignmentB = cutlass::detail::alignment_for_swizzle(SmemLayoutB{});
+  static constexpr size_t SmemAlignmentScale = cute::max(SmemAlignmentA, SmemAlignmentB);
+
+  static_assert(SmemAlignmentA >= 128 and SmemAlignmentB >= 128, "Require at least 128B alignment");
+
+  struct SharedStorage {  // Operand, scale and zero buffers plus mainloop pipeline storage.
+    static constexpr int scale_elements = Utils::elements_per_smem_scale();  // smem_scale length
+    static constexpr int zero_elements = Utils::elements_per_smem_zero();  // smem_zero length
+
+    struct TensorStorage {  // Element buffers; smem_A and smem_B carry explicit alignment.
+      CUTE_ALIGNAS(SmemAlignmentA)
+      cute::ArrayEngine<RealSwappedElementA, cute::cosize_v<SmemLayoutA>> smem_A;
+      CUTE_ALIGNAS(SmemAlignmentB)
+      cute::ArrayEngine<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>> smem_B;
+      cute::ArrayEngine<NonVoidElementScale, scale_elements> smem_scale;
+      cute::ArrayEngine<NonVoidElementZero, zero_elements> smem_zero;
+    } tensors;
+
+    struct TensorMapStorage {  // One TMA descriptor per tensor; declared as a type only.
+      cute::TmaDescriptor smem_tensormap_A;
+      cute::TmaDescriptor smem_tensormap_B;
+      cute::TmaDescriptor smem_tensormap_scale;
+      cute::TmaDescriptor smem_tensormap_zero;
+    };
+
+    using PipelineStorage = typename MainloopPipeline::SharedStorage;
+    PipelineStorage pipeline;
+  };
+
+  using TensorStorage = typename SharedStorage::TensorStorage;
+  using TensorMapStorage = typename SharedStorage::TensorMapStorage;
+  using PipelineStorage = typename SharedStorage::PipelineStorage;
+
+  static constexpr bool IsGroupedGemmKernel = !cute::is_same_v<InternalStrideA, StrideA>;
+
+  // Host-side kernel arguments for this collective; to_underlying_arguments() turns them into
+  // Params. The scale/zero pointers default to null and chunk_size defaults to 0.
+  struct Arguments {
+    ElementA const** ptr_A;
+    StrideA dA;
+    ElementB const** ptr_B;
+    StrideB dB;
+    ElementScale const** ptr_S = nullptr;
+    NonVoidStrideScale const* dS{};
+    int chunk_size = 0;  // NOTE(review): presumably the scale group extent along K - confirm
+    ElementZero const** ptr_Z = nullptr;
+  };
+
+  // Device-side kernel params, produced from Arguments by to_underlying_arguments().
+  struct Params {
+    // Assumption: StrideA is congruent with Problem_MK
+    using LayoutA = decltype(detail::get_gmem_layout(
+        repeat_like(InternalSwappedStrideA{}, int32_t(0)), InternalSwappedStrideA{}));
+    using LayoutB = decltype(detail::get_gmem_layout(
+        repeat_like(InternalSwappedStrideB{}, int32_t(0)), InternalSwappedStrideB{}));
+
+    using TMA_A = decltype(make_tma_copy<TmaElementA>(
+        GmemTiledCopyA{},
+        make_tensor(detail::get_logical_ptr(static_cast<SwappedElementA const*>(nullptr)),
+                    LayoutA{}),
+        SmemLayoutA{}(_, _, cute::Int<0>{}),
+        make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
+        size<1>(ClusterShape{})));  // mcast along N mode for this M load, if any
+    // Assumption: StrideB is congruent with Problem_NK
+    using TMA_B = decltype(make_tma_copy(
+        GmemTiledCopyB{},
+        make_tensor(detail::get_logical_ptr(static_cast<SwappedElementB const*>(nullptr)),
+                    LayoutB{}),
+        SmemLayoutB{}(_, _, cute::Int<0>{}),
+        make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})),
+        size<0>(ClusterShape{})));  // mcast along M mode for this N load, if any
+
+    using TMA_Scale = decltype(make_tma_copy<TmaElementScale>(
+        GmemTiledCopyScale{},
+        make_tensor(detail::get_logical_ptr(static_cast<NonVoidElementScale const*>(nullptr)),
+                    repeat_like(NonVoidStrideScale{}, int32_t(0)), NonVoidStrideScale{}),
+        SmemLayoutScale{}(_, _, cute::Int<0>{}), ScaleTileShape{},
+        _1{}));  // mcast along N mode for this M load, if any. Scale is ALWAYS loaded with A for RF
+                 // kernel
+
+    using TMA_Zero = decltype(make_tma_copy(
+        GmemTiledCopyScale{},
+        make_tensor(detail::get_logical_ptr(static_cast<NonVoidElementZero const*>(nullptr)),
+                    repeat_like(NonVoidStrideScale{}, int32_t(0)), NonVoidStrideScale{}),
+        SmemLayoutScale{}(_, _, cute::Int<0>{}), ScaleTileShape{},
+        _1{}));  // mcast along N mode for this M load, if any. Scale is ALWAYS loaded with A for RF
+                 // kernel
+
+    TMA_A tma_load_a;
+    TMA_B tma_load_b;
+    uint32_t tma_transaction_bytes = TmaTransactionBytes;
+    TMA_Scale tma_load_scale;
+    TMA_Zero tma_load_zero;  // same copy atom, smem layout and tile shape as tma_load_scale
+    void* tensormaps;
+    SwappedElementA const** ptr_A;
+    SwappedStrideA ptr_dA;
+    SwappedElementB const** ptr_B;
+    SwappedStrideB ptr_dB;
+    NonVoidElementScale const** ptr_S;
+    NonVoidStrideScale const* dS;
+    NonVoidElementZero const** ptr_Z;
+    int64_t scale_k;
+    int chunk_size;  // must stay declared before reload_factor, whose default reads it
+    int reload_factor = (chunk_size + size<2>(TileShape{}) - 1) / size<2>(TileShape{});
+    InternalSwappedStrideA dA;
+    InternalSwappedStrideB dB;
+  };
+
+  //
+  // Methods
+  //
+
+  template <class ProblemShape>
+  static constexpr Params to_underlying_arguments(ProblemShape problem_shapes,
+                                                  Arguments const& args, void* workspace) {
+    // Converts host Arguments into device Params. The tensor shapes (grouped gemm only) and pointers
+    // below only seed the tensormap/tma desc; they are replaced before the initial tma load.
+    auto init_shape = repeat_like(typename ProblemShape::UnderlyingProblemShape{}, int32_t(1));
+    auto init_M = get<0>(init_shape);
+    auto init_N = get<1>(init_shape);
+    auto init_K = get<2>(init_shape);
+
+    if constexpr (SwapAB) {
+      init_M = get<1>(init_shape);
+      init_N = get<0>(init_shape);
+    }
+    // Batches/Groups are managed by using appropriate pointers to input matrices
+    const uint32_t mock_L = 1;
+    SwappedElementA const* ptr_A_first_batch;
+    SwappedElementB const* ptr_B_first_batch;
+    SwappedStrideA ptr_dA;
+    SwappedStrideB ptr_dB;
+    InternalSwappedStrideA dA;
+    InternalSwappedStrideB dB;
+
+    if constexpr (not SwapAB) {
+      ptr_A_first_batch = reinterpret_cast<SwappedElementA const*>(args.ptr_A);
+      ptr_B_first_batch = reinterpret_cast<SwappedElementB const*>(args.ptr_B);
+    } else {
+      ptr_A_first_batch = reinterpret_cast<SwappedElementA const*>(args.ptr_B);
+      ptr_B_first_batch = reinterpret_cast<SwappedElementB const*>(args.ptr_A);
+    }
+
+    if constexpr (IsGroupedGemmKernel) {
+      // Strides for Grouped Gemm will be replaced prior to the first access regardless.
+      if constexpr (not SwapAB) {
+        ptr_dA = args.dA;
+        ptr_dB = args.dB;
+      } else {
+        ptr_dA = args.dB;
+        ptr_dB = args.dA;
+      }
+      dA = InternalSwappedStrideA{};
+      if constexpr (is_layout<InternalSwappedStrideA>::value) {  // layout-typed stride
+        dA = make_layout(transform_leaf(dA.shape(),
+                                        [](auto x) {
+                                          if constexpr (not is_static_v<decltype(x)>) {
+                                            return static_cast<decltype(x)>(1);  // dynamic extent -> 1
+                                          } else {
+                                            return x;
+                                          }
+                                        }),
+                         dA.stride());
+      }
+      dB = InternalSwappedStrideB{};
+    } else {
+      // Tensor shapes for Ptr-Array are initialized correctly only here.
+      auto problem_shape_MNK = problem_shapes.get_host_problem_shape(0);
+      init_M = get<0>(problem_shape_MNK);
+      init_N = get<1>(problem_shape_MNK);
+      init_K = get<2>(problem_shape_MNK);
+
+      if constexpr (not SwapAB) {
+        dA = args.dA;
+        dB = args.dB;
+      } else {
+        dA = args.dB;
+        dB = args.dA;
+      }
+      ptr_dA = SwappedStrideA{};
+      ptr_dB = SwappedStrideB{};
+    }
+    Tensor tensor_a = make_tensor(ptr_A_first_batch,
+                                  detail::get_gmem_layout(make_shape(init_M, init_K, mock_L), dA));
+    Tensor tensor_b = make_tensor(ptr_B_first_batch,
+                                  detail::get_gmem_layout(make_shape(init_N, init_K, mock_L), dB));
+
+    typename Params::TMA_A tma_load_a = make_tma_copy<TmaElementA>(
+        GmemTiledCopyA{}, tensor_a, SmemLayoutA{}(_, _, cute::Int<0>{}),
+        make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
+        size<1>(ClusterShape{}));  // mcast along N mode for this M load, if any
+    typename Params::TMA_B tma_load_b =
+        make_tma_copy(GmemTiledCopyB{}, tensor_b, SmemLayoutB{}(_, _, cute::Int<0>{}),
+                      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})),
+                      size<0>(ClusterShape{}));  // mcast along M mode for this N load, if any
+    typename Params::TMA_Scale tma_load_scale{};  // assigned below in the scaled modes
+    typename Params::TMA_Zero tma_load_zero{};    // assigned below for ConvertAndScaleWithZero
+
+    void* tensormaps = workspace;
+    auto args_setup = [&](auto ptr_A, auto ptr_B, int64_t scale_k = 0, int chunk_size = 0,
+                          int reload_factor = 1) -> Params {  // by-ref capture: sees later assignments
+      return {tma_load_a,
+              tma_load_b,
+              TmaTransactionBytes,
+              tma_load_scale,
+              tma_load_zero,
+              tensormaps,
+              reinterpret_cast<SwappedElementA const**>(ptr_A),
+              ptr_dA,
+              reinterpret_cast<SwappedElementB const**>(ptr_B),
+              ptr_dB,
+              reinterpret_cast<NonVoidElementScale const**>(args.ptr_S),
+              args.dS,
+              reinterpret_cast<NonVoidElementZero const**>(args.ptr_Z),
+              scale_k,
+              chunk_size,
+              reload_factor,
+              dA,
+              dB};
+    };
+
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      return SwapAB ? args_setup(args.ptr_B, args.ptr_A) : args_setup(args.ptr_A, args.ptr_B);
+    } else if constexpr (ModeHasScales) {
+      auto fake_scale_k = 1;  // placeholder scale extent along K, used only to build the descriptor
+      ElementScale const* ptr_S = reinterpret_cast<ElementScale const*>(args.ptr_S);
+      StrideScale dS{};
+      Tensor tensor_scale = make_tensor(detail::get_logical_ptr(ptr_S),
+                                        make_layout(make_shape(init_M, fake_scale_k, mock_L), dS));
+      tma_load_scale = make_tma_copy<TmaElementScale>(
+          GmemTiledCopyScale{}, tensor_scale, SmemLayoutScale{}(_, _, cute::Int<0>{}),
+          ScaleTileShape{}, _1{});  // mcast along N mode for this M load, if any
+
+      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+        return SwapAB
+                   ? args_setup(args.ptr_B, args.ptr_A, fake_scale_k, args.chunk_size,
+                                (args.chunk_size + size<2>(TileShape{}) - 1) / size<2>(TileShape{}))
+                   : args_setup(
+                         args.ptr_A, args.ptr_B, fake_scale_k, args.chunk_size,
+                         (args.chunk_size + size<2>(TileShape{}) - 1) / size<2>(TileShape{}));
+      } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+        ElementZero const* ptr_Z = reinterpret_cast<ElementZero const*>(args.ptr_Z);
+        Tensor tensor_zero = make_tensor(detail::get_logical_ptr(ptr_Z),
+                                         make_layout(make_shape(init_M, fake_scale_k, mock_L), dS));
+        tma_load_zero = make_tma_copy(GmemTiledCopyScale{}, tensor_zero,
+                                      SmemLayoutScale{}(_, _, cute::Int<0>{}), ScaleTileShape{},
+                                      _1{});  // mcast along N mode for this M load, if any
+        return SwapAB
+                   ? args_setup(args.ptr_B, args.ptr_A, fake_scale_k, args.chunk_size,
+                                (args.chunk_size + size<2>(TileShape{}) - 1) / size<2>(TileShape{}))
+                   : args_setup(
+                         args.ptr_A, args.ptr_B, fake_scale_k, args.chunk_size,
+                         (args.chunk_size + size<2>(TileShape{}) - 1) / size<2>(TileShape{}));
+      } else {
+        static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                      "Conversion mode not handled in to_underlying_arguments.");
+      }
+    } else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                    "Conversion mode not handled in to_underlying_arguments.");
+    }
+  }
+
+  template <class ProblemShape>
+  static size_t get_workspace_size(ProblemShape const& problem_shape, Arguments const& args,
+                                   int sm_count) {
+    // The workspace provides gmem space for input tensormap copies, one per input tensor for
+    // each SM: A tensormap copies followed by B tensormap copies, then scale and zero if used.
+    auto bytes_for_tensormaps = [sm_count](uint32_t tensormaps_per_sm) {
+      constexpr size_t BytesPerTensorMap = sizeof(cute::TmaDescriptor);
+      return tensormaps_per_sm * BytesPerTensorMap * sm_count;
+    };
+
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      // A and B tensormaps only
+      constexpr uint32_t NumInputTensorMaps = 2;
+      return bytes_for_tensormaps(NumInputTensorMaps);
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+      // A, B and scale tensormaps
+      constexpr uint32_t NumInputTensorMaps = 3;
+      return bytes_for_tensormaps(NumInputTensorMaps);
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+      // A, B, scale and zeros tensormaps
+      constexpr uint32_t NumInputTensorMaps = 4;
+      return bytes_for_tensormaps(NumInputTensorMaps);
+    } else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                    "Conversion mode not handled in get_workspace_size.");
+    }
+  }
+
+  template <class ProblemShape>
+  static cutlass::Status initialize_workspace(ProblemShape const& problem_shape,
+                                              Arguments const& args, void* workspace,
+                                              cudaStream_t stream,
+                                              CudaHostAdapter* cuda_adapter = nullptr) {
+    return cutlass::Status::kSuccess;  // no workspace initialization is done here; always succeeds
+  }
+
+  // Feasibility check: for every group whose problem shape is visible on the host, verifies TMA
+  // alignment of A/B (plus scale/zero) and that ptr_S/ptr_Z/chunk_size fit the conversion mode.
+  template <class ProblemShape>
+  CUTLASS_HOST_DEVICE static bool can_implement(ProblemShape problem_shapes,
+                                                Arguments const& args) {
+    constexpr int tma_alignment_bits = 128;
+    constexpr int min_tma_aligned_elements_A =
+        tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
+    constexpr int min_tma_aligned_elements_B =
+        tma_alignment_bits / cutlass::sizeof_bits<ElementB>::value;
+    bool implementable = true;
+    if (problem_shapes.is_host_problem_shape_available()) {
+      // Check alignment for all problem sizes
+      for (int i = 0; i < problem_shapes.groups(); i++) {
+        auto [M, N, K, L] = append<4>(problem_shapes.get_host_problem_shape(i), 1);
+        auto get_stride = [](auto stride) {  // a stride may arrive by value or through a pointer
+          if constexpr (cute::is_pointer_v<cute::decay_t<decltype(stride)>>) {
+            return *stride;
+          } else {
+            return stride;
+          }
+        };
+        auto dA = get_stride(args.dA);
+        auto dB = get_stride(args.dB);
+        implementable =
+            implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_A>(
+                                 detail::get_gmem_layout(cute::make_shape(M, K, L), dA));
+        implementable =
+            implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_B>(
+                                 detail::get_gmem_layout(cute::make_shape(N, K, L), dB));
+        if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+          implementable = implementable && (args.ptr_S == nullptr) && (args.ptr_Z == nullptr);
+        } else if constexpr (ModeHasScales) {
+          // chunk_size (default 0) is a divisor below: reject zero first and never divide by it.
+          implementable = implementable && (args.chunk_size != 0) && (args.ptr_S != nullptr);
+          int const scale_mn = SwapAB ? N : M;
+          int const scale_k =
+              args.chunk_size != 0 ? (K + args.chunk_size - 1) / args.chunk_size : 0;
+          constexpr int min_tma_aligned_elements_scale =
+              tma_alignment_bits / cutlass::sizeof_bits<ElementScale>::value;
+          implementable =
+              implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_scale>(
+                                   cute::make_shape(scale_mn, scale_k, L), StrideScale{});
+          implementable = implementable &&
+                          (args.chunk_size == K || ((args.chunk_size % size<2>(TileShape{})) == 0));
+          if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+            implementable = implementable && (args.ptr_Z == nullptr);
+          } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+            constexpr int min_tma_aligned_elements_zero =
+                tma_alignment_bits / cutlass::sizeof_bits<ElementZero>::value;
+            implementable =
+                implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_zero>(
+                                     cute::make_shape(scale_mn, scale_k, L), StrideScale{});
+            implementable = implementable && (args.ptr_Z != nullptr);
+          } else {
+            static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                          "Conversion mode not handled in can_implement.");
+          }
+        } else {
+          static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                        "Conversion mode not handled in can_implement.");
+        }
+      }
+    }
+
+    if (!implementable) {
+      CUTLASS_TRACE_HOST(
+          "  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for "
+          "TMA.\n");
+    }
+    return implementable;
+  }
+
+  static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;  // taken from the dispatch policy
+  static constexpr int K_PIPE_MMAS = 1;
+  static constexpr uint32_t TmaTransactionBytesMK = Utils::compute_tma_transaction_bytes_mk();
+  static constexpr uint32_t TmaTransactionBytesNK = Utils::compute_tma_transaction_bytes_nk();
+  static constexpr uint32_t TmaTransactionBytesExtra = Utils::compute_tma_transaction_bytes_extra();
+  static constexpr uint32_t TmaTransactionBytes =  // sum; default of Params::tma_transaction_bytes
+      TmaTransactionBytesMK + TmaTransactionBytesNK + TmaTransactionBytesExtra;
+
+  // Set up the data needed by this collective for load and mma.
+  // Returns a tuple of tensors. The collective and the kernel layer have the contract that the
+  // returned tuple must contain at least two elements, with the first two elements being:
+  // gA_mkl - The tma tensor, A after a local tile so it has shape  (BLK_M,BLK_K,m,k,l)
+  // gB_nkl - The tma tensor, B after a local tile so it has shape  (BLK_N,BLK_K,n,k,l)
+  // In scaled modes the tiled scale tensor follows, then the tiled zero tensor if zeros are used.
+  template <class ProblemShape_MNKL>
+  CUTLASS_DEVICE auto load_init(ProblemShape_MNKL const& problem_shape_MNKL,
+                                Params const& mainloop_params) const {
+    using X = Underscore;
+    // Separate out problem shape for convenience
+    auto [M, N, K, L] = problem_shape_MNKL;
+    const int32_t mock_L = 1;  // A/B are described with extent 1 here; scale/zero below use L
+
+    // TMA requires special handling of strides to deal with coord codomain mapping
+    // Represent the full tensors -- get these from TMA
+    Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(
+        shape(detail::get_gmem_layout(make_shape(M, K, mock_L), mainloop_params.dA)));  // (m,k,l)
+    Tensor mB_nkl = mainloop_params.tma_load_b.get_tma_tensor(
+        shape(detail::get_gmem_layout(make_shape(N, K, mock_L), mainloop_params.dB)));  // (n,k,l)
+
+    // Make tiled views, defer the slice
+    Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_, _, _),
+                               Step<_1, X, _1>{});  // (BLK_M,BLK_K,m,k,l)
+    Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_, _, _),
+                               Step<X, _1, _1>{});  // (BLK_N,BLK_K,n,k,l)
+
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      return cute::make_tuple(gA_mkl, gB_nkl);
+    } else if constexpr (ModeHasScales) {
+      // Scale extent along K. Derived from the compile-time ScalingGroupSize (truncating division)
+      // instead of the runtime mainloop_params.chunk_size, which an earlier variant divided by.
+      auto scale_k = K / ScalingGroupSize;
+
+      Tensor mS_mkl = mainloop_params.tma_load_scale.get_tma_tensor(
+          make_shape(M, scale_k, L));  // (m,scale_k,l)
+      Tensor gS_mkl = local_tile(mS_mkl, ScaleTileShape{},
+                                 make_coord(_, _));  // (BLK_M,BLK_Scale_K,m,scale_k,l)
+      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+        return cute::make_tuple(gA_mkl, gB_nkl, gS_mkl);
+      } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+        Tensor mZ_mkl = mainloop_params.tma_load_zero.get_tma_tensor(
+            make_shape(M, scale_k, L));  // (m,scale_k,l)
+        Tensor gZ_mkl = local_tile(mZ_mkl, ScaleTileShape{},
+                                   make_coord(_, _));  // (BLK_M,BLK_Scale_K,m,scale_k,l)
+        return cute::make_tuple(gA_mkl, gB_nkl, gS_mkl, gZ_mkl);
+      } else {
+        static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                      "Conversion mode not handled in load_init.");
+      }
+    } else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                    "Conversion mode not handled in load_init.");
+    }
+  }
+
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Issue the TMA loads of A, B and (in scaled modes) scale/zero, one pipeline stage per k tile
+  // Producer Perspective
+  template <class... Ts, class... TMs, class KTileIterator, class BlockCoord>
+  CUTLASS_DEVICE void load(Params const& mainloop_params, MainloopPipeline pipeline,
+                           PipelineState smem_pipe_write, cute::tuple<Ts...> const& load_inputs,
+                           cute::tuple<TMs...> const& input_tensormaps, BlockCoord const& blk_coord,
+                           KTileIterator k_tile_iter, int k_tile_count, int thread_idx,
+                           uint32_t block_rank_in_cluster, TensorStorage& shared_tensors) {
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      static_assert(sizeof...(Ts) == 2, "Direct convert needs two inputs");
+      static_assert(sizeof...(TMs) == 2, "Direct convert needs two tensormaps");
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+      static_assert(sizeof...(Ts) == 3, "Scaled convert needs three inputs");
+      static_assert(sizeof...(TMs) == 3, "Scaled convert needs three tensormaps");
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+      static_assert(sizeof...(Ts) == 4, "Scaled and zero convert needs four inputs");
+      static_assert(sizeof...(TMs) == 4, "Scaled and zero convert needs four tensormaps");
+    } else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                    "Conversion mode not handled in TMA load.");
+    }
+
+    Tensor sA_ = make_tensor(make_smem_ptr(shared_tensors.smem_A.begin()),
+                             SmemLayoutA{});  // (BLK_M,BLK_K,PIPE)
+    Tensor sB_ = make_tensor(make_smem_ptr(shared_tensors.smem_B.begin()),
+                             SmemLayoutB{});                  // (BLK_N,BLK_K,PIPE)
+    Tensor sA = as_position_independent_swizzle_tensor(sA_);  // (BLK_M,BLK_K,PIPE)
+    Tensor sB = as_position_independent_swizzle_tensor(sB_);  // (BLK_N,BLK_K,PIPE)
+
+    //
+    // Prepare the TMA loads for A and B
+    //
+
+    constexpr uint32_t cluster_shape_x = get<0>(typename DispatchPolicy::ClusterShape());
+    uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x,
+                                    block_rank_in_cluster / cluster_shape_x};
+
+    Tensor gA_mkl = get<0>(load_inputs);
+    Tensor gB_nkl = get<1>(load_inputs);
+
+    auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y);
+    auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x);
+
+    // Partition the inputs based on the current block coordinates.
+    auto [m_coord, n_coord, k_coord, l_coord] = blk_coord;
+    Tensor gA = gA_mkl(_, _, m_coord, _, l_coord);  // (BLK_M,BLK_K,k)
+    Tensor gB = gB_nkl(_, _, n_coord, _, l_coord);  // (BLK_N,BLK_K,k)
+
+    // Applies the mapping from block_tma_a
+    Tensor tAgA = block_tma_a.partition_S(gA);  // (TMA,TMA_M,TMA_K,k)
+    Tensor tAsA = block_tma_a.partition_D(sA);  // (TMA,TMA_M,TMA_K,PIPE)
+
+    Tensor tBgB = block_tma_b.partition_S(gB);  // (TMA,TMA_N,TMA_K,k)
+    Tensor tBsB = block_tma_b.partition_D(sB);  // (TMA,TMA_N,TMA_K,PIPE)
+
+    uint16_t mcast_mask_a = 0;
+    uint16_t mcast_mask_b = 0;
+    uint16_t mcast_mask_s = 0;  // never modified in this function; passed as-is to scale/zero loads
+
+    // Issue TmaLoads
+    // Maps the tile -> block, value
+    if constexpr (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>) {
+      auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};  // (m,n) -> block_id
+      for (int n = 0; n < size<1>(block_layout); ++n) {
+        mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x, n, Int<0>{}));
+      }
+    }
+
+    if constexpr (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>) {
+      auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};  // (m,n) -> block_id
+      for (int m = 0; m < size<0>(block_layout); ++m) {
+        mcast_mask_b |= (uint16_t(1) << block_layout(m, cluster_local_block_id.y, Int<0>{}));
+      }
+    }
+
+    auto extra_input_partitions = Utils::partition_extra_tma_inputs(
+        mainloop_params, load_inputs, shared_tensors, cluster_local_block_id, m_coord, l_coord);
+
+    // Mainloop
+    CUTLASS_PRAGMA_NO_UNROLL
+    for (; k_tile_count > 0; --k_tile_count) {
+      // LOCK smem_pipe_write for _writing_
+      pipeline.producer_acquire(smem_pipe_write);
+
+      //
+      // Copy gmem to smem for *k_tile_iter
+      //
+
+      using BarrierType = typename MainloopPipeline::ProducerBarrierType;
+      BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
+
+      int write_stage = smem_pipe_write.index();
+      if (cute::elect_one_sync()) {
+        copy(mainloop_params.tma_load_a.with(get<0>(input_tensormaps), *tma_barrier, mcast_mask_a),
+             tAgA(_, _, _, *k_tile_iter), tAsA(_, _, _, write_stage));
+        copy(mainloop_params.tma_load_b.with(get<1>(input_tensormaps), *tma_barrier, mcast_mask_b),
+             tBgB(_, _, _, *k_tile_iter), tBsB(_, _, _, write_stage));
+      }
+      if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+        // Nothing extra to do.
+      } else if constexpr (ModeHasScales) {
+        // scale copy
+        auto tSgS = get<0>(extra_input_partitions);
+        auto tSsS = get<1>(extra_input_partitions);
+
+        // Index of the scale (and zero) k tile to load from gmem. The divisor is the constant 1, so
+        // a scale tile is loaded for every k tile; per the original note this keeps us from having
+        // to modify tma transaction bytes on the fly. NOTE(review): earlier wording here described
+        // a ceiling divide for chunk_size == K, which does not match the statement below.
+        int const scale_load_k = *k_tile_iter / 1;
+        // Disabled alternative: *k_tile_iter / mainloop_params.reload_factor. Per the original
+        // note that quotient is always 0 when chunk_size == K.
+        if (cute::elect_one_sync()) {
+          copy(mainloop_params.tma_load_scale.with(get<2>(input_tensormaps), *tma_barrier,
+                                                   mcast_mask_s),
+               tSgS(_, _, _, scale_load_k), tSsS(_, _, _, write_stage));
+        }
+
+        if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+          // Nothing extra to do
+        } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+          auto tZgZ = get<2>(extra_input_partitions);
+          auto tZsZ = get<3>(extra_input_partitions);
+          if (cute::elect_one_sync()) {
+            copy(mainloop_params.tma_load_zero.with(get<3>(input_tensormaps), *tma_barrier,
+                                                    mcast_mask_s),
+                 tZgZ(_, _, _, scale_load_k), tZsZ(_, _, _, write_stage));
+          }
+        } else {
+          static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                        "Conversion mode not handled for TMA copy op.");
+        }
+      } else {
+        static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                      "Conversion mode not handled for TMA copy op.");
+      }
+      ++k_tile_iter;
+
+      // Advance smem_pipe_write
+      ++smem_pipe_write;
+    }
+  }
+
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Producer epilogue whose purpose is to prevent early exit of blocks in a Cluster
+  CUTLASS_DEVICE void load_tail(MainloopPipeline pipeline, PipelineState smem_pipe_write) {
+    // The epilogue waits are issued only where elect_one_sync() reports true.
+    bool const is_elected_lane = cute::elect_one_sync() != 0;
+    if (!is_elected_lane) {
+      return;
+    }
+
+    // Wait for every stage to be released by the consumers (all Consumer UNLOCKs). A stage
+    // that was never used is simply acquired, because its phase is still inverted from
+    // make_producer_start_state. This helps avoid early exit of blocks in the Cluster.
+    // The wait starts from the pipeline state passed in by the caller.
+    pipeline.producer_tail(smem_pipe_write);
+  }
+
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  /// Perform a collective-scoped matrix multiply-accumulate
+  /// Consumer Perspective
+  template <class FrgTensorC>
+  CUTLASS_DEVICE void mma(MainloopPipeline pipeline, PipelineState smem_pipe_read,
+                          FrgTensorC& accum, int k_tile_count, int thread_idx,
+                          TensorStorage& shared_tensors, Params const& mainloop_params) {
+    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
+    static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
+    static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
+    static_assert(cute::rank(SwappedSmemLayoutAtomA{}) == 2,
+                  "SwappedSmemLayoutAtomA must be rank 2.");
+    static_assert(cute::rank(SwappedSmemLayoutAtomB{}) == 2,
+                  "SwappedSmemLayoutAtomB must be rank 2.");
+    static_assert(
+        !cute::is_void_v<SwappedSmemCopyAtomA>,
+        "SM90 GMMA mainloops must specify a non-void copy atom for smem sourced instructions.");
+    static_assert(
+        cute::is_void_v<SwappedSmemCopyAtomB>,
+        "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+
+    // Obtain warp index
+    int warp_idx = canonical_warp_idx_sync();
+    [[maybe_unused]] int warp_group_thread_idx = thread_idx % 128;
+
+    Tensor sA_ = make_tensor(make_smem_ptr(shared_tensors.smem_A.begin()),
+                             SmemLayoutA{});                  // (BLK_M,BLK_K,PIPE)
+    Tensor sA = as_position_independent_swizzle_tensor(sA_);  // (BLK_M,BLK_K,PIPE)
+
+    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.begin()),
+                            SmemLayoutB{});  // (BLK_N,BLK_K,PIPE)
+
+    //
+    // Define C accumulators and A/B partitioning
+    //
+
+    // Layout of warp group to thread mapping
+
+    static_assert(stride<0>(typename TiledMma::BLayout{}) == 0 and
+                      size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup,
+                  "Stride of the first mode must be 0 and the size of the mode must be "
+                  "NumThreadsPerWarpGroup");
+
+    constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup;
+    Layout warp_group_thread_layout =
+        make_layout(Int<MmaWarpGroups>{}, Int<NumThreadsPerWarpGroup>{});
+
+    int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0);
+
+    TiledMma tiled_mma;
+    auto mma_thread_slice = tiled_mma.get_thread_slice(thread_idx);
+    Tensor tCsA = mma_thread_slice.partition_A(sA);
+    auto mma_warpgroup_slice = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx));
+
+    // Allocate fragments and descriptors
+    Tensor tCrA_mma =
+        mma_thread_slice.partition_fragment_A(sA(_, _, Int<0>{}));  // (MMA,MMA_M,MMA_K,PIPE)
+    Tensor tCrA_load = [&] {
+      if constexpr (not is_layout<InternalSwappedStrideA>::value) {
+        // Make register tensor with MMA layout
+        return make_fragment_like<RealSwappedElementA>(tCrA_mma);
+      } else {
+        // Make register tensor matching smem layout, converter will take care of de-swizzling
+        return make_tensor_like<RealSwappedElementA>(tCsA(_, _, _, Int<0>{}));
+      }
+    }();
+    Tensor tCsB = mma_warpgroup_slice.partition_B(sB);  // (MMA,MMA_N,MMA_K,PIPE)
+    // tCrB is just a view of the tensor tCsB
+    Tensor tCrB = mma_warpgroup_slice.make_fragment_B(tCsB);  // (MMA,MMA_N,MMA_K,PIPE)
+
+    //
+    // Copy Atom A retiling
+    //
+    auto smem_tiled_copy_A = make_tiled_copy_A(SwappedSmemCopyAtomA{}, tiled_mma);
+    auto smem_thr_copy_A = smem_tiled_copy_A.get_thread_slice(warp_group_thread_idx);
+
+    Tensor tCrA_copy_view = smem_thr_copy_A.retile_D(tCrA_load);  // (CPY,CPY_M,CPY_K)
+
+    // Partition of thread -> shared and thread -> RF
+    auto partitioned_extra_info = Utils::partition_extra_mma_info(mma_thread_slice, shared_tensors);
+    auto copy_partitions_extra_info =
+        Utils::retile_extra_mma_info(tiled_mma, partitioned_extra_info, warp_group_thread_idx);
+
+    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrA_copy_view));      // CPY_M
+    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCrA_copy_view));      // CPY_K
+    CUTE_STATIC_ASSERT_V(size<1>(tCrA_mma) == size<1>(accum));           // MMA_M
+    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));               // N
+    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                // K
+    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));  // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));  // PIPE
+
+    //
+    // PIPELINED MAIN LOOP
+    //
+
+    // We release buffers to producer warps(dma load) with some mmas in flight
+    PipelineState smem_pipe_release = smem_pipe_read;
+
+    multiply_add<ElementAccumulator> fma;
+
+    constexpr int NumMMAsPerChunk = ScalingGroupSize / cute::get<0, 1>(tCsB.shape())();
+    constexpr int NumChunksPerTileK = cute::size<1>(sA.shape())() / ScalingGroupSize;
+    cute::array<decltype(make_fragment_like(accum)), NumChunksPerTileK> intermediate_array;
+
+    constexpr int K_BLOCK_MAX = size<2>(tCrA_load);
+    constexpr int K_WAIT_MAX = cute::min(K_BLOCK_MAX - 1, 7);
+    static_assert(K_BLOCK_MAX >= 4, "Consider increasing TileShapeK");
+
+    ConsumerToken barrier_token = {BarrierStatus::WaitAgain};
+    // First k tile
+    {
+      barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+
+      int read_stage = smem_pipe_read.index();
+
+      ++smem_pipe_read;
+      barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+
+      // copy smem->rmem for A operand
+
+      Utils::copy_tensors_MK(smem_tiled_copy_A, tCsA, tCrA_copy_view, partitioned_extra_info,
+                             copy_partitions_extra_info, 0, read_stage);
+      if (K_BLOCK_MAX > 1) {
+        Utils::copy_tensors_MK(smem_tiled_copy_A, tCsA, tCrA_copy_view, partitioned_extra_info,
+                               copy_partitions_extra_info, 1, read_stage);
+      }
+
+      // src: tCrA_load, dst: tCrA_mma
+      Utils::convert_A_kblock(tCrA_load, tCrA_mma, 0);
+
+      // Unroll the K mode manually to set scale D to 1
+      CUTLASS_PRAGMA_UNROLL
+      for (int chunk_id = 0; chunk_id < NumChunksPerTileK; ++chunk_id) {
+        tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int mma_id = 0; mma_id < NumMMAsPerChunk; ++mma_id) {
+          int k_block = chunk_id * NumMMAsPerChunk + mma_id;
+
+          warpgroup_arrive();
+
+          // (V,M) x (V,N) => (V,M,N)
+          cute::gemm(tiled_mma, tCrA_mma(_, _, k_block), tCrB(_, _, k_block, read_stage),
+                     intermediate_array[chunk_id]);
+          tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+
+          warpgroup_commit_batch();
+
+          if (k_block < K_BLOCK_MAX - 2) {
+            Utils::copy_tensors_MK(smem_tiled_copy_A, tCsA, tCrA_copy_view, partitioned_extra_info,
+                                   copy_partitions_extra_info, k_block + 2, read_stage);
+          }
+          if (k_block < K_BLOCK_MAX - 1) {
+            Utils::convert_A_kblock(tCrA_load, tCrA_mma, k_block + 1);
+          }
+        }
+      }
+
+      warpgroup_wait<0>();
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int chunk_id_ = 0; chunk_id_ < NumChunksPerTileK; ++chunk_id_) {
+        warpgroup_fence_operand(intermediate_array[chunk_id_]);
+
+        // Apply the group-wise scaling
+        // tCrS  ((4, _2, _2), MMA_M, _1)
+        // accum ((2, _2, _2), MMA_M, _1)
+        auto tCrS = cute::get<1>(partitioned_extra_info);
+        for (int mma_m = 0; mma_m < size<1>(accum); mma_m++) {
+          for (int m = 0; m < size<0, 1>(accum); m++) {
+            for (int n = 0; n < size<0, 2>(accum); n++) {
+              for (int e = 0; e < size<0, 0>(accum); e++) {
+                auto accum_coord = make_coord(make_tuple(e, m, n), mma_m, 0);
+                auto scale_coord = make_coord(make_tuple(0, m, 0), mma_m, 0);
+
+                if (chunk_id_ == 0) {
+                  accum(accum_coord) = intermediate_array[chunk_id_](accum_coord) *
+                                       static_cast<float>(tCrS(scale_coord)[0]);
+                } else {
+                  accum(accum_coord) =
+                      fma(intermediate_array[chunk_id_](accum_coord),
+                          static_cast<float>(tCrS(scale_coord)[chunk_id_]), accum(accum_coord));
+                }
+              }
+            }
+          }
+        }
+      }
+
+      --k_tile_count;
+      if (k_tile_count > 0) {
+        // Wait for K_BLOCK_MAX - 1 to be in flight to ensure that it is safe to overwrite the A
+        // registers for the first mma.
+        pipeline.consumer_wait(smem_pipe_read, barrier_token);
+
+        Utils::copy_tensors_MK(smem_tiled_copy_A, tCsA, tCrA_copy_view, partitioned_extra_info,
+                               copy_partitions_extra_info, 0, smem_pipe_read.index());
+
+        Utils::copy_tensors_MK(smem_tiled_copy_A, tCsA, tCrA_copy_view, partitioned_extra_info,
+                               copy_partitions_extra_info, 1, smem_pipe_read.index());
+
+        Utils::convert_A_kblock(tCrA_load, tCrA_mma, 0);
+      }
+    }
+
+    if (k_tile_count == 0) {
+      return;
+    }
+
+    // Mainloop GMMAs
+    CUTLASS_PRAGMA_NO_UNROLL
+    for (; k_tile_count > 1; --k_tile_count) {
+      //
+      // Compute on k_tile
+      //
+
+      int read_stage = smem_pipe_read.index();
+      ++smem_pipe_read;
+
+      // Unroll the K mode manually to set scale D to 1
+      CUTLASS_PRAGMA_UNROLL
+      for (int chunk_id = 0; chunk_id < NumChunksPerTileK; ++chunk_id) {
+        tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int mma_id = 0; mma_id < NumMMAsPerChunk; ++mma_id) {
+          int k_block = chunk_id * NumMMAsPerChunk + mma_id;
+
+          warpgroup_arrive();
+          // (V,M) x (V,N) => (V,M,N)
+          cute::gemm(tiled_mma, tCrA_mma(_, _, k_block), tCrB(_, _, k_block, read_stage),
+                     intermediate_array[chunk_id]);
+          tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+          warpgroup_commit_batch();
+
+          if (k_block == K_BLOCK_MAX - 1) {
+            pipeline.consumer_release(
+                smem_pipe_release);  // UNLOCK smem_pipe_release, done _computing_ on it
+            ++smem_pipe_release;
+          }
+
+          if (k_block == 0) {
+            barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+          }
+
+          if (k_block == K_BLOCK_MAX - 1) {
+            // The last k_block
+
+            warpgroup_wait<0>();
+
+            CUTLASS_PRAGMA_UNROLL
+            for (int chunk_id_ = 0; chunk_id_ < NumChunksPerTileK; ++chunk_id_) {
+              warpgroup_fence_operand(intermediate_array[chunk_id_]);
+
+              // Apply the group-wise scaling
+              auto tCrS = cute::get<1>(partitioned_extra_info);
+              for (int mma_m = 0; mma_m < size<1>(accum); mma_m++) {
+                for (int m = 0; m < size<0, 1>(accum); m++) {
+                  for (int n = 0; n < size<0, 2>(accum); n++) {
+                    for (int e = 0; e < size<0, 0>(accum); e++) {
+                      auto accum_coord = make_coord(make_tuple(e, m, n), mma_m, 0);
+                      auto scale_coord = make_coord(make_tuple(0, m, 0), mma_m, 0);
+
+                      accum(accum_coord) =
+                          fma(intermediate_array[chunk_id_](accum_coord),
+                              static_cast<float>(tCrS(scale_coord)[chunk_id_]), accum(accum_coord));
+                    }
+                  }
+                }
+              }
+            }
+
+            pipeline.consumer_wait(smem_pipe_read, barrier_token);
+
+            // copy scales when passing k_block=0
+            Utils::copy_tensors_MK(smem_tiled_copy_A, tCsA, tCrA_copy_view, partitioned_extra_info,
+                                   copy_partitions_extra_info, 0, smem_pipe_read.index());
+            Utils::copy_tensors_MK(smem_tiled_copy_A, tCsA, tCrA_copy_view, partitioned_extra_info,
+                                   copy_partitions_extra_info, 1, smem_pipe_read.index());
+            Utils::convert_A_kblock(tCrA_load, tCrA_mma, 0);
+          } else {
+            if (k_block < K_BLOCK_MAX - 2) {
+              Utils::copy_tensors_MK(smem_tiled_copy_A, tCsA, tCrA_copy_view,
+                                     partitioned_extra_info, copy_partitions_extra_info,
+                                     k_block + 2, read_stage);
+            }
+            Utils::convert_A_kblock(tCrA_load, tCrA_mma, k_block + 1);
+          }
+        }
+      }
+    }
+
+    {
+      //
+      // Last k tile
+      //
+      Tensor intermediate = make_fragment_like(accum);
+
+      int read_stage = smem_pipe_read.index();
+
+      tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
+
+      // Unroll the K mode manually to set scale D to 1
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < K_BLOCK_MAX; ++k_block) {
+        warpgroup_arrive();
+        // (V,M) x (V,N) => (V,M,N)
+        cute::gemm(tiled_mma, tCrA_mma(_, _, k_block), tCrB(_, _, k_block, read_stage),
+                   intermediate);
+        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+        warpgroup_commit_batch();
+
+        if (k_block == K_BLOCK_MAX - 1) {
+          // release prior barrier
+          pipeline.consumer_release(
+              smem_pipe_release);  // UNLOCK smem_pipe_release, done _computing_ on it
+          ++smem_pipe_release;
+        }
+
+        if (k_block < K_BLOCK_MAX - 2) {
+          Utils::copy_tensors_MK(smem_tiled_copy_A, tCsA, tCrA_copy_view, partitioned_extra_info,
+                                 copy_partitions_extra_info, k_block + 2, read_stage);
+        }
+        if (k_block < K_BLOCK_MAX - 1) {
+          Utils::convert_A_kblock(tCrA_load, tCrA_mma, k_block + 1);
+        }
+
+        if ((k_block + 1) % NumMMAsPerChunk == 0) {
+          tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
+
+          warpgroup_wait<0>();
+          warpgroup_fence_operand(intermediate);
+
+          // Apply the group-wise scaling
+          auto tCrS = cute::get<1>(partitioned_extra_info);
+          for (int mma_m = 0; mma_m < size<1>(accum); mma_m++) {
+            for (int m = 0; m < size<0, 1>(accum); m++) {
+              for (int n = 0; n < size<0, 2>(accum); n++) {
+                for (int e = 0; e < size<0, 0>(accum); e++) {
+                  auto accum_coord = make_coord(make_tuple(e, m, n), mma_m, 0);
+                  auto scale_coord = make_coord(make_tuple(0, m, 0), mma_m, 0);
+                  int scale_idx = k_block / NumMMAsPerChunk;
+
+                  accum(accum_coord) =
+                      fma(intermediate(accum_coord),
+                          static_cast<float>(tCrS(scale_coord)[scale_idx]), accum(accum_coord));
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  /// Consumer epilogue: advance the release state past k_tile_count - 1 tiles, then release one.
+  CUTLASS_DEVICE void mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release,
+                               int k_tile_count) {
+    // Number of stages released by this tail (the variable keeps its original "prologue" name)
+    int prologue_mma_count = 1;
+    k_tile_count -= prologue_mma_count;
+
+    smem_pipe_release.advance(k_tile_count);  // move past every k tile except the last one
+
+    // No wait on outstanding GMMAs is issued here; the original call is left disabled:
+    // warpgroup_wait<0>();
+
+    for (int count = 0; count < prologue_mma_count; ++count) {
+      pipeline.consumer_release(
+          smem_pipe_release);  // UNLOCK smem_pipe_release, done _computing_ on it
+      ++smem_pipe_release;
+    }
+  }
+
+  /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Methods to perform different parts of TMA/Tensormap modifications.
+  // tensormaps_init: copy the TMA descriptors held in the kernel params into shared memory and
+  // return pointers to the descriptor slots of mainloop_params.tensormaps selected by sm_idx.
+  CUTLASS_DEVICE auto tensormaps_init(Params const& mainloop_params,
+                                      TensorMapStorage& shared_tensormaps, int32_t sm_count,
+                                      int32_t sm_idx) {
+    cute::TmaDescriptor* gmem_tensormap =
+        reinterpret_cast<cute::TmaDescriptor*>(mainloop_params.tensormaps);
+    // Slot layout: four consecutive groups of sm_count descriptors, in order A, B, scale, zero
+    cute::TmaDescriptor* tma_desc_a = &gmem_tensormap[sm_idx];
+    cute::TmaDescriptor* tma_desc_b = &gmem_tensormap[sm_idx + sm_count];
+    cute::TmaDescriptor* tma_desc_scale = &gmem_tensormap[sm_idx + 2 * sm_count];
+    cute::TmaDescriptor* tma_desc_zero = &gmem_tensormap[sm_idx + 3 * sm_count];
+
+    // Stage the descriptors from params into smem so they can be modified later
+    Tensor pA_tensormap =
+        make_tensor(mainloop_params.tma_load_a.get_tma_descriptor(), Int<1>{}, Int<1>{});
+    Tensor sA_tensormap =
+        make_tensor(make_smem_ptr(&shared_tensormaps.smem_tensormap_A), Int<1>{}, Int<1>{});
+    Tensor pB_tensormap =
+        make_tensor(mainloop_params.tma_load_b.get_tma_descriptor(), Int<1>{}, Int<1>{});
+    Tensor sB_tensormap =
+        make_tensor(make_smem_ptr(&shared_tensormaps.smem_tensormap_B), Int<1>{}, Int<1>{});
+
+    if (cute::elect_one_sync()) {  // one elected thread copies each descriptor as uint128_t words
+      copy(recast<uint128_t>(pA_tensormap), recast<uint128_t>(sA_tensormap));
+      copy(recast<uint128_t>(pB_tensormap), recast<uint128_t>(sB_tensormap));
+    }
+
+    if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+      Tensor pS_tensormap =
+          make_tensor(mainloop_params.tma_load_scale.get_tma_descriptor(), Int<1>{}, Int<1>{});
+      Tensor sS_tensormap =
+          make_tensor(make_smem_ptr(&shared_tensormaps.smem_tensormap_scale), Int<1>{}, Int<1>{});
+      if (cute::elect_one_sync()) {
+        copy(recast<uint128_t>(pS_tensormap), recast<uint128_t>(sS_tensormap));
+      }
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+      Tensor pZ_tensormap =
+          make_tensor(mainloop_params.tma_load_zero.get_tma_descriptor(), Int<1>{}, Int<1>{});
+      Tensor sZ_tensormap =
+          make_tensor(make_smem_ptr(&shared_tensormaps.smem_tensormap_zero), Int<1>{}, Int<1>{});
+      if (cute::elect_one_sync()) {  // NOTE(review): only zero is staged here, not scale - confirm
+        copy(recast<uint128_t>(pZ_tensormap), recast<uint128_t>(sZ_tensormap));
+      }
+    } else if constexpr (KernelConversionMode != ConversionMode::DirectConvert) {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                    "Conversion mode not handled in tensormaps_init.");
+    }
+    // Warp-wide barrier placed after the elected-thread copies above
+    __syncwarp();
+    // The returned tuple holds 2, 3 or 4 descriptor pointers depending on the conversion mode
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      return cute::make_tuple(tma_desc_a, tma_desc_b);
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+      return cute::make_tuple(tma_desc_a, tma_desc_b, tma_desc_scale);
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+      return cute::make_tuple(tma_desc_a, tma_desc_b, tma_desc_scale, tma_desc_zero);
+    } else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                    "Conversion mode not handled in tensormaps_init.");
+    }
+  }
+
+  // Replace the global address stored in each smem descriptor copy (to be done by single thread)
+  CUTLASS_DEVICE
+  void tensormaps_replace_global_address(TensorMapStorage& shared_tensormaps,
+                                         Params const& mainloop_params, int32_t next_batch) {
+    // A and B always; then scale OR zero by mode. NOTE(review): WithZero never applies ptr_S
+    cute::tma_descriptor_replace_addr_in_shared_mem(shared_tensormaps.smem_tensormap_A,
+                                                    mainloop_params.ptr_A[next_batch]);
+    cute::tma_descriptor_replace_addr_in_shared_mem(shared_tensormaps.smem_tensormap_B,
+                                                    mainloop_params.ptr_B[next_batch]);
+    if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+      cute::tma_descriptor_replace_addr_in_shared_mem(shared_tensormaps.smem_tensormap_scale,
+                                                      mainloop_params.ptr_S[next_batch]);
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+      cute::tma_descriptor_replace_addr_in_shared_mem(shared_tensormaps.smem_tensormap_zero,
+                                                      mainloop_params.ptr_Z[next_batch]);
+    } else if constexpr (KernelConversionMode != ConversionMode::DirectConvert) {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                    "Conversion mode not handled in tensormaps_replace_global_address.");
+    }
+  }
+
+  // Replace dims and strides in the smem descriptor copies using the problem shape and the
+  // strides of group next_group - used only for Grouped GEMM (to be done by single thread)
+  template <class ProblemShape_MNKL>
+  CUTLASS_DEVICE void tensormaps_replace_global_tensor_properties(
+      TensorMapStorage& shared_tensormaps, Params const& mainloop_params, int32_t next_group,
+      ProblemShape_MNKL problem_shape_mnkl) {
+    const uint32_t M = get<0>(problem_shape_mnkl);
+    const uint32_t N = get<1>(problem_shape_mnkl);
+    const uint32_t K = get<2>(problem_shape_mnkl);
+
+    // Replace all dims for consistency; arrays start at extent 1 / stride 0 in every mode
+    constexpr int MaxTensorRank = 5;
+    cute::array<uint32_t, MaxTensorRank> prob_shape_A = {1, 1, 1, 1, 1};
+    cute::array<uint64_t, MaxTensorRank> prob_stride_A = {0, 0, 0, 0, 0};
+    cute::array<uint32_t, MaxTensorRank> prob_shape_B = {1, 1, 1, 1, 1};
+    cute::array<uint64_t, MaxTensorRank> prob_stride_B = {0, 0, 0, 0, 0};
+    cute::array<uint32_t, MaxTensorRank> prob_shape_scale = {1, 1, 1, 1, 1};
+    cute::array<uint64_t, MaxTensorRank> prob_stride_scale = {0, 0, 0, 0, 0};
+    cute::array<uint32_t, MaxTensorRank> prob_shape_zero = {1, 1, 1, 1, 1};
+    cute::array<uint64_t, MaxTensorRank> prob_stride_zero = {0, 0, 0, 0, 0};
+    // Null-based tensors: presumably only their layouts feed the shape/stride fill below
+    SwappedElementA const* ptr_A = nullptr;
+    Tensor tensor_a = make_tensor(
+        ptr_A,
+        detail::get_gmem_layout(make_shape(M, K, Int<1>{}), mainloop_params.ptr_dA[next_group]));
+
+    SwappedElementB const* ptr_B = nullptr;
+    Tensor tensor_b = make_tensor(
+        ptr_B,
+        detail::get_gmem_layout(make_shape(N, K, Int<1>{}), mainloop_params.ptr_dB[next_group]));
+
+    cute::detail::fill_tma_gmem_shape_stride(mainloop_params.tma_load_a, tensor_a, prob_shape_A,
+                                             prob_stride_A);
+    cute::detail::fill_tma_gmem_shape_stride(mainloop_params.tma_load_b, tensor_b, prob_shape_B,
+                                             prob_stride_B);
+
+    if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+      NonVoidElementScale const* ptr_S = nullptr;
+      // K extent of the scales uses ScalingGroupSize (was: K / mainloop_params.chunk_size)
+      auto scale_k = K / ScalingGroupSize;
+      Tensor tensor_scale =
+          make_tensor(detail::get_logical_ptr(ptr_S), make_shape(M, scale_k, Int<1>{}),
+                      mainloop_params.dS[next_group]);
+      cute::detail::fill_tma_gmem_shape_stride(mainloop_params.tma_load_scale, tensor_scale,
+                                               prob_shape_scale, prob_stride_scale);
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+      ElementZero const* ptr_Z = nullptr;
+      // Same K extent as the scales. NOTE(review): the zero tensor reuses the scale stride dS
+      auto scale_k = K / ScalingGroupSize;
+      Tensor tensor_zero =
+          make_tensor(detail::get_logical_ptr(ptr_Z), make_shape(M, scale_k, Int<1>{}),
+                      mainloop_params.dS[next_group]);
+      cute::detail::fill_tma_gmem_shape_stride(mainloop_params.tma_load_zero, tensor_zero,
+                                               prob_shape_zero, prob_stride_zero);
+    } else if constexpr (KernelConversionMode != ConversionMode::DirectConvert) {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                    "Conversion mode not handled in tensormaps_replace_global_tensor_properties.");
+    }
+
+    // Convert element strides to byte strides (element bit width / 8)
+    for (uint64_t& stride : prob_stride_A) {
+      stride = (stride * sizeof_bits_v<SwappedElementA>) / 8;
+    }
+    for (uint64_t& stride : prob_stride_B) {
+      stride = (stride * sizeof_bits_v<SwappedElementB>) / 8;
+    }
+    for (uint64_t& stride : prob_stride_scale) {
+      stride = (stride * sizeof_bits_v<NonVoidElementScale>) / 8;
+    }
+    for (uint64_t& stride : prob_stride_zero) {  // NOTE(review): scale width, not ElementZero
+      stride = (stride * sizeof_bits_v<NonVoidElementScale>) / 8;
+    }
+    // Write the new extents and byte strides into the smem descriptor copies
+    cute::tma_descriptor_replace_dims_strides_in_shared_mem(shared_tensormaps.smem_tensormap_A,
+                                                            prob_shape_A, prob_stride_A);
+    cute::tma_descriptor_replace_dims_strides_in_shared_mem(shared_tensormaps.smem_tensormap_B,
+                                                            prob_shape_B, prob_stride_B);
+
+    if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+      cute::tma_descriptor_replace_dims_strides_in_shared_mem(
+          shared_tensormaps.smem_tensormap_scale, prob_shape_scale, prob_stride_scale);
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+      cute::tma_descriptor_replace_dims_strides_in_shared_mem(shared_tensormaps.smem_tensormap_zero,
+                                                              prob_shape_zero, prob_stride_zero);
+    } else if constexpr (KernelConversionMode != ConversionMode::DirectConvert) {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                    "Conversion mode not handled in tensormaps_replace_global_tensor_properties.");
+    }
+  }
+
+  template <class... TMs, class ProblemShape_MNKL>
+  CUTLASS_DEVICE void tensormaps_perform_update(TensorMapStorage& shared_tensormaps,
+                                                Params const& mainloop_params,
+                                                cute::tuple<TMs...> const& input_tensormaps,
+                                                ProblemShape_MNKL problem_shape_mnkl,
+                                                int32_t next_batch) {
+    if (cute::elect_one_sync()) {  // both replacement helpers run on one elected thread
+      // Replace the global addresses for next_batch (input_tensormaps is not used in this body)
+      tensormaps_replace_global_address(shared_tensormaps, mainloop_params, next_batch);
+
+      if constexpr (IsGroupedGemmKernel) {
+        // Grouped GEMM only: also replace global dims and strides for the next batch
+        tensormaps_replace_global_tensor_properties(shared_tensormaps, mainloop_params, next_batch,
+                                                    problem_shape_mnkl);
+      }
+    }
+  }
+
+  template <class... TMs>
+  CUTLASS_DEVICE void tensormaps_cp_fence_release(TensorMapStorage& shared_tensormaps,
+                                                  cute::tuple<TMs...> const& input_tensormaps) {
+    // Entire warp must do this (i.e. it's aligned); pairs each tuple entry with its smem copy
+    tma_descriptor_cp_fence_release(get<0>(input_tensormaps), shared_tensormaps.smem_tensormap_A);
+    tma_descriptor_cp_fence_release(get<1>(input_tensormaps), shared_tensormaps.smem_tensormap_B);
+    if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+      tma_descriptor_cp_fence_release(get<2>(input_tensormaps),
+                                      shared_tensormaps.smem_tensormap_scale);
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+      tma_descriptor_cp_fence_release(get<3>(input_tensormaps),  // NOTE(review): get<2> skipped
+                                      shared_tensormaps.smem_tensormap_zero);
+    } else if constexpr (KernelConversionMode != ConversionMode::DirectConvert) {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                    "Conversion mode not handled in tensormaps_cp_fence_release.");
+    }
+  }
+
+  // Warp-collective (the instructions are aligned): acquire-fence each descriptor the mode uses
+  template <class... TMs>
+  CUTLASS_DEVICE void tensormaps_fence_acquire(cute::tuple<TMs...> const& input_tensormaps) {
+    cute::tma_descriptor_fence_acquire(get<0>(input_tensormaps));
+    cute::tma_descriptor_fence_acquire(get<1>(input_tensormaps));
+    if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+      cute::tma_descriptor_fence_acquire(get<2>(input_tensormaps));
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+      cute::tma_descriptor_fence_acquire(get<3>(input_tensormaps));  // NOTE(review): get<2> skipped
+    } else if constexpr (KernelConversionMode != ConversionMode::DirectConvert) {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                    "Conversion mode not handled in tensormaps_fence_acquire.");
+    }
+  }
+
+  template <class InputTensors, class ProblemShape_MNKL>
+  CUTLASS_DEVICE InputTensors tensors_perform_update(
+      InputTensors const& input_tensors, [[maybe_unused]] Params const& mainloop_params,
+      [[maybe_unused]] ProblemShape_MNKL problem_shape_mnkl, [[maybe_unused]] int32_t next_batch) {
+    return input_tensors;  // pass-through: the tensors are returned unchanged, other args unused
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_gated_tma_gmma_ss_warpspecialized.hpp b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_gated_tma_gmma_ss_warpspecialized.hpp
new file mode 100644
index 000000000..2974895f5
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_gated_tma_gmma_ss_warpspecialized.hpp
@@ -0,0 +1,630 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "cute/algorithm/functional.hpp"
+#include "cute/algorithm/gemm.hpp"
+#include "cute/arch/cluster_sm90.hpp"
+#include "cute/arch/copy_sm90.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "cute/numeric/arithmetic_tuple.hpp"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/pipeline/pipeline.hpp"
+#include "cutlass/trace.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+using namespace cute;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// WarpSpecialized Mainloop
+template <int Stages, class ClusterShape, class KernelSchedule, class TileShape_, class ElementA_,
+          class StrideA_, class ElementB_, class StrideB_, class TiledMma_, class GmemTiledCopyA_,
+          class SmemLayoutAtomA_, class SmemCopyAtomA_, class TransformA_, class GmemTiledCopyB_,
+          class SmemLayoutAtomB_, class SmemCopyAtomB_, class TransformB_,
+          template <class /* ElementCompute */> class Activation_, bool SwapAB_>
+struct CollectiveMmaGated<MainloopSm90TmaGmmaWarpSpecialized<Stages, ClusterShape, KernelSchedule>,
+                          TileShape_, ElementA_, StrideA_, ElementB_, StrideB_, TiledMma_,
+                          GmemTiledCopyA_, SmemLayoutAtomA_, SmemCopyAtomA_, TransformA_,
+                          GmemTiledCopyB_, SmemLayoutAtomB_, SmemCopyAtomB_, TransformB_,
+                          Activation_, SwapAB_> {
+  static constexpr bool isGated = true;
+  static constexpr bool SwapAB = SwapAB_;
+
+  //
+  // Type Aliases
+  //
+  using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecialized<Stages, ClusterShape, KernelSchedule>;
+  using TileShape = TileShape_;
+  using ElementA = ElementA_;
+  using StrideA = StrideA_;
+  using ElementB = ElementB_;
+  using StrideB = StrideB_;
+  using TiledMma = TiledMma_;
+  using ElementAccumulator = typename TiledMma::ValTypeC;
+  using GmemTiledCopyA = GmemTiledCopyA_;
+  using GmemTiledCopyB = GmemTiledCopyB_;
+  using SmemLayoutAtomA = SmemLayoutAtomA_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  using SmemCopyAtomA = SmemCopyAtomA_;
+  using SmemCopyAtomB = SmemCopyAtomB_;
+  using TransformA = TransformA_;
+  using TransformB = TransformB_;
+  using ArchTag = typename DispatchPolicy::ArchTag;
+  using Activation = Activation_<ElementAccumulator>;
+
+  using ElementAux = cute::conditional_t<SwapAB, ElementA_, ElementB_>;
+  using ValTypeAux =
+      cute::conditional_t<SwapAB, typename TiledMma::ValTypeA, typename TiledMma::ValTypeB>;
+
+  using MainloopPipeline = cutlass::PipelineTmaAsync<DispatchPolicy::Stages>;
+  using PipelineState = cutlass::PipelineState<DispatchPolicy::Stages>;
+
+  using PipelineParams = typename MainloopPipeline::Params;
+
+  static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0,
+                "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0,
+                "SmemLayoutAtom must evenly divide tile shape.");
+
+  static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0,
+                "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0,
+                "SmemLayoutAtom must evenly divide tile shape.");
+
+  // Tile along modes in a way that maximizes the TMA box size.
+  using SmemLayoutA = decltype(tile_to_shape(
+      SmemLayoutAtomA{},
+      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
+      conditional_t<::cutlass::gemm::detail::is_major<0, StrideA>(), Step<_2, _1, _3>,
+                    Step<_1, _2, _3>>{}));
+  using SmemLayoutB = decltype(tile_to_shape(
+      SmemLayoutAtomB{},
+      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
+      conditional_t<::cutlass::gemm::detail::is_major<0, StrideB>(), Step<_2, _1, _3>,
+                    Step<_1, _2, _3>>{}));
+  using SmemLayoutAux = cute::conditional_t<SwapAB, SmemLayoutA, SmemLayoutB>;
+
+  static_assert(DispatchPolicy::Stages >= 2,
+                "Specialization requires Stages set to value 2 or more.");
+  static_assert(
+      cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
+          cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
+      "MMA atom must source both A and B operand from smem_desc for this mainloop.");
+  static_assert(cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD> ||
+                    cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>,
+                "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
+  static_assert(cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD> ||
+                    cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>,
+                "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
+
+  // TMA converts f32 input to tf32 when copying from GMEM to SMEM
+  // For all other types, cast to size equivalent uint type to avoid any rounding by TMA.
+  static constexpr bool ConvertF32toTF32A = cute::is_same_v<float, ElementA>;
+  static constexpr bool ConvertF32toTF32B = cute::is_same_v<float, ElementB>;
+  using InternalElementA =
+      cute::conditional_t<ConvertF32toTF32A, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementA>>>;
+  using InternalElementB =
+      cute::conditional_t<ConvertF32toTF32B, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementB>>>;
+  using InternalElementAux = cute::conditional_t<SwapAB, InternalElementA, InternalElementB>;
+
+  struct SharedStorage {  // Shared-memory state owned by this collective.
+    struct TensorStorage : cute::aligned_struct<128> {
+      cute::array_aligned<typename TiledMma::ValTypeA, cute::cosize_v<SmemLayoutA>> smem_A;
+      cute::array_aligned<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>> smem_B;
+      cute::array_aligned<ValTypeAux, cute::cosize_v<SmemLayoutAux>> smem_Aux;  // A- or B-like
+    } tensors;
+    // Storage required by the mainloop pipeline, kept next to the operand buffers.
+    using PipelineStorage = typename MainloopPipeline::SharedStorage;
+    PipelineStorage pipeline;
+  };
+
+  using TensorStorage = typename SharedStorage::TensorStorage;
+  using PipelineStorage = typename SharedStorage::PipelineStorage;
+
+  // Host side kernel arguments; converted to Params by to_underlying_arguments().
+  struct Arguments {
+    ElementA const* ptr_A;  // base of A; Aux is read M*K*L elements past it when SwapAB
+    StrideA dA;             // stride of the (M,K,L) view of A (and of Aux when SwapAB)
+    ElementB const* ptr_B;  // base of B; Aux is read N*K*L elements past it when !SwapAB
+    StrideB dB;             // stride of the (N,K,L) view of B (and of Aux when !SwapAB)
+    float scale_d0 = 1.0f;  // copied unchanged into Params::scale_d0
+    float scale_d1 = 1.0f;  // copied unchanged into Params::scale_d1
+    uint32_t mma_promotion_interval = 4;  // not read by to_underlying_arguments() here
+  };
+
+  // Device side kernel params: TMA copy objects for A, B and Aux plus the two output scales.
+  struct Params {
+    // Type-only TMA for A (built from a null tensor); StrideA assumed congruent with Problem_MK
+    using TMA_A = decltype(make_tma_copy(
+        GmemTiledCopyA{},
+        make_tensor(static_cast<InternalElementA const*>(nullptr),
+                    repeat_like(StrideA{}, int32_t(0)), StrideA{}),
+        SmemLayoutA{}(_, _, cute::Int<0>{}),
+        make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
+        size<1>(ClusterShape{})));  // mcast along N mode for this M load, if any
+    // Type-only TMA for B (built from a null tensor); StrideB assumed congruent with Problem_NK
+    using TMA_B = decltype(make_tma_copy(
+        GmemTiledCopyB{},
+        make_tensor(static_cast<InternalElementB const*>(nullptr),
+                    repeat_like(StrideB{}, int32_t(0)), StrideB{}),
+        SmemLayoutB{}(_, _, cute::Int<0>{}),
+        make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})),
+        size<0>(ClusterShape{})));  // mcast along M mode for this N load, if any
+    using TMA_Aux = cute::conditional_t<SwapAB, TMA_A, TMA_B>;  // Aux reuses A's or B's TMA type
+    TMA_A tma_load_a;
+    TMA_B tma_load_b;
+    TMA_Aux tma_load_aux;   // filled by to_underlying_arguments() from the offset A/B pointer
+    float scale_d0 = 1.0f;  // copied from Arguments::scale_d0
+    float scale_d1 = 1.0f;  // copied from Arguments::scale_d1
+  };
+
+  //
+  // Methods
+  //
+
+  /// Builds the device-side Params (TMA copies for A, B and Aux, plus the output scales) from
+  /// the host Arguments. Aux starts M*K*L elements past ptr_A when SwapAB, else N*K*L elements
+  /// past ptr_B; that offset is formed in 64 bits so it cannot wrap a 32-bit int.
+  template <class ProblemShape>
+  static constexpr Params to_underlying_arguments(ProblemShape const& problem_shape,
+                                                  Arguments const& args, void* workspace) {
+    (void)workspace;
+    // Optionally append 1s until problem shape is rank-4 (MNKL), in case it is only rank-3 (MNK)
+    auto problem_shape_MNKL = append<4>(problem_shape, 1);
+    auto [M, N, K, L] = problem_shape_MNKL;
+
+    auto ptr_A = reinterpret_cast<InternalElementA const*>(args.ptr_A);
+    auto ptr_B = reinterpret_cast<InternalElementB const*>(args.ptr_B);
+
+    Tensor tensor_a = make_tensor(ptr_A, make_layout(make_shape(M, K, L), args.dA));
+    Tensor tensor_b = make_tensor(ptr_B, make_layout(make_shape(N, K, L), args.dB));
+    typename Params::TMA_A tma_load_a =
+        make_tma_copy(GmemTiledCopyA{}, tensor_a, SmemLayoutA{}(_, _, cute::Int<0>{}),
+                      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
+                      size<1>(ClusterShape{}));  // mcast along N mode for this M load, if any
+    typename Params::TMA_B tma_load_b =
+        make_tma_copy(GmemTiledCopyB{}, tensor_b, SmemLayoutB{}(_, _, cute::Int<0>{}),
+                      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})),
+                      size<0>(ClusterShape{}));  // mcast along M mode for this N load, if any
+
+    if constexpr (SwapAB) {
+      auto ptr_Aux = reinterpret_cast<InternalElementA const*>(args.ptr_A + int64_t(M) * K * L);
+      Tensor tensor_aux = make_tensor(ptr_Aux, make_layout(make_shape(M, K, L), args.dA));
+      typename Params::TMA_Aux tma_load_aux =
+          make_tma_copy(GmemTiledCopyA{}, tensor_aux, SmemLayoutA{}(_, _, cute::Int<0>{}),
+                        make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
+                        size<1>(ClusterShape{}));  // mcast along N mode for this M load, if any
+      return {tma_load_a, tma_load_b, tma_load_aux, args.scale_d0, args.scale_d1};
+    } else {
+      auto ptr_Aux = reinterpret_cast<InternalElementB const*>(args.ptr_B + int64_t(N) * K * L);
+      Tensor tensor_aux = make_tensor(ptr_Aux, make_layout(make_shape(N, K, L), args.dB));
+      typename Params::TMA_Aux tma_load_aux =
+          make_tma_copy(GmemTiledCopyB{}, tensor_aux, SmemLayoutB{}(_, _, cute::Int<0>{}),
+                        make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})),
+                        size<0>(ClusterShape{}));  // mcast along M mode for this N load, if any
+      return {tma_load_a, tma_load_b, tma_load_aux, args.scale_d0, args.scale_d1};
+    }
+  }
+
+  /// Checks that the A (M,K,L) and B (N,K,L) shapes meet the 128-bit TMA alignment minimum.
+  template <class ProblemShape>
+  static bool can_implement(ProblemShape const& problem_shape,
+                            [[maybe_unused]] Arguments const& args) {
+    // Promote a rank-3 (MNK) problem to rank-4 (MNKL) by appending a unit batch mode.
+    auto [M, N, K, L] = append<4>(problem_shape, 1);
+
+    // Express the 128-bit alignment requirement as an element count per operand.
+    constexpr int kTmaAlignBits = 128;
+    constexpr int kAlignElemsA = kTmaAlignBits / cutlass::sizeof_bits<ElementA>::value;
+    constexpr int kAlignElemsB = kTmaAlignBits / cutlass::sizeof_bits<ElementB>::value;
+    bool const a_ok =
+        cutlass::detail::check_alignment<kAlignElemsA>(cute::make_shape(M, K, L), StrideA{});
+    // B is only inspected once A has passed, mirroring short-circuit evaluation.
+    bool const ok = a_ok && cutlass::detail::check_alignment<kAlignElemsB>(
+                                cute::make_shape(N, K, L), StrideB{});
+    if (ok) {
+      return true;
+    }
+    CUTLASS_TRACE_HOST(
+        "  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for "
+        "TMA.\n");
+    return false;
+  }
+
+  static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;  // smem pipeline depth (PIPE mode)
+  static constexpr int K_PIPE_MMAS = 1;  // GMMA batches mma() may leave outstanding
+  static constexpr uint32_t TmaTransactionBytes =  // one stage: A + B + Aux tile bytes
+      (size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) *
+       static_cast<uint32_t>(sizeof_bits<ElementA>::value)) /
+          8 +
+      (size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) *
+       static_cast<uint32_t>(sizeof_bits<ElementB>::value)) /
+          8 +
+      (size<0>(SmemLayoutAux{}) * size<1>(SmemLayoutAux{}) *
+       static_cast<uint32_t>(sizeof_bits<ElementAux>::value)) /
+          8;
+
+  /// Prefetch the TMA descriptors of A, B and Aux -- ideally issued by a single thread.
+  CUTLASS_DEVICE static void prefetch_tma_descriptors(Params const& mainloop_params) {
+    auto const& params = mainloop_params;
+    cute::prefetch_tma_descriptor(params.tma_load_a.get_tma_descriptor());
+    cute::prefetch_tma_descriptor(params.tma_load_b.get_tma_descriptor());
+    cute::prefetch_tma_descriptor(params.tma_load_aux.get_tma_descriptor());
+  }
+
+  /// Set up the data needed by this collective for load and mma. Returns a tuple of tensors;
+  /// by contract with the kernel layer it holds at least two elements, the first two being:
+  /// gA_mkl - The tma tensor, A after a local tile so it has shape  (BLK_M,BLK_K,m,k,l)
+  /// gB_nkl - The tma tensor, B after a local tile so it has shape  (BLK_N,BLK_K,n,k,l)
+  /// This collective appends a third element:
+  /// gAux_xkl - The tma tensor, Aux after a local tile: (BLK_M,BLK_K,m,k,l) if SwapAB,
+  ///            else (BLK_N,BLK_K,n,k,l)
+  template <class ProblemShape_MNKL>
+  CUTLASS_DEVICE auto load_init(ProblemShape_MNKL const& problem_shape_MNKL,
+                                Params const& mainloop_params) const {
+    using X = Underscore;
+    // Separate out problem shape for convenience
+    auto [M, N, K, L] = problem_shape_MNKL;
+
+    // TMA requires special handling of strides to deal with coord codomain mapping
+    // Represent the full tensors -- get these from TMA
+    Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(make_shape(M, K, L));  // (m,k,l)
+    Tensor mB_nkl = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N, K, L));  // (n,k,l)
+
+    // Make tiled views, defer the slice
+    Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_, _, _),
+                               Step<_1, X, _1>{});  // (BLK_M,BLK_K,m,k,l)
+    Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_, _, _),
+                               Step<X, _1, _1>{});  // (BLK_N,BLK_K,n,k,l)
+
+    if constexpr (SwapAB) {
+      Tensor mAux_xkl =
+          mainloop_params.tma_load_aux.get_tma_tensor(make_shape(M, K, L));  // (m,k,l)
+      Tensor gAux_xkl = local_tile(mAux_xkl, TileShape{}, make_coord(_, _, _),
+                                   Step<_1, X, _1>{});  // (BLK_M,BLK_K,m,k,l)
+      return cute::make_tuple(gA_mkl, gB_nkl, gAux_xkl);
+    } else {
+      Tensor mAux_xkl =
+          mainloop_params.tma_load_aux.get_tma_tensor(make_shape(N, K, L));  // (n,k,l)
+      Tensor gAux_xkl = local_tile(mAux_xkl, TileShape{}, make_coord(_, _, _),
+                                   Step<X, _1, _1>{});  // (BLK_N,BLK_K,n,k,l)
+      return cute::make_tuple(gA_mkl, gB_nkl, gAux_xkl);
+    }
+  }
+
+  /// Producer side of the collective: one elected lane streams the A, B and Aux k-tiles from
+  /// gmem to smem via TMA, filling one pipeline stage per k-tile.
+  template <class TensorA, class TensorB, class TensorAux, class KTileIterator, class BlockCoord>
+  CUTLASS_DEVICE void load(Params const& mainloop_params, MainloopPipeline pipeline,
+                           PipelineState smem_pipe_write,
+                           cute::tuple<TensorA, TensorB, TensorAux> const& load_inputs,
+                           BlockCoord const& blk_coord, KTileIterator k_tile_iter, int k_tile_count,
+                           int thread_idx, uint32_t block_rank_in_cluster,
+                           TensorStorage& shared_tensors) {
+    int lane_predicate = cute::elect_one_sync();
+
+    if (lane_predicate) {
+      Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()),
+                              SmemLayoutA{});  // (BLK_M,BLK_K,PIPE)
+      Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()),
+                              SmemLayoutB{});  // (BLK_N,BLK_K,PIPE)
+      Tensor sAux = make_tensor(make_smem_ptr(shared_tensors.smem_Aux.data()), SmemLayoutAux{});
+
+      //
+      // Prepare the TMA loads for A, B and Aux
+      //
+
+      constexpr uint32_t cluster_shape_x = get<0>(typename DispatchPolicy::ClusterShape());
+      uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x,
+                                      block_rank_in_cluster / cluster_shape_x};
+
+      Tensor gA_mkl = get<0>(load_inputs);
+      Tensor gB_nkl = get<1>(load_inputs);
+      Tensor gAux_xkl = get<2>(load_inputs);
+
+      auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y);
+      auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x);
+      auto block_tma_aux = SwapAB
+                               ? mainloop_params.tma_load_aux.get_slice(cluster_local_block_id.y)
+                               : mainloop_params.tma_load_aux.get_slice(cluster_local_block_id.x);
+      // Partition the inputs based on the current block coordinates.
+      auto [m_coord, n_coord, k_coord, l_coord] = blk_coord;
+      Tensor gA = gA_mkl(_, _, m_coord, _, l_coord);  // (BLK_M,BLK_K,k)
+      Tensor gB = gB_nkl(_, _, n_coord, _, l_coord);  // (BLK_N,BLK_K,k)
+      Tensor gAux =
+          SwapAB ? gAux_xkl(_, _, m_coord, _, l_coord) : gAux_xkl(_, _, n_coord, _, l_coord);
+
+      // Applies the mapping from block_tma_a / block_tma_b / block_tma_aux
+      Tensor tAgA = block_tma_a.partition_S(gA);  // (TMA,TMA_M,TMA_K,k)
+      Tensor tAsA = block_tma_a.partition_D(sA);  // (TMA,TMA_M,TMA_K,PIPE)
+
+      Tensor tBgB = block_tma_b.partition_S(gB);  // (TMA,TMA_N,TMA_K,k)
+      Tensor tBsB = block_tma_b.partition_D(sB);  // (TMA,TMA_N,TMA_K,PIPE)
+
+      Tensor tAuxgAux = block_tma_aux.partition_S(gAux);
+      Tensor tAuxsAux = block_tma_aux.partition_D(sAux);
+
+      uint16_t mcast_mask_a = 0;
+      uint16_t mcast_mask_b = 0;
+      uint16_t mcast_mask_aux = 0;
+
+      // Build the multicast masks; they stay 0 unless the copy atom is the MULTICAST variant
+      // Maps the tile -> block, value
+      if constexpr (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>) {
+        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};  // (m,n) -> block_id
+        for (int n = 0; n < size<1>(block_layout); ++n) {
+          mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x, n, Int<0>{}));
+        }
+      }
+
+      if constexpr (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>) {
+        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};  // (m,n) -> block_id
+        for (int m = 0; m < size<0>(block_layout); ++m) {
+          mcast_mask_b |= (uint16_t(1) << block_layout(m, cluster_local_block_id.y, Int<0>{}));
+        }
+      }
+
+      if constexpr (SwapAB) {  // Aux reuses the mask of the operand it is loaded like
+        mcast_mask_aux = mcast_mask_a;
+      } else {
+        mcast_mask_aux = mcast_mask_b;
+      }
+
+      // Mainloop
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (; k_tile_count > 0; --k_tile_count) {
+        // LOCK smem_pipe_write for _writing_
+        pipeline.producer_acquire(smem_pipe_write);
+
+        //
+        // Copy gmem to smem for *k_tile_iter; all three copies use the same stage barrier
+        //
+
+        using BarrierType = typename MainloopPipeline::ProducerBarrierType;
+        BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
+
+        int write_stage = smem_pipe_write.index();
+        copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a),
+             tAgA(_, _, _, *k_tile_iter), tAsA(_, _, _, write_stage));
+        copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b),
+             tBgB(_, _, _, *k_tile_iter), tBsB(_, _, _, write_stage));
+        copy(mainloop_params.tma_load_aux.with(*tma_barrier, mcast_mask_aux),
+             tAuxgAux(_, _, _, *k_tile_iter), tAuxsAux(_, _, _, write_stage));
+        ++k_tile_iter;
+
+        // Advance smem_pipe_write
+        ++smem_pipe_write;
+      }
+    }
+  }
+
+  /// Producer epilogue: waits on the pipeline stages so that blocks of a Cluster do not
+  /// exit early.
+  CUTLASS_DEVICE void load_tail(MainloopPipeline pipeline, PipelineState smem_pipe_write) {
+    // Only the single elected lane performs the waits; every other lane returns at once.
+    bool const is_elected_lane = cute::elect_one_sync() != 0;
+    if (!is_elected_lane) {
+      return;
+    }
+
+    // Issue the epilogue waits. Each stage is either released by the consumers (all
+    // Consumer UNLOCKs) or, if it was never used, is simply acquired because its phase
+    // is still inverted from make_producer_start_state.
+    pipeline.producer_tail(smem_pipe_write);
+  }
+
+
+  /// Consumer side of the collective: per k-tile it issues two GMMA chains sharing one operand,
+  /// accumulating A*B into accum0 and the Aux product (Aux*B if SwapAB, else A*Aux) into accum1.
+  template <class FrgTensorC>
+  CUTLASS_DEVICE void mma(MainloopPipeline pipeline, PipelineState smem_pipe_read,
+                          FrgTensorC& accum0, FrgTensorC& accum1, int k_tile_count, int thread_idx,
+                          TensorStorage& shared_tensors, Params const& mainloop_params) {
+    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
+    static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
+    static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
+    static_assert(cute::rank(SmemLayoutAux{}) == 3, "Smem layout must be rank 3.");
+    static_assert(
+        cute::is_void_v<SmemCopyAtomA>,
+        "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+    static_assert(
+        cute::is_void_v<SmemCopyAtomB>,
+        "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+
+    Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()),
+                            SmemLayoutA{});  // (BLK_M,BLK_K,PIPE)
+    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()),
+                            SmemLayoutB{});  // (BLK_N,BLK_K,PIPE)
+    Tensor sAux = make_tensor(make_smem_ptr(shared_tensors.smem_Aux.data()), SmemLayoutAux{});
+
+    //
+    // Define per-thread A/B/Aux partitioning (the accumulators are supplied by the caller)
+    //
+
+    TiledMma tiled_mma;
+    auto thread_mma = tiled_mma.get_thread_slice(thread_idx);
+
+    Tensor tCsA = thread_mma.partition_A(sA);  // (MMA,MMA_M,MMA_K,PIPE)
+    Tensor tCsB = thread_mma.partition_B(sB);  // (MMA,MMA_N,MMA_K,PIPE)
+
+    // Allocate "fragments/descriptors"
+    Tensor tCrA = thread_mma.make_fragment_A(tCsA);  // (MMA,MMA_M,MMA_K,PIPE)
+    Tensor tCrB = thread_mma.make_fragment_B(tCsB);  // (MMA,MMA_N,MMA_K,PIPE)
+
+    auto tCsAux = [&]() -> auto {
+      if constexpr (SwapAB) {
+        return thread_mma.partition_A(sAux);
+      } else {
+        return thread_mma.partition_B(sAux);
+      }
+    }();
+    auto tCrAux = [&]() -> auto {
+      if constexpr (SwapAB) {
+        return thread_mma.make_fragment_A(tCsAux);
+      } else {
+        return thread_mma.make_fragment_B(tCsAux);
+      }
+    }();
+
+    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum0));  // M
+    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum0));  // N
+    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));    // K
+    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));    // PIPE
+    if constexpr (SwapAB) {
+      CUTE_STATIC_ASSERT_V(size<1>(tCsAux) == size<1>(accum1));  // M
+      CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum1));    // N
+      CUTE_STATIC_ASSERT_V(size<2>(tCsB) == size<2>(tCsAux));    // K
+      CUTE_STATIC_ASSERT_V(size<3>(tCsB) == size<3>(tCsAux));    // PIPE
+    } else {
+      CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum1));    // M
+      CUTE_STATIC_ASSERT_V(size<1>(tCsAux) == size<2>(accum1));  // N
+      CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsAux));    // K
+      CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsAux));    // PIPE
+    }
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));    // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));    // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sAux));  // PIPE
+
+    //
+    // PIPELINED MAIN LOOP
+    //
+    static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS < K_PIPE_MAX),
+                  "ERROR : Incorrect number of MMAs in flight");
+
+    // We release buffers to producer warps(dma load) with some mmas in flight
+    PipelineState smem_pipe_release = smem_pipe_read;
+
+    // Prologue GMMAs
+    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
+
+    tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
+
+    warpgroup_fence_operand(accum0);
+    warpgroup_fence_operand(accum1);
+    CUTLASS_PRAGMA_UNROLL
+    for (int k_tile_prologue = prologue_mma_count; k_tile_prologue > 0; --k_tile_prologue) {
+      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
+      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+
+      int read_stage = smem_pipe_read.index();
+      warpgroup_arrive();
+      // Unroll the K mode manually to set scale D to 1 after both accumulators' first GMMA
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
+        // (V,M,K) x (V,N,K) => (V,M,N)
+        cute::gemm(tiled_mma, tCrA(_, _, k_block, read_stage), tCrB(_, _, k_block, read_stage),
+                   accum0);
+        if constexpr (SwapAB) {
+          cute::gemm(tiled_mma, tCrAux(_, _, k_block, read_stage), tCrB(_, _, k_block, read_stage),
+                     accum1);
+        } else {
+          cute::gemm(tiled_mma, tCrA(_, _, k_block, read_stage), tCrAux(_, _, k_block, read_stage),
+                     accum1);
+        }
+        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+      }
+
+      warpgroup_commit_batch();
+
+      ++smem_pipe_read;
+    }
+
+    warpgroup_fence_operand(accum0);
+    warpgroup_fence_operand(accum1);
+    // Mainloop GMMAs
+    k_tile_count -= prologue_mma_count;
+
+    CUTLASS_PRAGMA_NO_UNROLL
+    for (; k_tile_count > 0; --k_tile_count) {
+      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
+      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+
+      //
+      // Compute on k_tile: both products for this stage go into one committed GMMA batch
+      //
+
+      int read_stage = smem_pipe_read.index();
+      warpgroup_fence_operand(accum0);
+      warpgroup_fence_operand(accum1);
+      warpgroup_arrive();
+      // Unroll the K mode manually to set scale D to 1 after both accumulators' first GMMA
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
+        // (V,M,K) x (V,N,K) => (V,M,N)
+        cute::gemm(tiled_mma, tCrA(_, _, k_block, read_stage), tCrB(_, _, k_block, read_stage),
+                   accum0);
+        if constexpr (SwapAB) {
+          cute::gemm(tiled_mma, tCrAux(_, _, k_block, read_stage), tCrB(_, _, k_block, read_stage),
+                     accum1);
+        } else {
+          cute::gemm(tiled_mma, tCrA(_, _, k_block, read_stage), tCrAux(_, _, k_block, read_stage),
+                     accum1);
+        }
+        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+      }
+      warpgroup_commit_batch();
+
+      /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write
+      /// is consumed
+      warpgroup_wait<K_PIPE_MMAS>();
+      warpgroup_fence_operand(accum0);
+      warpgroup_fence_operand(accum1);
+
+      // UNLOCK smem_pipe_release, done _computing_ on it
+      pipeline.consumer_release(smem_pipe_release);
+
+      // Advance smem_pipe_read and smem_pipe_release
+      ++smem_pipe_read;
+      ++smem_pipe_release;
+    }
+
+    warpgroup_fence_operand(accum0);
+    warpgroup_fence_operand(accum1);
+  }
+
+  /// Consumer epilogue: releases the pipeline stages that mma() left unreleased.
+  CUTLASS_DEVICE void mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release,
+                               int k_tile_count) {
+    // mma() runs up to K_PIPE_MMAS prologue tiles without releasing and then releases one
+    // stage per remaining tile, so `pending_releases` stages are still held afterwards.
+    int const pending_releases = min(K_PIPE_MMAS, k_tile_count);
+    int const released_in_mainloop = k_tile_count - pending_releases;
+    // Skip past the stages that were already handed back to the producer.
+    smem_pipe_release.advance(released_in_mainloop);
+
+    // Wait on all GMMAs to complete
+    warpgroup_wait<0>();
+
+    // UNLOCK the remaining stages, done _computing_ on them
+    for (int remaining = pending_releases; remaining > 0; --remaining) {
+      pipeline.consumer_release(smem_pipe_release);
+      ++smem_pipe_release;
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_gated_tma_gmma_ss_warpspecialized_fp8.hpp b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_gated_tma_gmma_ss_warpspecialized_fp8.hpp
new file mode 100644
index 000000000..61762147e
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_gated_tma_gmma_ss_warpspecialized_fp8.hpp
@@ -0,0 +1,644 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "cute/algorithm/functional.hpp"
+#include "cute/algorithm/gemm.hpp"
+#include "cute/arch/cluster_sm90.hpp"
+#include "cute/arch/copy_sm90.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "cute/numeric/arithmetic_tuple.hpp"
+#include "cutlass/cutlass.h"
+#include "cutlass/epilogue/thread/activation.h"
+#include "cutlass/gemm/collective/fp8_accumulation.hpp"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/trace.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+using namespace cute;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// WarpSpecialized Mainloop
+template <int Stages, class ClusterShape, class KernelSchedule, class TileShape_, class ElementA_,
+          class StrideA_, class ElementB_, class StrideB_, class TiledMma_, class GmemTiledCopyA_,
+          class SmemLayoutAtomA_, class SmemCopyAtomA_, class TransformA_, class GmemTiledCopyB_,
+          class SmemLayoutAtomB_, class SmemCopyAtomB_, class TransformB_,
+          template <class /* ElementCompute */> class Activation_, bool SwapAB_>
+struct CollectiveMmaGated<
+    MainloopSm90TmaGmmaWarpSpecializedFP8<Stages, ClusterShape, KernelSchedule>, TileShape_,
+    ElementA_, StrideA_, ElementB_, StrideB_, TiledMma_, GmemTiledCopyA_, SmemLayoutAtomA_,
+    SmemCopyAtomA_, TransformA_, GmemTiledCopyB_, SmemLayoutAtomB_, SmemCopyAtomB_, TransformB_,
+    Activation_, SwapAB_> {
+  static constexpr bool isGated = true;
+  static constexpr bool SwapAB = SwapAB_;
+
+  //
+  // Type Aliases
+  //
+  using DispatchPolicy =
+      MainloopSm90TmaGmmaWarpSpecializedFP8<Stages, ClusterShape, KernelSchedule>;
+  using TileShape = TileShape_;
+  using ElementA = ElementA_;
+  using StrideA = StrideA_;
+  using ElementB = ElementB_;
+  using StrideB = StrideB_;
+  using TiledMma = TiledMma_;
+  using ElementAccumulator = typename TiledMma::ValTypeC;
+  using GmemTiledCopyA = GmemTiledCopyA_;
+  using GmemTiledCopyB = GmemTiledCopyB_;
+  using SmemLayoutAtomA = SmemLayoutAtomA_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  using SmemCopyAtomA = SmemCopyAtomA_;
+  using SmemCopyAtomB = SmemCopyAtomB_;
+  using TransformA = TransformA_;
+  using TransformB = TransformB_;
+  using ArchTag = typename DispatchPolicy::ArchTag;
+  using Activation = Activation_<ElementAccumulator>;
+
+  using ElementAux = cute::conditional_t<SwapAB, ElementA_, ElementB_>;
+  using ValTypeAux =
+      cute::conditional_t<SwapAB, typename TiledMma::ValTypeA, typename TiledMma::ValTypeB>;
+
+  using MainloopPipeline = cutlass::PipelineTmaAsync<DispatchPolicy::Stages>;
+  using PipelineState = cutlass::PipelineState<DispatchPolicy::Stages>;
+
+  using PipelineParams = typename MainloopPipeline::Params;
+
+  static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0,
+                "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0,
+                "SmemLayoutAtom must evenly divide tile shape.");
+
+  static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0,
+                "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0,
+                "SmemLayoutAtom must evenly divide tile shape.");
+
+  // Tile along modes in a way that maximizes the TMA box size.
+  using SmemLayoutA = decltype(tile_to_shape(
+      SmemLayoutAtomA{},
+      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
+      conditional_t<::cutlass::gemm::detail::is_major<0, StrideA>(), Step<_2, _1, _3>,
+                    Step<_1, _2, _3>>{}));
+  using SmemLayoutB = decltype(tile_to_shape(
+      SmemLayoutAtomB{},
+      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
+      conditional_t<::cutlass::gemm::detail::is_major<0, StrideB>(), Step<_2, _1, _3>,
+                    Step<_1, _2, _3>>{}));
+  using SmemLayoutAux = cute::conditional_t<SwapAB, SmemLayoutA, SmemLayoutB>;
+
+  static_assert(DispatchPolicy::Stages >= 2,
+                "Specialization requires Stages set to value 2 or more.");
+  static_assert(
+      cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
+          cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
+      "MMA atom must source both A and B operand from smem_desc for this mainloop.");
+  static_assert(cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD> ||
+                    cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>,
+                "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
+  static_assert(cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD> ||
+                    cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>,
+                "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
+
+  struct SharedStorage {  // Shared-memory state owned by this collective.
+    struct TensorStorage : cute::aligned_struct<128> {
+      cute::array_aligned<typename TiledMma::ValTypeA, cute::cosize_v<SmemLayoutA>> smem_A;
+      cute::array_aligned<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>> smem_B;
+      cute::array_aligned<ValTypeAux, cute::cosize_v<SmemLayoutAux>> smem_Aux;  // A- or B-like
+    } tensors;
+    // Storage required by the mainloop pipeline, kept next to the operand buffers.
+    using PipelineStorage = typename MainloopPipeline::SharedStorage;
+    PipelineStorage pipeline;
+  };
+
+  using TensorStorage = typename SharedStorage::TensorStorage;
+  using PipelineStorage = typename SharedStorage::PipelineStorage;
+
+  // Host side kernel arguments; converted to Params by to_underlying_arguments().
+  struct Arguments {
+    ElementA const* ptr_A;  // base of A; Aux is read M*K*L elements past it when SwapAB
+    StrideA dA;             // stride of the (M,K,L) view of A (and of Aux when SwapAB)
+    ElementB const* ptr_B;  // base of B; Aux is read N*K*L elements past it when !SwapAB
+    StrideB dB;             // stride of the (N,K,L) view of B (and of Aux when !SwapAB)
+    float scale_d0 = 1.0f;  // copied unchanged into Params::scale_d0
+    float scale_d1 = 1.0f;  // copied unchanged into Params::scale_d1
+    uint32_t mma_promotion_interval = 4;  // copied into Params::mma_promotion_interval
+  };
+
+  // Device side kernel params: TMA copies for A, B and Aux, output scales, promotion interval.
+  struct Params {
+    // Type-only TMA for A; sliced with a static stage index exactly as the value built in
+    // to_underlying_arguments(). Assumption: StrideA is congruent with Problem_MK
+    using TMA_A = decltype(make_tma_copy(
+        GmemTiledCopyA{},
+        make_tensor(static_cast<ElementA const*>(nullptr), repeat_like(StrideA{}, int32_t(0)),
+                    StrideA{}),
+        SmemLayoutA{}(_, _, Int<0>{}), make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
+        size<1>(ClusterShape{})));  // mcast along N mode for this M load, if any
+    // Type-only TMA for B. Assumption: StrideB is congruent with Problem_NK
+    using TMA_B = decltype(make_tma_copy(
+        GmemTiledCopyB{},
+        make_tensor(static_cast<ElementB const*>(nullptr), repeat_like(StrideB{}, int32_t(0)),
+                    StrideB{}),
+        SmemLayoutB{}(_, _, Int<0>{}), make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})),
+        size<0>(ClusterShape{})));  // mcast along M mode for this N load, if any
+    using TMA_Aux = cute::conditional_t<SwapAB, TMA_A, TMA_B>;  // Aux reuses A's or B's TMA type
+    TMA_A tma_load_a;
+    TMA_B tma_load_b;
+    TMA_Aux tma_load_aux;
+    float scale_d0 = 1.0f; float scale_d1 = 1.0f;  // copied from Arguments
+    uint32_t mma_promotion_interval = 4;
+  };
+
+  //
+  // Methods
+  //
+
+  /// Builds the device-side Params from host Arguments: TMA copies for A, B and Aux, the two
+  /// output scales and the promotion interval. The Aux offset is computed in 64 bits so that
+  /// M*K*L (SwapAB) or N*K*L (otherwise) cannot wrap a 32-bit int.
+  template <class ProblemShape>
+  static constexpr Params to_underlying_arguments(ProblemShape const& problem_shape,
+                                                  Arguments const& args, void* workspace) {
+    (void)workspace;
+    // Optionally append 1s until problem shape is rank-4 (MNKL), in case it is only rank-3 (MNK)
+    auto problem_shape_MNKL = append<4>(problem_shape, 1);
+    auto [M, N, K, L] = problem_shape_MNKL;
+
+    Tensor tensor_a = make_tensor(args.ptr_A, make_layout(make_shape(M, K, L), args.dA));
+    Tensor tensor_b = make_tensor(args.ptr_B, make_layout(make_shape(N, K, L), args.dB));
+    typename Params::TMA_A tma_load_a =
+        make_tma_copy(GmemTiledCopyA{}, tensor_a, SmemLayoutA{}(_, _, cute::Int<0>{}),
+                      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
+                      size<1>(ClusterShape{}));  // mcast along N mode for this M load, if any
+    typename Params::TMA_B tma_load_b =
+        make_tma_copy(GmemTiledCopyB{}, tensor_b, SmemLayoutB{}(_, _, cute::Int<0>{}),
+                      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})),
+                      size<0>(ClusterShape{}));  // mcast along M mode for this N load, if any
+    // Aux is a second operand of the same shape and stride, read past the first one.
+    if constexpr (SwapAB) {
+      ElementA const* ptr_Aux = args.ptr_A + int64_t(M) * K * L;
+      Tensor tensor_aux = make_tensor(ptr_Aux, make_layout(make_shape(M, K, L), args.dA));
+      typename Params::TMA_Aux tma_load_aux =
+          make_tma_copy(GmemTiledCopyA{}, tensor_aux, SmemLayoutA{}(_, _, cute::Int<0>{}),
+                        make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
+                        size<1>(ClusterShape{}));  // mcast along N mode for this M load, if any
+      return {tma_load_a,    tma_load_b,    tma_load_aux,
+              args.scale_d0, args.scale_d1, args.mma_promotion_interval};
+    } else {
+      ElementB const* ptr_Aux = args.ptr_B + int64_t(N) * K * L;
+      Tensor tensor_aux = make_tensor(ptr_Aux, make_layout(make_shape(N, K, L), args.dB));
+      typename Params::TMA_Aux tma_load_aux =
+          make_tma_copy(GmemTiledCopyB{}, tensor_aux, SmemLayoutB{}(_, _, cute::Int<0>{}),
+                        make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})),
+                        size<0>(ClusterShape{}));  // mcast along M mode for this N load, if any
+      return {tma_load_a,    tma_load_b,    tma_load_aux,
+              args.scale_d0, args.scale_d1, args.mma_promotion_interval};
+    }
+  }
+
+  template <class ProblemShape>
+  static bool can_implement(ProblemShape const& problem_shape,
+                            [[maybe_unused]] Arguments const& args) {
+    auto [M, N, K, L] = append<4>(problem_shape, 1);
+
+    // Required TMA alignment in bits, converted to an element count for each operand type.
+    constexpr int kTmaAlignBits = 128;
+    constexpr int kAlignElemsA = kTmaAlignBits / cutlass::sizeof_bits<ElementA>::value;
+    constexpr int kAlignElemsB = kTmaAlignBits / cutlass::sizeof_bits<ElementB>::value;
+
+    bool const a_aligned =
+        cutlass::detail::check_alignment<kAlignElemsA>(cute::make_shape(M, K, L), StrideA{});
+    bool const b_aligned =
+        a_aligned &&
+        cutlass::detail::check_alignment<kAlignElemsB>(cute::make_shape(N, K, L), StrideB{});
+    /* Each mainloop iteration would issue 4 MMA instructions, so the MMA promotion interval
+     * should be a multiple of 4. */
+    bool const supported = b_aligned && (args.mma_promotion_interval % 4 == 0);
+
+    if (supported) {
+      return true;
+    }
+    CUTLASS_TRACE_HOST(
+        "  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for "
+        "TMA.\n");
+    return false;
+  }
+
+  static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;  // number of smem pipeline stages
+  static constexpr int K_PIPE_MMAS = 1;  // k-tiles issued by the mma() prologue / GMMA wait depth
+  static constexpr uint32_t TmaTransactionBytes =
+      (size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) *
+       static_cast<uint32_t>(sizeof_bits<ElementA>::value)) /
+          8 +
+      (size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) *
+       static_cast<uint32_t>(sizeof_bits<ElementB>::value)) /
+          8 +
+      (size<0>(SmemLayoutAux{}) * size<1>(SmemLayoutAux{}) *
+       static_cast<uint32_t>(sizeof_bits<ElementAux>::value)) /
+          8;  // bytes in one stage of A + B + Aux
+
+  /// Prefetch the A, B and Aux TMA descriptors -- ideally issued from a single thread
+  CUTLASS_DEVICE
+  static void prefetch_tma_descriptors(Params const& mainloop_params) {
+    cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor());
+    cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor());
+    cute::prefetch_tma_descriptor(mainloop_params.tma_load_aux.get_tma_descriptor());
+  }
+
+  /// Set up the data needed by this collective for load and mma.
+  /// Returns a tuple of tensors. The collective and the kernel layer have the contract that the
+  /// returned tuple contains at least two elements; this collective returns three:
+  /// gA_mkl - The tma tensor, A after a local tile so it has shape  (BLK_M,BLK_K,m,k,l)
+  /// gB_nkl - The tma tensor, B after a local tile so it has shape  (BLK_N,BLK_K,n,k,l)
+  /// gAux_xkl - The aux tma tensor, tiled like A when SwapAB and like B otherwise
+  template <class ProblemShape_MNKL>
+  CUTLASS_DEVICE auto load_init(ProblemShape_MNKL const& problem_shape_MNKL,
+                                Params const& mainloop_params) const {
+    using X = Underscore;
+    // Separate out problem shape for convenience
+    auto [M, N, K, L] = problem_shape_MNKL;
+
+    // TMA requires special handling of strides to deal with coord codomain mapping
+    // Represent the full tensors -- get these from TMA
+    Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(make_shape(M, K, L));  // (m,k,l)
+    Tensor mB_nkl = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N, K, L));  // (n,k,l)
+
+    // Make tiled views, defer the slice
+    Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_, _, _),
+                               Step<_1, X, _1>{});  // (BLK_M,BLK_K,m,k,l)
+    Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_, _, _),
+                               Step<X, _1, _1>{});  // (BLK_N,BLK_K,n,k,l)
+
+    if constexpr (SwapAB) {
+      Tensor mAux_xkl =
+          mainloop_params.tma_load_aux.get_tma_tensor(make_shape(M, K, L));  // (m,k,l)
+      Tensor gAux_xkl = local_tile(mAux_xkl, TileShape{}, make_coord(_, _, _),
+                                   Step<_1, X, _1>{});  // (BLK_M,BLK_K,m,k,l)
+      return cute::make_tuple(gA_mkl, gB_nkl, gAux_xkl);
+    } else {
+      Tensor mAux_xkl =
+          mainloop_params.tma_load_aux.get_tma_tensor(make_shape(N, K, L));  // (n,k,l)
+      Tensor gAux_xkl = local_tile(mAux_xkl, TileShape{}, make_coord(_, _, _),
+                                   Step<X, _1, _1>{});  // (BLK_N,BLK_K,n,k,l)
+      return cute::make_tuple(gA_mkl, gB_nkl, gAux_xkl);
+    }
+  }
+
+  /// Issue the TMA loads of A, B and Aux for each k-tile of this block's output tile
+  /// Producer Perspective
+  template <class TensorA, class TensorB, class TensorAux, class KTileIterator, class BlockCoord>
+  CUTLASS_DEVICE void load(Params const& mainloop_params, MainloopPipeline pipeline,
+                           PipelineState smem_pipe_write,
+                           cute::tuple<TensorA, TensorB, TensorAux> const& load_inputs,
+                           BlockCoord const& blk_coord, KTileIterator k_tile_iter, int k_tile_count,
+                           int thread_idx, uint32_t block_rank_in_cluster,
+                           TensorStorage& shared_tensors) {
+    int lane_predicate = cute::elect_one_sync();  // only the elected lane issues the loads
+
+    if (lane_predicate) {
+      Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()),
+                              SmemLayoutA{});  // (BLK_M,BLK_K,PIPE)
+      Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()),
+                              SmemLayoutB{});  // (BLK_N,BLK_K,PIPE)
+      Tensor sAux = make_tensor(make_smem_ptr(shared_tensors.smem_Aux.data()), SmemLayoutAux{});
+
+      //
+      // Prepare the TMA loads for A, B and Aux
+      //
+
+      constexpr uint32_t cluster_shape_x = get<0>(ClusterShape());
+      uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x,
+                                      block_rank_in_cluster / cluster_shape_x};
+
+      Tensor gA_mkl = get<0>(load_inputs);
+      Tensor gB_nkl = get<1>(load_inputs);
+      Tensor gAux_xkl = get<2>(load_inputs);
+
+      auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y);
+      auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x);
+      auto block_tma_aux = SwapAB
+                               ? mainloop_params.tma_load_aux.get_slice(cluster_local_block_id.y)
+                               : mainloop_params.tma_load_aux.get_slice(cluster_local_block_id.x);
+
+      // Partition the inputs based on the current block coordinates.
+      auto [m_coord, n_coord, k_coord, l_coord] = blk_coord;
+      Tensor gA = gA_mkl(_, _, m_coord, _, l_coord);  // (BLK_M,BLK_K,k)
+      Tensor gB = gB_nkl(_, _, n_coord, _, l_coord);  // (BLK_N,BLK_K,k)
+      Tensor gAux =
+          SwapAB ? gAux_xkl(_, _, m_coord, _, l_coord) : gAux_xkl(_, _, n_coord, _, l_coord);
+
+      // Applies the mapping from block_tma_a
+      Tensor tAgA = block_tma_a.partition_S(gA);  // (TMA,TMA_M,TMA_K,k)
+      Tensor tAsA = block_tma_a.partition_D(sA);  // (TMA,TMA_M,TMA_K,PIPE)
+
+      Tensor tBgB = block_tma_b.partition_S(gB);  // (TMA,TMA_N,TMA_K,k)
+      Tensor tBsB = block_tma_b.partition_D(sB);  // (TMA,TMA_N,TMA_K,PIPE)
+
+      Tensor tAuxgAux = block_tma_aux.partition_S(gAux);  // gmem side, partitioned like A/B above
+      Tensor tAuxsAux = block_tma_aux.partition_D(sAux);  // smem side, partitioned like A/B above
+
+      uint16_t mcast_mask_a = 0;
+      uint16_t mcast_mask_b = 0;
+      uint16_t mcast_mask_aux = 0;
+
+      // Issue TmaLoads
+      // Maps the tile -> block, value
+      if constexpr (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>) {
+        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};  // (m,n) -> block_id
+        for (int n = 0; n < size<1>(block_layout); ++n) {
+          mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x, n, Int<0>{}));
+        }
+      }
+
+      if constexpr (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>) {
+        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};  // (m,n) -> block_id
+        for (int m = 0; m < size<0>(block_layout); ++m) {
+          mcast_mask_b |= (uint16_t(1) << block_layout(m, cluster_local_block_id.y, Int<0>{}));
+        }
+      }
+
+      if constexpr (SwapAB) {
+        mcast_mask_aux = mcast_mask_a;  // Aux reuses A's multicast mask
+      } else {
+        mcast_mask_aux = mcast_mask_b;  // Aux reuses B's multicast mask
+      }
+
+      // Mainloop
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (; k_tile_count > 0; --k_tile_count) {
+        // LOCK smem_pipe_write for _writing_
+        pipeline.producer_acquire(smem_pipe_write);
+
+        //
+        // Copy gmem to smem for *k_tile_iter
+        //
+
+        using BarrierType = typename MainloopPipeline::ProducerBarrierType;
+        BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
+
+        int write_stage = smem_pipe_write.index();
+        copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a),
+             tAgA(_, _, _, *k_tile_iter), tAsA(_, _, _, write_stage));
+        copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b),
+             tBgB(_, _, _, *k_tile_iter), tBsB(_, _, _, write_stage));
+        copy(mainloop_params.tma_load_aux.with(*tma_barrier, mcast_mask_aux),
+             tAuxgAux(_, _, _, *k_tile_iter), tAuxsAux(_, _, _, write_stage));
+        ++k_tile_iter;
+
+        // Advance smem_pipe_write
+        ++smem_pipe_write;
+      }
+    }
+  }
+
+  /// Producer epilogue: prevents early exit of blocks in a Cluster
+  CUTLASS_DEVICE void load_tail(MainloopPipeline pipeline, PipelineState smem_pipe_write) {
+    // Only the elected lane issues the epilogue waits.
+    bool const is_elected_lane = cute::elect_one_sync();
+    if (!is_elected_lane) {
+      return;
+    }
+
+    /* Waits for every stage to be released (all Consumer
+     * UNLOCKs). A stage that was never used is simply
+     * acquired, since its phase is still inverted from
+     * make_producer_start_state.
+     */
+    pipeline.producer_tail(smem_pipe_write);
+  }
+
+  /// Perform a collective-scoped matrix multiply-accumulate
+  /// Consumer Perspective: accum0 gets A x B; accum1 gets Aux x B if SwapAB, else A x Aux
+  template <class FrgTensorC>
+  CUTLASS_DEVICE void mma(MainloopPipeline pipeline, PipelineState smem_pipe_read,
+                          FrgTensorC& accum0, FrgTensorC& accum1, int k_tile_count, int thread_idx,
+                          TensorStorage& shared_tensors, Params const& mainloop_params) {
+    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
+    static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
+    static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
+    static_assert(
+        cute::is_void_v<SmemCopyAtomA>,
+        "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+    static_assert(
+        cute::is_void_v<SmemCopyAtomB>,
+        "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+
+    Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()),
+                            SmemLayoutA{});  // (BLK_M,BLK_K,PIPE)
+    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()),
+                            SmemLayoutB{});  // (BLK_N,BLK_K,PIPE)
+    Tensor sAux = make_tensor(make_smem_ptr(shared_tensors.smem_Aux.data()), SmemLayoutAux{});
+
+    //
+    // Define C accumulators and A/B partitioning
+    //
+
+    TiledMma tiled_mma;
+    auto thread_mma = tiled_mma.get_thread_slice(thread_idx);
+
+    Tensor tCsA = thread_mma.partition_A(sA);  // (MMA,MMA_M,MMA_K,PIPE)
+    Tensor tCsB = thread_mma.partition_B(sB);  // (MMA,MMA_N,MMA_K,PIPE)
+
+    // Allocate "fragments/descriptors"
+    Tensor tCrA = thread_mma.make_fragment_A(tCsA);  // (MMA,MMA_M,MMA_K,PIPE)
+    Tensor tCrB = thread_mma.make_fragment_B(tCsB);  // (MMA,MMA_N,MMA_K,PIPE)
+
+    auto tCsAux = [&]() -> auto {
+      if constexpr (SwapAB) {
+        return thread_mma.partition_A(sAux);  // Aux takes the A operand slot
+      } else {
+        return thread_mma.partition_B(sAux);  // Aux takes the B operand slot
+      }
+    }();
+    auto tCrAux = [&]() -> auto {
+      if constexpr (SwapAB) {
+        return thread_mma.make_fragment_A(tCsAux);
+      } else {
+        return thread_mma.make_fragment_B(tCsAux);
+      }
+    }();
+
+    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum0));  // M
+    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum0));  // N
+    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));    // K
+    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));    // PIPE
+    if constexpr (SwapAB) {
+      CUTE_STATIC_ASSERT_V(size<1>(tCsAux) == size<1>(accum1));  // M
+      CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum1));    // N
+      CUTE_STATIC_ASSERT_V(size<2>(tCsB) == size<2>(tCsAux));    // K
+      CUTE_STATIC_ASSERT_V(size<3>(tCsB) == size<3>(tCsAux));    // PIPE
+    } else {
+      CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum1));    // M
+      CUTE_STATIC_ASSERT_V(size<1>(tCsAux) == size<2>(accum1));  // N
+      CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsAux));    // K
+      CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsAux));    // PIPE
+    }
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));    // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));    // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sAux));  // PIPE
+
+    //
+    // PIPELINED MAIN LOOP
+    //
+    static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS < K_PIPE_MAX),
+                  "ERROR : Incorrect number of MMAs in flight");
+
+    // We release buffers to producer warps(dma load) with some mmas in flight
+    PipelineState smem_pipe_release = smem_pipe_read;
+
+    // Prologue GMMAs
+    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
+
+    tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;  // set to One after the first k_block below
+
+    GmmaFP8Accumulation accumulation0(accum0, mainloop_params.mma_promotion_interval,
+                                      size<2>(tCrA));
+    GmmaFP8Accumulation accumulation1(accum1, mainloop_params.mma_promotion_interval,
+                                      size<2>(tCrA));
+    warpgroup_fence_operand(accumulation0());
+    warpgroup_fence_operand(accumulation1());
+    CUTLASS_PRAGMA_UNROLL
+    for (int k_tile_prologue = prologue_mma_count; k_tile_prologue > 0; --k_tile_prologue) {
+      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
+      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+
+      if (accumulation0.prepare_if_needed()) {  // NOTE(review): accumulation1 is not queried here
+        tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;  // shared by both gemm calls below
+      }
+
+      int read_stage = smem_pipe_read.index();
+      warpgroup_arrive();
+      // Unroll the K mode manually to set scale D to 1
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
+        // (V,M,K) x (V,N,K) => (V,M,N)
+        cute::gemm(tiled_mma, tCrA(_, _, k_block, read_stage), tCrB(_, _, k_block, read_stage),
+                   accumulation0());
+        if constexpr (SwapAB) {
+          cute::gemm(tiled_mma, tCrAux(_, _, k_block, read_stage), tCrB(_, _, k_block, read_stage),
+                     accumulation1());
+        } else {
+          cute::gemm(tiled_mma, tCrA(_, _, k_block, read_stage), tCrAux(_, _, k_block, read_stage),
+                     accumulation1());
+        }
+        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+      }
+      warpgroup_commit_batch();
+
+      accumulation0.promote_if_needed();
+      accumulation1.promote_if_needed();
+
+      ++smem_pipe_read;
+    }
+
+    warpgroup_fence_operand(accumulation0());
+    warpgroup_fence_operand(accumulation1());
+    // Mainloop GMMAs
+    k_tile_count -= prologue_mma_count;
+
+    CUTLASS_PRAGMA_NO_UNROLL
+    for (; k_tile_count > 0; --k_tile_count) {
+      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
+      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+
+      //
+      // Compute on k_tile
+      //
+
+      int read_stage = smem_pipe_read.index();
+
+      if (accumulation0.prepare_if_needed()) {  // NOTE(review): accumulation1 is not queried here
+        tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;  // shared by both gemm calls below
+      }
+
+      warpgroup_fence_operand(accumulation0());
+      warpgroup_fence_operand(accumulation1());
+      warpgroup_arrive();
+      // Unroll the K mode manually to set scale D to 1
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
+        // (V,M,K) x (V,N,K) => (V,M,N)
+        cute::gemm(tiled_mma, tCrA(_, _, k_block, read_stage), tCrB(_, _, k_block, read_stage),
+                   accumulation0());
+        if constexpr (SwapAB) {
+          cute::gemm(tiled_mma, tCrAux(_, _, k_block, read_stage), tCrB(_, _, k_block, read_stage),
+                     accumulation1());
+        } else {
+          cute::gemm(tiled_mma, tCrA(_, _, k_block, read_stage), tCrAux(_, _, k_block, read_stage),
+                     accumulation1());
+        }
+        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+      }
+      warpgroup_commit_batch();
+
+      /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write
+      /// is consumed
+      warpgroup_wait<K_PIPE_MMAS>();
+      warpgroup_fence_operand(accumulation0());
+      warpgroup_fence_operand(accumulation1());
+
+      accumulation0.promote_if_needed();
+      accumulation1.promote_if_needed();
+
+      pipeline.consumer_release(
+          smem_pipe_release);  // UNLOCK smem_pipe_release, done _computing_ on it
+
+      // Advance smem_pipe_read and smem_pipe_release
+      ++smem_pipe_read;
+      ++smem_pipe_release;
+    }
+
+    accumulation0.promote_residue_if_needed();
+    accumulation1.promote_residue_if_needed();
+
+    warpgroup_fence_operand(accumulation0());
+    warpgroup_fence_operand(accumulation1());
+  }
+
+  /// Perform a Consumer Epilogue to release all buffers
+  CUTLASS_DEVICE void mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release,
+                               int k_tile_count) {
+    // Stages of the prologue GMMAs are the ones the mma() mainloop never released.
+    int const unreleased = min(K_PIPE_MMAS, k_tile_count);
+
+    // Skip past the stages that the mma() mainloop already released.
+    smem_pipe_release.advance(k_tile_count - unreleased);
+
+    // Wait on all GMMAs to complete
+    warpgroup_wait<0>();
+
+    // UNLOCK the remaining stages, done _computing_ on them
+    for (int remaining = unreleased; remaining > 0; --remaining) {
+      pipeline.consumer_release(smem_pipe_release);
+      ++smem_pipe_release;
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_interleaved_tma_gmma_rs_warpspecialized_mixed_input.hpp b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_interleaved_tma_gmma_rs_warpspecialized_mixed_input.hpp
new file mode 100644
index 000000000..7432d90fa
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm90_mma_interleaved_tma_gmma_rs_warpspecialized_mixed_input.hpp
@@ -0,0 +1,1528 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "cute/algorithm/functional.hpp"
+#include "cute/algorithm/gemm.hpp"
+#include "cute/arch/cluster_sm90.hpp"
+#include "cute/arch/copy_sm90.hpp"
+#include "cute/atom/copy_traits_sm90_tma.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "cute/numeric/arithmetic_tuple.hpp"
+#include "cutlass/cutlass.h"
+#include "cutlass/detail/collective.hpp"
+#include "cutlass/detail/dependent_false.hpp"
+#include "cutlass/detail/layout.hpp"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/numeric_conversion.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/pipeline/pipeline.hpp"
+#include "cutlass/trace.h"
+#include "cutlass/transform/collective/sm90_wgmma_transpose.hpp"
+#include "cutlass_extensions/interleaved_numeric_conversion.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+using namespace cute;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// WarpSpecialized Mainloop that sources the A operand from registers
+template <int Stages, class ClusterShape, class KernelSchedule, class TileShape_,
+          class ElementAOptionalTuple, class StrideA_, class ElementBOptionalTuple, class StrideB_,
+          class TiledMma_, class GmemTiledCopyA_, class SmemLayoutAtomA_, class SmemCopyAtomA_,
+          class TransformA_, class GmemTiledCopyB_, class SmemLayoutAtomB_, class SmemCopyAtomB_,
+          class TransformB_>
+struct CollectiveMmaInterleaved<
+    MainloopSm90TmaGmmaRmemAWarpSpecializedMixedInput<Stages, ClusterShape, KernelSchedule>,
+    TileShape_, ElementAOptionalTuple, StrideA_, ElementBOptionalTuple, StrideB_, TiledMma_,
+    GmemTiledCopyA_, SmemLayoutAtomA_, SmemCopyAtomA_, TransformA_, GmemTiledCopyB_,
+    SmemLayoutAtomB_, SmemCopyAtomB_, TransformB_> {
+ private:
+  template <class PointerType>
+  static constexpr auto get_logical_ptr(PointerType const* ptr) {
+    if constexpr (cute::sizeof_bits_v<PointerType> >= 8) {
+      return ptr;  // element is at least one byte wide: hand back the raw pointer
+    } else {
+      return subbyte_iterator<PointerType const>(ptr);  // sub-byte element: wrap the pointer
+    }
+  }
+
+  template <class WeightType, class ActivationType, class TileShape>
+  static constexpr auto get_smem_interleave_layout() {
+    if constexpr (cute::sizeof_bits_v<WeightType> == 4 &&
+                  cute::sizeof_bits_v<ActivationType> == 8) {  // w4a8
+      return Layout<Shape<decltype(get<0>(TileShape{})), Shape<_4, _4, _2, _4>>,
+                    Stride<_128, Stride<_1, _8, _4, _32>>>{};
+    } else if constexpr (cute::sizeof_bits_v<WeightType> == 4 &&
+                         cute::sizeof_bits_v<ActivationType> == 16) {  // w4a16
+      return Layout<Shape<decltype(get<0>(TileShape{})), Shape<_2, _4, _4, _2>>,
+                    Stride<_64, Stride<_1, _8, _2, _32>>>{};
+    } else if constexpr (cute::sizeof_bits_v<WeightType> == 8 &&
+                         cute::sizeof_bits_v<ActivationType> == 16) {  // w8a16
+      return Layout<Shape<decltype(get<0>(TileShape{})), Shape<_2, _4, _2, _4>>,
+                    Stride<_64, Stride<_1, _4, _2, _16>>>{};
+    } else {  // any other bit-width pair is rejected at compile time
+      static_assert(dependent_false<WeightType, ActivationType>,
+                    "unsupported weight and activation, must be one of w4a8,w4a16,w8a16");
+    }
+  }
+
+  enum class ConversionMode { DirectConvert, ConvertAndScale, ConvertAndScaleWithZero };
+
+  using ScaleA = detail::deduce_mixed_width_dtype_t<1, ElementAOptionalTuple>;
+  using ScaleB = detail::deduce_mixed_width_dtype_t<1, ElementBOptionalTuple>;
+  using ZeroA = detail::deduce_mixed_width_dtype_t<2, ElementAOptionalTuple>;
+  using ZeroB = detail::deduce_mixed_width_dtype_t<2, ElementBOptionalTuple>;
+
+ public:
+  //
+  // Type Aliases
+  //
+  using DispatchPolicy =
+      MainloopSm90TmaGmmaRmemAWarpSpecializedMixedInput<Stages, ClusterShape, KernelSchedule>;
+  using TileShape = TileShape_;
+
+  static_assert(
+      cute::is_tuple<ElementAOptionalTuple>::value ^ cute::is_tuple<ElementBOptionalTuple>::value,
+      "Either A OR B must be a tuple. It must take the from {ElementOperand, [ElementScale],"
+      "[ElementZero]}. Inputs in [] are optional.");
+
+  using ElementA = detail::deduce_mixed_width_dtype_t<0, ElementAOptionalTuple>;
+  using ElementB = detail::deduce_mixed_width_dtype_t<0, ElementBOptionalTuple>;
+  static constexpr bool IsATransformed = cute::is_tuple<ElementAOptionalTuple>::value;
+  using ElementScale = cute::conditional_t<IsATransformed, ScaleA, ScaleB>;
+  using ElementZero = cute::conditional_t<IsATransformed, ZeroA, ZeroB>;
+  // For cases where we can't have a void type, we can use this to allow the code to compile when
+  // the scale / zero is void.
+  using NonVoidElementScale =
+      cute::conditional_t<cute::is_void_v<ElementScale>, float, ElementScale>;
+  using NonVoidElementZero = cute::conditional_t<cute::is_void_v<ElementZero>, float, ElementZero>;
+
+  using StrideA = StrideA_;
+  using StrideB = StrideB_;
+  // These are always MN major
+  using StrideScale = cute::Stride<cute::Int<1>, int64_t, int64_t>;
+  // For cases where we can't have a void scale, we can use this to allow the code to compile when
+  // the scale is void.
+  using NonVoidStrideScale = cute::conditional_t<cute::is_void_v<StrideScale>,
+                                                 cute::Stride<_1, int64_t, int64_t>, StrideScale>;
+
+  static_assert((IsATransformed && cutlass::gemm::detail::is_k_major<StrideA>()) ||
+                    (!IsATransformed && cutlass::gemm::detail::is_k_major<StrideB>()),
+                "The transformed type must be K-major.");
+
+  static_assert((IsATransformed && (sizeof(ElementB) == 2)) ||
+                    (!IsATransformed && (sizeof(ElementA) == 2)) ||
+                    (cutlass::gemm::detail::is_k_major<StrideA>() &&
+                     cutlass::gemm::detail::is_k_major<StrideB>()),
+                "The unscaled element must be 2 bytes OR both inputs must be K-major");
+
+  static_assert(cutlass::gemm::detail::is_mn_major<NonVoidStrideScale>(),
+                "Scale must be MN major [Col Major if A is scaled, Row Major if B is scaled].");
+
+  using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{}));
+
+  using TiledMma = TiledMma_;
+  using ElementAccumulator = typename TiledMma::ValTypeC;
+
+  using GmemTiledCopyA = GmemTiledCopyA_;
+  using GmemTiledCopyB = GmemTiledCopyB_;
+  using GmemTiledCopyScale = cute::SM90_TMA_LOAD;
+
+  using SmemLayoutAtomA = SmemLayoutAtomA_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  // Scale layout atom set after swapping.
+
+  using SmemCopyAtomA = SmemCopyAtomA_;
+  using SmemCopyAtomB = SmemCopyAtomB_;
+  using SmemCopyAtomScale = Copy_Atom<cute::DefaultCopy, NonVoidElementScale>;
+
+  // We must ensure the type to be scaled goes to RF
+  static constexpr bool SwapAB = !IsATransformed;
+  using InternalSmemLayoutAtomA = cute::conditional_t<!SwapAB, SmemLayoutAtomA, SmemLayoutAtomB>;
+  using InternalSmemLayoutAtomB = cute::conditional_t<!SwapAB, SmemLayoutAtomB, SmemLayoutAtomA>;
+  using InternalSmemCopyAtomA = cute::conditional_t<!SwapAB, SmemCopyAtomA, SmemCopyAtomB>;
+  using InternalSmemCopyAtomB = cute::conditional_t<!SwapAB, SmemCopyAtomB, SmemCopyAtomA>;
+  // TMA converts f32 input to tf32 when copying from GMEM to SMEM
+  // For all other types, cast to size equivalent uint type to avoid any rounding by TMA.
+  static constexpr bool ConvertF32toTF32A = cute::is_same_v<float, ElementA>;
+  static constexpr bool ConvertF32toTF32B = cute::is_same_v<float, ElementB>;
+  using ConvertedElementA =
+      cute::conditional_t<ConvertF32toTF32A, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementA>>>;
+  using ConvertedElementB =
+      cute::conditional_t<ConvertF32toTF32B, tfloat32_t, uint_bit_t<sizeof_bits_v<ElementB>>>;
+  using RealInternalElementA = cute::conditional_t<!SwapAB, ElementA, ElementB>;
+  using RealInternalElementB = cute::conditional_t<!SwapAB, ElementB, ElementA>;
+  using InternalElementA = cute::conditional_t<!SwapAB, ConvertedElementA, ConvertedElementB>;
+  using InternalElementB = cute::conditional_t<!SwapAB, ConvertedElementB, ConvertedElementA>;
+  using InternalStrideA = cute::conditional_t<!SwapAB, StrideA, StrideB>;
+  using InternalStrideB = cute::conditional_t<!SwapAB, StrideB, StrideA>;
+
+  using TransformA = TransformA_;
+  using TransformB = TransformB_;
+  using InternalTransformA = cute::conditional_t<!SwapAB, TransformA, TransformB>;
+  using InternalTransformB = cute::conditional_t<!SwapAB, TransformB, TransformA>;
+
+  static constexpr int IsSubbyteA = cute::sizeof_bits_v<InternalElementA> < 8;
+  using TmaElementA = cute::conditional_t<IsSubbyteA, uint8_t, InternalElementA>;
+
+  using ArchTag = typename DispatchPolicy::ArchTag;
+
+  using MainloopPipeline = cutlass::PipelineTmaAsync<DispatchPolicy::Stages>;
+  using PipelineState = cutlass::PipelineState<DispatchPolicy::Stages>;
+
+  using PipelineParams = typename MainloopPipeline::Params;
+
+  // One thread per CTA is a producer (1 for operand tile)
+  static constexpr int NumProducerThreadEvents = 1;
+
+  using SmemLayoutAtomScale =
+      Layout<Shape<decltype(cute::shape<0>(InternalSmemLayoutAtomA{})), cute::Int<1>>>;
+  using ScaleTileShape =
+      decltype(make_shape(shape<0>(TileShape{}), shape<1>(SmemLayoutAtomScale{})));
+  static constexpr int type_factor = sizeof_bits<ElementB>::value / sizeof_bits<ElementA>::value;
+
+  // Compile-time shape checks on the smem layout atoms. Each atom must be rank 2 and must
+  // evenly divide the tile extents it is tiled over: modes 0 and 2 of TileShape for A, modes 1
+  // and 2 for B.
+  static_assert(cute::rank(InternalSmemLayoutAtomA{}) == 2,
+                "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<0>(TileShape{}) % size<0>(InternalSmemLayoutAtomA{})) == 0,
+                "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(InternalSmemLayoutAtomA{})) == 0,
+                "SmemLayoutAtom must evenly divide tile shape.");
+
+  static_assert(cute::rank(InternalSmemLayoutAtomB{}) == 2,
+                "SmemLayoutAtom must be rank 2 (M/N, K)");
+  static_assert((size<1>(TileShape{}) % size<0>(InternalSmemLayoutAtomB{})) == 0,
+                "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(InternalSmemLayoutAtomB{})) == 0,
+                "SmemLayoutAtom must evenly divide tile shape.");
+
+  // The scale atom is checked against modes 0 and 2 of TileShape. The mode-0 condition is a
+  // divisibility test (not an equality test), so the diagnostic says exactly that.
+  static_assert(rank(SmemLayoutAtomScale{}) == 2, "SmemLayoutAtomScale must be rank 2");
+  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomScale{})) == 0,
+                "SmemLayoutAtomScale must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomScale{})) == 0,
+                "SmemLayoutAtomScale must evenly divide tile k shape.");
+
+  // Tile along modes in a way that maximizes the TMA box size.
+  using SmemLayoutA = decltype(tile_to_shape(
+      InternalSmemLayoutAtomA{},
+      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
+      cute::conditional_t<::cutlass::gemm::detail::is_major<0, InternalStrideA>(), Step<_2, _1, _3>,
+                          Step<_1, _2, _3>>{}));
+  // MMA-side views of A: SmemLayoutA's layout_a()/offset() composed with a different inner layout.
+  using Layout_Interleave = decltype(cute::composition(
+      SmemLayoutA{}.layout_a(), SmemLayoutA{}.offset(),
+      get_smem_interleave_layout<InternalElementA, InternalElementB, TileShape>()));
+  using SmemLayoutA_mma_interleave = decltype(tile_to_shape(
+      Layout_Interleave{},
+      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
+      cute::conditional_t<::cutlass::gemm::detail::is_major<0, InternalStrideA>(), Step<_2, _1, _3>,
+                          Step<_1, _2, _3>>{}));
+  using SmemLayoutA_mma = decltype(cute::composition(
+      SmemLayoutA{}.layout_a(), SmemLayoutA{}.offset(),
+      make_layout(
+          make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
+          make_stride(get<2>(TileShape{}), _1{}, get<0>(TileShape{}) * get<2>(TileShape{})))));
+  //  cute::conditional_t< ::cutlass::gemm::detail::is_major<0,InternalStrideA>(),
+  //                       Stride<_1, cute::Int<shape<0>(TileShape{})>,
+  //                       cute::Int<get<0>(TileShape{}) * get<2>(TileShape{})>>,
+  //                       Stride<cute::Int<shape<2>(TileShape{})>, _1,
+  //                       cute::Int<get<0>(TileShape{}) * get<2>(TileShape{})>>>{})));
+
+  using SmemLayoutB = decltype(tile_to_shape(
+      InternalSmemLayoutAtomB{},
+      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
+      cute::conditional_t<::cutlass::gemm::detail::is_major<0, InternalStrideB>(), Step<_2, _1, _3>,
+                          Step<_1, _2, _3>>{}));
+
+  // The scales and zero-points are assumed to share this one smem layout (one slice per stage).
+  using SmemLayoutScale = decltype(tile_to_shape(
+      SmemLayoutAtomScale{},
+      make_shape(shape<0>(ScaleTileShape{}), shape<1>(ScaleTileShape{}), Int<Stages>{}),
+      cute::conditional_t<::cutlass::gemm::detail::is_major<0, NonVoidStrideScale>(),
+                          Step<_2, _1, _3>, Step<_1, _2, _3>>{}));
+
+  static_assert(DispatchPolicy::Stages >= 2,
+                "Specialization requires Stages set to value 2 or more.");
+  static_assert(
+      not cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
+          cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
+      "MMA atom must source A from rmem and B operand from smem_desc for this mainloop.");
+  static_assert(cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD> ||
+                    cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>,
+                "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
+  static_assert(cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD> ||
+                    cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>,
+                "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
+
+  // To relax this, we need to handle loading more than 1 row of scales for every main loop
+  // iteration. We must also handle updating the pipeline transaction bytes on the fly. NOTE:
+  // Deleting this assertion without the required changes will cause the code to hang.
+  static_assert(size<1>(SmemLayoutAtomScale{}) == 1, "size<1>(SmemLayoutAtomScale) must be 1.");
+
+ private:
+  /// Pick the conversion mode from which optional element types were supplied:
+  ///  - ElementScale is void                -> DirectConvert
+  ///  - ElementScale set, ElementZero void  -> ConvertAndScale
+  ///  - both set                            -> ConvertAndScaleWithZero
+  static constexpr ConversionMode get_conversion_mode() {
+    constexpr bool has_scale = !cute::is_void_v<ElementScale>;
+    constexpr bool has_zero = !cute::is_void_v<ElementZero>;
+    if (!has_scale) {
+      return ConversionMode::DirectConvert;
+    }
+    return has_zero ? ConversionMode::ConvertAndScaleWithZero : ConversionMode::ConvertAndScale;
+  }
+
+  static constexpr ConversionMode KernelConversionMode = get_conversion_mode();  // compile-time
+  static constexpr bool ModeHasScales =  // true for both scale-carrying modes
+      KernelConversionMode == ConversionMode::ConvertAndScale ||
+      KernelConversionMode == ConversionMode::ConvertAndScaleWithZero;
+
+  /// Number of scale elements to reserve in shared storage: the cosize of SmemLayoutScale when
+  /// the mode carries scales, and 0 for DirectConvert.
+  static constexpr auto elements_per_smem_scale() {
+    if constexpr (ModeHasScales) {
+      return cute::cosize_v<SmemLayoutScale>;
+    } else if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      return 0;
+    } else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                    "Type not handled in scale smem allocation.");
+    }
+  }
+
+  /// Number of zero-point elements to reserve in shared storage: the cosize of SmemLayoutScale
+  /// for ConvertAndScaleWithZero, and 0 for the two modes without zero-points.
+  static constexpr auto elements_per_smem_zero() {
+    if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+      return cute::cosize_v<SmemLayoutScale>;
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale ||
+                         KernelConversionMode == ConversionMode::DirectConvert) {
+      return 0;
+    } else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                    "Type not handled in scale smem allocation.");
+    }
+  }
+
+  // These methods use some of the public members of the class. For that reason, we define them
+  // after the public section.
+  /// Bytes delivered per pipeline stage by the A-side TMA loads: one stage of A, plus one stage
+  /// of scales (and one of zero-points) when the conversion mode carries them. Scale and zero
+  /// stages are required to be a multiple of 128 bytes.
+  static constexpr uint32_t compute_tma_transaction_bytes_mk() {
+    // One stage of the A operand.
+    constexpr uint32_t a_stage_bytes =
+        cutlass::bits_to_bytes(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) *
+                               static_cast<uint32_t>(cute::sizeof_bits_v<InternalElementA>));
+
+    if constexpr (ModeHasScales) {
+      // One stage of scales.
+      constexpr uint32_t scale_stage_bytes =
+          cutlass::bits_to_bytes(size<0>(SmemLayoutScale{}) * size<1>(SmemLayoutScale{}) *
+                                 static_cast<uint32_t>(cute::sizeof_bits_v<ElementScale>));
+      static_assert(scale_stage_bytes % 128 == 0,
+                    "Each scale stage must be 128B aligned.");  // required by TMA
+
+      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+        // Zero-points reuse the scale smem layout, so only the element width differs.
+        constexpr uint32_t zero_stage_bytes =
+            cutlass::bits_to_bytes(size<0>(SmemLayoutScale{}) * size<1>(SmemLayoutScale{}) *
+                                   static_cast<uint32_t>(cute::sizeof_bits_v<ElementZero>));
+        static_assert(zero_stage_bytes % 128 == 0,
+                      "Each zero stage must be 128B aligned.");  // required by TMA
+        return a_stage_bytes + scale_stage_bytes + zero_stage_bytes;
+      } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+        return a_stage_bytes + scale_stage_bytes;
+      } else {
+        static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                      "Type not handled in tma transaction bytes computation.");
+      }
+    } else if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      return a_stage_bytes;
+    } else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                    "Type not handled in tma transaction bytes computation.");
+    }
+  }
+
+  /// Bytes delivered per pipeline stage by the B-side TMA load: one stage of SmemLayoutB at the
+  /// bit width of InternalElementB.
+  static constexpr uint32_t compute_tma_transaction_bytes_nk() {
+    constexpr uint32_t b_stage_bytes =
+        cutlass::bits_to_bytes(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) *
+                               static_cast<uint32_t>(cute::sizeof_bits_v<InternalElementB>));
+    return b_stage_bytes;
+  }
+
+ public:
+  static constexpr size_t SmemAlignmentA = cutlass::detail::alignment_for_swizzle(SmemLayoutA{});
+  // Each operand's alignment is derived from its own smem layout by alignment_for_swizzle.
+  static constexpr size_t SmemAlignmentB = cutlass::detail::alignment_for_swizzle(SmemLayoutB{});
+
+  // Scales reuse the larger of the A and B alignments; the assert below keeps both >= 128B.
+  static constexpr size_t SmemAlignmentScale = cute::max(SmemAlignmentA, SmemAlignmentB);
+
+  static_assert(SmemAlignmentA >= 128 and SmemAlignmentB >= 128, "Require at least 128B alignment");
+
+  struct SharedStorage {  // per-CTA storage: operand staging buffers plus pipeline state
+    static constexpr int scale_elements = elements_per_smem_scale();  // 0 in DirectConvert mode
+    static constexpr int zero_elements = elements_per_smem_zero();  // 0 unless zero-points used
+    // Staging buffers, aligned to the larger of the A and B alignments. Member order is layout.
+    struct TensorStorage : cute::aligned_struct<cute::max(SmemAlignmentA, SmemAlignmentB)> {
+      cute::ArrayEngine<RealInternalElementA, cute::cosize_v<SmemLayoutA>> smem_A;
+      cute::ArrayEngine<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>> smem_B;
+      cute::ArrayEngine<NonVoidElementScale, scale_elements> smem_scale;
+      cute::ArrayEngine<NonVoidElementZero, zero_elements> smem_zero;
+    } tensors;
+
+    using PipelineStorage = typename MainloopPipeline::SharedStorage;
+    PipelineStorage pipeline;
+  };
+  // Re-export the nested storage types at class scope.
+  using TensorStorage = typename SharedStorage::TensorStorage;
+  using PipelineStorage = typename SharedStorage::PipelineStorage;
+
+  // Host side kernel arguments (in the caller's A/B order; swapped internally when SwapAB)
+  struct Arguments {
+    ElementA const* ptr_A = nullptr;
+    StrideA dA{};
+    ElementB const* ptr_B = nullptr;
+    StrideB dB{};
+    ElementScale const* ptr_S = nullptr;  // scales: null for DirectConvert, required otherwise
+    NonVoidStrideScale dS{};              // stride used for both the scale and zero tensors
+    int group_size = 0;  // K elements per scale; can_implement rejects 0 when scales are used
+    ElementZero const* ptr_Z = nullptr;  // zero-points: only for ConvertAndScaleWithZero
+    uint32_t mma_promotion_interval = 4;  // NOTE(review): not read when building Params
+  };
+
+  // Device side kernel params
+  struct Params {
+   private:
+    using Outer =  // the enclosing collective, named so its static get_logical_ptr can be used
+        CollectiveMmaInterleaved<DispatchPolicy, TileShape_, ElementAOptionalTuple, StrideA_,
+                                 ElementBOptionalTuple, StrideB_, TiledMma_, GmemTiledCopyA_,
+                                 SmemLayoutAtomA_, SmemCopyAtomA_, TransformA_, GmemTiledCopyB_,
+                                 SmemLayoutAtomB_, SmemCopyAtomB_, TransformB_>;
+
+   public:
+    // Assumption: StrideA is congruent with Problem_MK
+    using TMA_A = decltype(make_tma_copy<TmaElementA>(
+        GmemTiledCopyA{},
+        make_tensor(Outer::get_logical_ptr(static_cast<InternalElementA const*>(nullptr)),
+                    repeat_like(InternalStrideA{}, int32_t(0)), InternalStrideA{}),
+        SmemLayoutA{}(_, _, cute::Int<0>{}),
+        make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
+        size<1>(ClusterShape{})));  // mcast along N mode for this M load, if any
+    // Copy type for the scale tensor; built over one stage of SmemLayoutScale, multicast size 1.
+    using TMA_Scale = decltype(make_tma_copy(
+        GmemTiledCopyScale{},
+        make_tensor(Outer::get_logical_ptr(static_cast<NonVoidElementScale const*>(nullptr)),
+                    repeat_like(NonVoidStrideScale{}, int32_t(0)), NonVoidStrideScale{}),
+        SmemLayoutScale{}(_, _, cute::Int<0>{}), ScaleTileShape{},
+        _1{}));  // mcast along N mode for this M load, if any. Scale is ALWAYS loaded with A for RF
+                 // kernel
+    // Copy type for the zero-point tensor; same copy op, stride and smem layout as the scales.
+    using TMA_Zero = decltype(make_tma_copy(
+        GmemTiledCopyScale{},
+        make_tensor(Outer::get_logical_ptr(static_cast<NonVoidElementZero const*>(nullptr)),
+                    repeat_like(NonVoidStrideScale{}, int32_t(0)), NonVoidStrideScale{}),
+        SmemLayoutScale{}(_, _, cute::Int<0>{}), ScaleTileShape{},
+        _1{}));  // mcast along N mode for this M load, if any. Scale is ALWAYS loaded with A for RF
+                 // kernel
+
+    // Assumption: StrideB is congruent with Problem_NK
+    using TMA_B = decltype(make_tma_copy(
+        GmemTiledCopyB{},
+        make_tensor(Outer::get_logical_ptr(static_cast<InternalElementB const*>(nullptr)),
+                    repeat_like(InternalStrideB{}, int32_t(0)), InternalStrideB{}),
+        SmemLayoutB{}(_, _, cute::Int<0>{}),
+        make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})),
+        size<0>(ClusterShape{})));  // mcast along M mode for this N load, if any
+    TMA_A tma_load_a;
+    TMA_B tma_load_b;
+    TMA_Scale tma_load_scale;  // left default-constructed in DirectConvert mode
+    TMA_Zero tma_load_zero;    // left default-constructed unless ConvertAndScaleWithZero
+    int64_t scale_k;           // ceil(K / group_size); 0 in DirectConvert mode
+    int group_size;            // copied from Arguments; 0 in DirectConvert mode
+    uint32_t tma_transaction_bytes = TmaTransactionBytes;
+    uint32_t tma_transaction_bytes_mk = TmaTransactionBytesMK;
+    uint32_t tma_transaction_bytes_nk = TmaTransactionBytesNK;
+  };
+
+  //
+  // Methods
+  //
+
+  template <class ProblemShape>  // Converts host-side Arguments into device-side Params.
+  static constexpr Params to_underlying_arguments(ProblemShape const& problem_shape,
+                                                  Arguments const& args, void* workspace) {
+    (void)workspace;  // unused by this collective
+    // Builds the TMA copy objects for A and B, plus scale / zero when the mode uses them.
+    // Optionally append 1s until problem shape is rank-4 (MNKL), in case it is only rank-3 (MNK)
+    auto problem_shape_MNKL = append<4>(problem_shape, 1);
+    auto [M, N, K, L] = problem_shape_MNKL;
+    // With SwapAB the internal A operand is the caller's B, so M and N trade places as well.
+    if constexpr (SwapAB) {
+      M = get<1>(problem_shape_MNKL);
+      N = get<0>(problem_shape_MNKL);
+    }
+
+    InternalElementA const* ptr_A;
+    InternalStrideA dA;
+    InternalElementB const* ptr_B;
+    InternalStrideB dB;
+    // Route the caller's pointers and strides to the internal A/B roles.
+    if constexpr (not SwapAB) {
+      ptr_A = reinterpret_cast<InternalElementA const*>(args.ptr_A);
+      ptr_B = reinterpret_cast<InternalElementB const*>(args.ptr_B);
+      dA = args.dA;
+      dB = args.dB;
+    } else {
+      ptr_A = reinterpret_cast<InternalElementA const*>(args.ptr_B);
+      ptr_B = reinterpret_cast<InternalElementB const*>(args.ptr_A);
+      dA = args.dB;
+      dB = args.dA;
+    }
+
+    Tensor tensor_a = make_tensor(get_logical_ptr(ptr_A), make_layout(make_shape(M, K, L), dA));
+    Tensor tensor_b = make_tensor(get_logical_ptr(ptr_B), make_layout(make_shape(N, K, L), dB));
+    typename Params::TMA_A tma_load_a = make_tma_copy<TmaElementA>(
+        GmemTiledCopyA{}, tensor_a, SmemLayoutA{}(_, _, cute::Int<0>{}),
+        make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
+        size<1>(ClusterShape{}));  // mcast along N mode for this M load, if any
+
+    typename Params::TMA_B tma_load_b =
+        make_tma_copy(GmemTiledCopyB{}, tensor_b, SmemLayoutB{}(_, _, cute::Int<0>{}),
+                      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})),
+                      size<0>(ClusterShape{}));  // mcast along M mode for this N load, if any
+    // Scale / zero copies stay default-constructed unless the conversion mode needs them.
+    typename Params::TMA_Scale tma_load_scale;
+    typename Params::TMA_Zero tma_load_zero;
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      return {tma_load_a, tma_load_b,          tma_load_scale,        tma_load_zero,        0,
+              0,          TmaTransactionBytes, TmaTransactionBytesMK, TmaTransactionBytesNK};
+    } else if constexpr (ModeHasScales) {
+      auto scale_k = (K + args.group_size - 1) / args.group_size;  // ceil-div; group_size != 0
+      ElementScale const* ptr_S = args.ptr_S;
+      StrideScale dS = args.dS;
+      Tensor tensor_scale =
+          make_tensor(get_logical_ptr(ptr_S), make_layout(make_shape(M, scale_k, L), dS));
+      tma_load_scale =
+          make_tma_copy(GmemTiledCopyScale{}, tensor_scale, SmemLayoutScale{}(_, _, cute::Int<0>{}),
+                        ScaleTileShape{}, _1{});  // mcast along N mode for this M load, if any
+
+      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+        return {tma_load_a,
+                tma_load_b,
+                tma_load_scale,
+                tma_load_zero,
+                scale_k,
+                args.group_size,
+                TmaTransactionBytes,
+                TmaTransactionBytesMK,
+                TmaTransactionBytesNK};
+      } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+        Tensor tensor_zero =  // zero-points use the same shape and stride (dS) as the scales
+            make_tensor(get_logical_ptr(args.ptr_Z), make_layout(make_shape(M, scale_k, L), dS));
+        tma_load_zero = make_tma_copy(GmemTiledCopyScale{}, tensor_zero,
+                                      SmemLayoutScale{}(_, _, cute::Int<0>{}), ScaleTileShape{},
+                                      _1{});  // mcast along N mode for this M load, if any
+        return {tma_load_a,
+                tma_load_b,
+                tma_load_scale,
+                tma_load_zero,
+                scale_k,
+                args.group_size,
+                TmaTransactionBytes,
+                TmaTransactionBytesMK,
+                TmaTransactionBytesNK};
+      } else {
+        static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                      "Conversion mode not handled in to_underlying_arguments.");
+      }
+    } else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                    "Conversion mode not handled in to_underlying_arguments.");
+    }
+  }
+
+  /// Host-side check that this collective can run the given problem.
+  ///
+  /// Always checks 128-bit TMA alignment of A (shape M,K,L) and B (shape N,K,L). Then, by
+  /// conversion mode:
+  ///  - DirectConvert: ptr_S and ptr_Z must both be null.
+  ///  - ConvertAndScale / ConvertAndScaleWithZero: ptr_S must be set; group_size must be
+  ///    non-zero and either equal to K or a multiple of the tile's K extent; the scale tensor
+  ///    must be TMA-aligned. ptr_Z must be null for ConvertAndScale, and set (with an aligned
+  ///    zero tensor) for ConvertAndScaleWithZero.
+  ///
+  /// Returns true when every check passes; otherwise emits a host trace message and returns
+  /// false.
+  template <class ProblemShape>
+  static bool can_implement(ProblemShape const& problem_shape,
+                            [[maybe_unused]] Arguments const& args) {
+    constexpr int tma_alignment_bits = 128;
+    auto problem_shape_MNKL = append<4>(problem_shape, 1);
+    auto [M, N, K, L] = problem_shape_MNKL;
+
+    bool implementable = true;
+    constexpr int min_tma_aligned_elements_A =
+        tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
+    implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_A>(
+                                         cute::make_shape(M, K, L), StrideA{});
+    constexpr int min_tma_aligned_elements_B =
+        tma_alignment_bits / cutlass::sizeof_bits<ElementB>::value;
+    implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_B>(
+                                         cute::make_shape(N, K, L), StrideB{});
+
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      implementable = implementable && (args.ptr_S == nullptr);
+      implementable = implementable && (args.ptr_Z == nullptr);
+    } else if constexpr (ModeHasScales) {
+      int const scale_mn = SwapAB ? N : M;
+      // Arguments::group_size defaults to 0, so the ceil-div must not run before the zero check:
+      // dividing by zero here would be undefined behaviour instead of a clean "false".
+      bool const group_size_is_nonzero = args.group_size != 0;
+      int const scale_k =
+          group_size_is_nonzero ? (K + args.group_size - 1) / args.group_size : 0;
+      constexpr int min_tma_aligned_elements_scale =
+          tma_alignment_bits / cutlass::sizeof_bits<ElementScale>::value;
+      implementable =
+          implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_scale>(
+                               cute::make_shape(scale_mn, scale_k, L), StrideScale{});
+      implementable = implementable &&
+                      (args.group_size == K || ((args.group_size % size<2>(TileShape{})) == 0));
+      implementable = implementable && group_size_is_nonzero;
+      implementable = implementable && (args.ptr_S != nullptr);
+
+      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+        implementable = implementable && (args.ptr_Z == nullptr);
+      } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+        constexpr int min_tma_aligned_elements_zero =
+            tma_alignment_bits / cutlass::sizeof_bits<ElementZero>::value;
+        implementable =
+            implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_zero>(
+                                 cute::make_shape(scale_mn, scale_k, L), StrideScale{});
+        implementable = implementable && (args.ptr_Z != nullptr);
+      } else {
+        static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                      "Conversion mode not handled in can_implement.");
+      }
+    } else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                    "Conversion mode not handled in can_implement.");
+    }
+
+    if (!implementable) {
+      CUTLASS_TRACE_HOST(
+          "  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for "
+          "TMA.\n");
+    }
+    return implementable;
+  }
+
+  static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;  // same value as the pipeline depth
+  static constexpr uint32_t TmaTransactionBytesMK = compute_tma_transaction_bytes_mk();  // A side
+  static constexpr uint32_t TmaTransactionBytesNK = compute_tma_transaction_bytes_nk();  // B side
+  static constexpr uint32_t TmaTransactionBytes = TmaTransactionBytesMK + TmaTransactionBytesNK;
+
+  /// Prefetch every TMA descriptor this collective uses: A and B always, then the scale
+  /// descriptor when the mode has scales, then the zero descriptor for ConvertAndScaleWithZero.
+  /// Ideally issued from a single thread for best performance.
+  CUTLASS_DEVICE
+  static void prefetch_tma_descriptors(Params const& mainloop_params) {
+    auto const& params = mainloop_params;
+
+    cute::prefetch_tma_descriptor(params.tma_load_a.get_tma_descriptor());
+    cute::prefetch_tma_descriptor(params.tma_load_b.get_tma_descriptor());
+
+    if constexpr (ModeHasScales) {
+      cute::prefetch_tma_descriptor(params.tma_load_scale.get_tma_descriptor());
+      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+        cute::prefetch_tma_descriptor(params.tma_load_zero.get_tma_descriptor());
+      }
+    } else if constexpr (KernelConversionMode != ConversionMode::DirectConvert) {
+      // Neither a scale mode nor DirectConvert: refuse to compile.
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                    "Conversion mode not handled in TMA prefetch.");
+    }
+  }
+
+  /// Set up the data needed by this collective for load and mma.
+  /// Returns a tuple of tensors. The collective and the kernel layer have the contract that the
+  /// returned tuple must contain at least two elements, with the first two elements being:
+  /// gA_mkl - The tma tensor, A after a local tile so it has shape  (BLK_M,BLK_K,m,k,l)
+  /// gB_nkl - The tma tensor, B after a local tile so it has shape  (BLK_N,BLK_K,n,k,l)
+  /// Scale modes append gS_mkl; ConvertAndScaleWithZero additionally appends gZ_mkl.
+  template <class ProblemShape_MNKL>
+  CUTLASS_DEVICE auto load_init(ProblemShape_MNKL const& problem_shape_MNKL,
+                                Params const& mainloop_params) const {
+    using X = Underscore;
+    // Separate out problem shape for convenience
+    auto [M, N, K, L] = problem_shape_MNKL;
+
+    // TMA requires special handling of strides to deal with coord codomain mapping
+    // Represent the full tensors -- get these from TMA
+    Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(make_shape(M, K, L));  // (m,k,l)
+    Tensor mB_nkl = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N, K, L));  // (n,k,l)
+
+    // Make tiled views, defer the slice
+    Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_, _, _),
+                               Step<_1, X, _1>{});  // (BLK_M,BLK_K,m,k,l)
+    Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_, _, _),
+                               Step<X, _1, _1>{});  // (BLK_N,BLK_K,n,k,l)
+
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      return cute::make_tuple(gA_mkl, gB_nkl);
+    } else if constexpr (ModeHasScales) {
+      auto scale_k = mainloop_params.scale_k;  // number of scale groups along K, set on the host
+      Tensor mS_mkl = mainloop_params.tma_load_scale.get_tma_tensor(
+          make_shape(M, scale_k, L));  // (m,scale_k,l)
+      Tensor gS_mkl = local_tile(mS_mkl, ScaleTileShape{},
+                                 make_coord(_, _));  // (BLK_M,BLK_Scale_K,m,scale_k,l)
+      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+        return cute::make_tuple(gA_mkl, gB_nkl, gS_mkl);
+      } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+        Tensor mZ_mkl = mainloop_params.tma_load_zero.get_tma_tensor(
+            make_shape(M, scale_k, L));  // (m,scale_k,l)
+        Tensor gZ_mkl = local_tile(mZ_mkl, ScaleTileShape{},
+                                   make_coord(_, _));  // (BLK_M,BLK_Scale_K,m,scale_k,l)
+        return cute::make_tuple(gA_mkl, gB_nkl, gS_mkl, gZ_mkl);
+      } else {
+        static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                      "Conversion mode not handled in load_init.");
+      }
+    } else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                    "Conversion mode not handled in load_init.");
+    }
+  }
+
+  /// Producer side of the mainloop: issues the TMA loads that fill the smem pipeline stages.
+  /// Handles every conversion mode: A and B are always loaded, and the scale (and zero) tiles
+  /// are loaded with them when the mode has scales. load_inputs is the tuple from load_init.
+  template <class... Ts, class KTileIterator, class BlockCoord>
+  CUTLASS_DEVICE void load(Params const& mainloop_params, MainloopPipeline pipeline,
+                           PipelineState smem_pipe_write, cute::tuple<Ts...> const& load_inputs,
+                           BlockCoord const& blk_coord, KTileIterator k_tile_iter, int k_tile_count,
+                           int thread_idx, uint32_t block_rank_in_cluster,  // thread_idx is unused
+                           TensorStorage& shared_tensors) {
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      static_assert(sizeof...(Ts) == 2, "Direct convert needs two inputs");
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+      static_assert(sizeof...(Ts) == 3, "Scaled convert needs three inputs");
+    } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+      static_assert(sizeof...(Ts) == 4, "Scaled and zero convert needs four inputs");
+    } else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                    "Conversion mode not handled in TMA load.");
+    }
+    // Only the lane chosen by elect_one_sync() issues the loads below.
+    int lane_predicate = cute::elect_one_sync();
+
+    if (lane_predicate) {
+      Tensor sA_ = make_tensor(make_smem_ptr(shared_tensors.smem_A.begin()),
+                               SmemLayoutA{});  // (BLK_M,BLK_K,PIPE)
+      Tensor sB_ = make_tensor(make_smem_ptr(shared_tensors.smem_B.begin()),
+                               SmemLayoutB{});                  // (BLK_N,BLK_K,PIPE)
+      Tensor sA = as_position_independent_swizzle_tensor(sA_);  // (BLK_M,BLK_K,PIPE)
+      Tensor sB = as_position_independent_swizzle_tensor(sB_);  // (BLK_N,BLK_K,PIPE)
+
+      //
+      // Prepare the TMA loads for A, B and Scales
+      //
+      // Split the linear block rank into (x, y) coordinates within the cluster.
+      constexpr uint32_t cluster_shape_x = get<0>(ClusterShape());
+      uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x,
+                                      block_rank_in_cluster / cluster_shape_x};
+
+      Tensor gA_mkl = get<0>(load_inputs);
+      Tensor gB_nkl = get<1>(load_inputs);
+
+      auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y);
+      auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x);
+
+      // Partition the inputs based on the current block coordinates.
+      auto [m_coord, n_coord, k_coord, l_coord] = blk_coord;
+      Tensor gA = gA_mkl(_, _, m_coord, _, l_coord);  // (BLK_M,BLK_K,k)
+      Tensor gB = gB_nkl(_, _, n_coord, _, l_coord);  // (BLK_N,BLK_K,k)
+
+      // Applies the mapping from block_tma_a
+      Tensor tAgA = block_tma_a.partition_S(gA);  // (TMA,TMA_M,TMA_K,k)
+      Tensor tAsA = block_tma_a.partition_D(sA);  // (TMA,TMA_M,TMA_K,PIPE)
+
+      Tensor tBgB = block_tma_b.partition_S(gB);  // (TMA,TMA_N,TMA_K,k)
+      Tensor tBsB = block_tma_b.partition_D(sB);  // (TMA,TMA_N,TMA_K,PIPE)
+      // Multicast masks hold one bit per cluster block id; mcast_mask_s is left at 0.
+      uint16_t mcast_mask_a = 0;
+      uint16_t mcast_mask_b = 0;
+      uint16_t mcast_mask_s = 0;
+
+      // Issue TmaLoads
+      // Maps the tile -> block, value
+      if constexpr (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>) {
+        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};  // (m,n) -> block_id
+        for (int n = 0; n < size<1>(block_layout); ++n) {
+          mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x, n, Int<0>{}));
+        }
+      }
+
+      if constexpr (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>) {
+        auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};  // (m,n) -> block_id
+        for (int m = 0; m < size<0>(block_layout); ++m) {
+          mcast_mask_b |= (uint16_t(1) << block_layout(m, cluster_local_block_id.y, Int<0>{}));
+        }
+      }
+      // Source/destination partitions for scales and zeros, read below via get<0..3>.
+      auto extra_input_partitions = partition_extra_tma_inputs(
+          mainloop_params, load_inputs, shared_tensors, cluster_local_block_id, m_coord, l_coord);
+
+      // Mainloop
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (; k_tile_count > 0; --k_tile_count) {
+        // LOCK smem_pipe_write for _writing_
+        pipeline.producer_acquire(smem_pipe_write);
+
+        //
+        // Copy gmem to smem for *k_tile_iter
+        //
+
+        using BarrierType = typename MainloopPipeline::ProducerBarrierType;
+        BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
+        // Every copy of this iteration targets the same stage and the same barrier.
+        int write_stage = smem_pipe_write.index();
+        copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a),
+             tAgA(_, _, _, *k_tile_iter), tAsA(_, _, _, write_stage));
+        copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b),
+             tBgB(_, _, _, *k_tile_iter), tBsB(_, _, _, write_stage));
+
+        if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+          // Nothing extra to do.
+        } else if constexpr (ModeHasScales) {
+          auto tSgS = get<0>(extra_input_partitions);
+          auto tSsS = get<1>(extra_input_partitions);
+
+          // Temporary factor which will determine which k tile to reload from gmem. Needed so we
+          // don't modify tma transaction bytes on the fly. We must do a ceiling divide here to
+          // correctly handle the group_size == K case. In that case, we don't require that K is a
+          // multiple of the threadblock tile K
+          int const ReloadFactor =
+              (mainloop_params.group_size + size<2>(TileShape{}) - 1) / size<2>(TileShape{});
+          int const scale_load_k =
+              *k_tile_iter / ReloadFactor;  // This will always be 0 when group_size == K.
+          copy(mainloop_params.tma_load_scale.with(*tma_barrier, mcast_mask_s),
+               tSgS(_, _, _, scale_load_k), tSsS(_, _, _, write_stage));
+
+          if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+            // Nothing extra to do
+          } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+            auto tZgZ = get<2>(extra_input_partitions);
+            auto tZsZ = get<3>(extra_input_partitions);
+            copy(mainloop_params.tma_load_zero.with(*tma_barrier, mcast_mask_s),
+                 tZgZ(_, _, _, scale_load_k), tZsZ(_, _, _, write_stage));
+          } else {
+            static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                          "Conversion mode not handled for TMA copy op.");
+          }
+        } else {
+          static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                        "Conversion mode not handled for TMA copy op.");
+        }
+
+        ++k_tile_iter;
+
+        // Advance smem_pipe_write
+        ++smem_pipe_write;
+      }
+    }
+  }
+
+  /// Producer epilogue: waits on the pipeline so that blocks in a cluster do not exit early.
+  CUTLASS_DEVICE void load_tail(MainloopPipeline pipeline, PipelineState smem_pipe_write) {
+    // Only the lane chosen by elect_one_sync() issues the epilogue waits.
+    if (int const is_elected_lane = cute::elect_one_sync(); is_elected_lane) {
+      // producer_tail waits for every stage to be released (all consumer UNLOCKs). A stage
+      // that was never used is simply acquired, because its phase is still inverted from
+      // make_producer_start_state.
+      pipeline.producer_tail(smem_pipe_write);
+    }
+  }
+
+  /// Returns the layout for a mixed-input weight/activation pair, selected purely by the bit
+  /// widths of the two types. Supported pairs are w4a8, w4a16 and w8a16; any other pair fails
+  /// to compile.
+  template <class WeightType, class ActivationType>
+  constexpr auto interleave_for_mixed_input() {
+    constexpr auto weight_bits = cute::sizeof_bits_v<WeightType>;
+    constexpr auto activation_bits = cute::sizeof_bits_v<ActivationType>;
+
+    constexpr bool is_w4a8 = weight_bits == 4 && activation_bits == 8;
+    constexpr bool is_w4a16 = weight_bits == 4 && activation_bits == 16;
+    constexpr bool is_w8a16 = weight_bits == 8 && activation_bits == 16;
+
+    if constexpr (is_w4a8) {
+      using InterleaveShape = Shape<Shape<_4, _2, _2>, _1, Shape<_2, _2>>;
+      using InterleaveStride = Stride<Stride<_1, _8, _4>, _0, Stride<_16, _32>>;
+      return Layout<InterleaveShape, InterleaveStride>{};
+    } else if constexpr (is_w4a16) {
+      using InterleaveShape = Shape<Shape<_2, _2, _2, _2>, _1, Shape<_2>>;
+      using InterleaveStride = Stride<Stride<_1, _4, _8, _2>, _0, Stride<_16>>;
+      return Layout<InterleaveShape, InterleaveStride>{};
+    } else if constexpr (is_w8a16) {
+      using InterleaveShape = Shape<Shape<_2, _2, _2>, _1, Shape<_2, _2>>;
+      using InterleaveStride = Stride<Stride<_1, _4, _2>, _0, Stride<_8, _16>>;
+      return Layout<InterleaveShape, InterleaveStride>{};
+    } else {
+      static_assert(dependent_false<WeightType, ActivationType>,
+                    "unsupported weight and activation, must be one of w4a8,w4a16,w8a16");
+    }
+  }
+
+  /// Perform a collective-scoped matrix multiply-accumulate
+  /// Consumer Perspective
+  ///
+  /// For every k-tile this waits on a pipeline stage, copies operand A (plus scales/zeros,
+  /// depending on KernelConversionMode) from shared memory into registers, converts it with
+  /// transform_A_kblock, and issues the GMMAs through cute::gemm using B fragments built from
+  /// the shared-memory partition. The k-tiles are handled in three phases:
+  ///   1. a peeled first tile, whose first GMMA is issued with GMMA::ScaleOut::Zero;
+  ///   2. a steady-state loop that, on its last k-block, releases the previous stage and loads
+  ///      and converts the first k-block of the next stage;
+  ///   3. a peeled last tile that does not look ahead into a following stage.
+  /// The stage of the final k-tile is not released here; mma_tail() issues that release.
+  ///
+  /// \param pipeline         Mainloop pipeline used for consumer_try_wait / consumer_wait /
+  ///                         consumer_release.
+  /// \param smem_pipe_read   Pipeline state of the first stage to consume (taken by value).
+  /// \param accum            Register-resident accumulator fragment that receives the result.
+  /// \param k_tile_count     Number of k-tiles to process. The first tile is consumed
+  ///                         unconditionally, so this presumably must be >= 1 — confirm with caller.
+  /// \param thread_idx       Thread index used to select the MMA thread slice and warp group.
+  /// \param shared_tensors   Shared-memory storage holding the staged A and B tiles (and the
+  ///                         extra scale/zero tiles read via partition_extra_mma_info).
+  /// \param mainloop_params  Not referenced in this function body.
+  template <class FrgTensorC>
+  CUTLASS_DEVICE void mma(MainloopPipeline pipeline, PipelineState smem_pipe_read,
+                          FrgTensorC& accum, int k_tile_count, int thread_idx,
+                          TensorStorage& shared_tensors, Params const& mainloop_params) {
+    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
+    static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
+    static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
+    static_assert(cute::rank(InternalSmemLayoutAtomA{}) == 2,
+                  "InternalSmemLayoutAtomA must be rank 2.");
+    static_assert(cute::rank(InternalSmemLayoutAtomB{}) == 2,
+                  "InternalSmemLayoutAtomB must be rank 2.");
+    static_assert(
+        !cute::is_void_v<InternalSmemCopyAtomA>,
+        "SM90 GMMA mainloops must specify a non-void copy atom for RF sourced instructions.");
+    static_assert(
+        cute::is_void_v<InternalSmemCopyAtomB>,
+        "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+
+    // Obtain warp index
+    // NOTE(review): warp_idx is not referenced again in this function.
+    int warp_idx = canonical_warp_idx_sync();
+    // Thread index modulo 128; selects the smem->RF copy slices for A and the extra inputs.
+    [[maybe_unused]] int warp_group_thread_idx = thread_idx % 128;
+
+    Tensor sA_ = make_tensor(make_smem_ptr(shared_tensors.smem_A.begin()),
+                             SmemLayoutA_mma_interleave{});   // (BLK_M,BLK_K,PIPE)
+    Tensor sA = as_position_independent_swizzle_tensor(sA_);  // (BLK_M,BLK_K,PIPE)
+
+    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.begin()),
+                            SmemLayoutB{});  // (BLK_N,BLK_K,PIPE)
+
+    //
+    // Define C accumulators and A/B partitioning
+    //
+
+    // Layout of warp group to thread mapping
+
+    static_assert(stride<0>(typename TiledMma::BLayout{}) == 0 and
+                      size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup,
+                  "Stride of the first mode must be 0 and the size of the mode must be "
+                  "NumThreadsPerWarpGroup");
+
+    constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup;
+    Layout warp_group_thread_layout =
+        make_layout(Int<MmaWarpGroups>{}, Int<NumThreadsPerWarpGroup>{});
+
+    // Take the warp-group index from lane 0 so it is the same value in every lane of the warp.
+    int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0);
+
+    TiledMma tiled_mma;
+    // A is partitioned per thread, B per warp group.
+    auto mma_thread_slice = tiled_mma.get_thread_slice(thread_idx);
+    Tensor tCsA = mma_thread_slice.partition_A(sA);
+    auto mma_warpgroup_slice = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx));
+
+    // Interleave layout chosen from the bit widths of the two internal element types.
+    auto interleave_layout = interleave_for_mixed_input<InternalElementA, InternalElementB>();
+
+    // Extend the interleave layout with the pipeline-stage mode so it can be composed with tCsA.
+    auto interleave_remapping =
+        cute::flat_product(interleave_layout, Layout<Shape<Int<DispatchPolicy::Stages>>>{});
+
+    // Shared-memory view of A that the smem->RF copies below read from.
+    Tensor tCsA_remapped = tCsA.compose(interleave_remapping);
+
+    // Right-inverse of the interleave layout, applied to the register fragments further down.
+    auto interleave_remapping_thread = right_inverse(interleave_layout);
+
+    // Allocate fragments and descriptors
+    Tensor tCrA_mma =
+        mma_thread_slice.partition_fragment_A(sA(_, _, Int<0>{}));  // (MMA,MMA_M,MMA_K,PIPE)
+    // Same layout as tCrA_mma but holding RealInternalElementA values as loaded from smem.
+    Tensor tCrA_load = make_fragment_like<RealInternalElementA>(tCrA_mma);
+
+    Tensor tCsB = mma_warpgroup_slice.partition_B(sB);        // (MMA,MMA_N,MMA_K,PIPE)
+    Tensor tCrB = mma_warpgroup_slice.make_fragment_B(tCsB);  // (MMA,MMA_N,MMA_K,PIPE)
+
+    //
+    // Copy Atom A retiling
+    //
+    auto smem_tiled_copy_A = make_tiled_copy_A(InternalSmemCopyAtomA{}, tiled_mma);
+    auto smem_thr_copy_A = smem_tiled_copy_A.get_thread_slice(warp_group_thread_idx);
+
+    Tensor tCrA_copy_view = smem_thr_copy_A.retile_D(tCrA_load);  // (CPY,CPY_M,CPY_K)
+
+    // Compute the max vector length that can be used to copy A. This will match the vector width of
+    // the conversions used. It helps by allowing the compiler to convert using the same register
+    // that was used to load the data from smem. This significantly reduces the need to move data
+    // among registers. Note that this is correct even if copy fails to vectorize, since the
+    // granularity at which we perform the conversion does not impact correctness.
+    // NOTE(review): A_CPY_VEC is not referenced again in this function; only A_CPY_VEC_remapped is.
+    using A_CPY_VEC = decltype(max_common_vector(tCsA, tCrA_copy_view));
+    using A_CPY_VEC_remapped = decltype(max_common_vector(tCsA_remapped, tCrA_copy_view));
+    static_assert(A_CPY_VEC_remapped{} == 32 / cutlass::sizeof_bits<InternalElementA>::value,
+                  "max_common_vector(tCsA_remapped, tCrA_copy_view) is 32 / "
+                  "cutlass::sizeof_bits<InternalElementA>::value");
+    // View of the MMA fragment through the inverse interleave; this is what the GMMAs read as A.
+    auto tCrA_mma_tmp = tCrA_mma.compose(interleave_remapping_thread);
+    auto tCrA_mma_inverse_mapping = tCrA_mma_tmp.compose(tCrA_mma.layout());
+
+    // NOTE(review): tCrA_load_inverse_mapping is not referenced again in this function.
+    auto tCrA_load_tmp = tCrA_load.compose(interleave_remapping_thread);
+    auto tCrA_load_inverse_mapping = tCrA_load_tmp.compose(tCrA_load.layout());
+
+    // Partition of thread -> shared and thread -> RF
+    auto partitioned_extra_info = partition_extra_mma_info(mma_thread_slice, shared_tensors);
+    auto copy_partitions_extra_info =
+        retile_extra_mma_info(tiled_mma, partitioned_extra_info, warp_group_thread_idx);
+
+    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrA_copy_view));      // CPY_M
+    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCrA_copy_view));      // CPY_K
+    CUTE_STATIC_ASSERT_V(size<1>(tCrA_mma) == size<1>(accum));           // MMA_M
+    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));               // N
+    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                // K
+    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));  // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));  // PIPE
+
+    //
+    // PIPELINED MAIN LOOP
+    //
+
+    // We release buffers to producer warps(dma load) with some mmas in flight
+    PipelineState smem_pipe_release = smem_pipe_read;
+
+    // The first GMMA uses ScaleOut::Zero; it is switched to ScaleOut::One right after it is issued.
+    tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
+
+    warpgroup_fence_operand(accum);
+
+    // Number of k-blocks (MMA_K iterations) inside one k-tile.
+    constexpr int K_BLOCK_MAX = size<2>(tCrA_load);
+
+    // Forwarded to copy_A_and_extra_info / transform_A_kblock, which take a different path when
+    // it is not 1. type_factor is declared outside this function.
+    constexpr int kNumKIterationsPerWarpBLoad = type_factor / 2;
+
+    ConsumerToken barrier_token = {BarrierStatus::WaitAgain};
+    // first k tile
+    {
+      barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+
+      int read_stage = smem_pipe_read.index();
+
+      // Advance to the next stage and start polling for it while this tile is processed.
+      ++smem_pipe_read;
+      barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+
+      // copy smem->rmem for A operand
+      copy_A_and_extra_info(smem_tiled_copy_A, tCsA_remapped, tCrA_copy_view,
+                            partitioned_extra_info, copy_partitions_extra_info, 0, read_stage,
+                            kNumKIterationsPerWarpBLoad);
+      if (K_BLOCK_MAX > 1) {  // prefetch next block
+        copy_A_and_extra_info(smem_tiled_copy_A, tCsA_remapped, tCrA_copy_view,
+                              partitioned_extra_info, copy_partitions_extra_info, 1, read_stage,
+                              kNumKIterationsPerWarpBLoad);
+      }
+
+      // Convert k-block 0 so the first GMMA can be issued.
+      transform_A_kblock(tCrA_load, A_CPY_VEC_remapped{}, tCrA_mma, partitioned_extra_info, 0,
+                         kNumKIterationsPerWarpBLoad);
+
+      // Unroll the K mode manually to set scale D to 1
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < K_BLOCK_MAX; ++k_block) {
+        warpgroup_arrive();
+        // (V,M) x (V,N) => (V,M,N)
+        cute::gemm(tiled_mma, tCrA_mma_inverse_mapping(_, _, k_block),
+                   tCrB(_, _, k_block, read_stage), accum);
+        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+        warpgroup_commit_batch();
+
+        if (k_block < K_BLOCK_MAX - 2)  // prefetch next block
+        {
+          copy_A_and_extra_info(smem_tiled_copy_A, tCsA_remapped, tCrA_copy_view,
+                                partitioned_extra_info, copy_partitions_extra_info, k_block + 2,
+                                read_stage, kNumKIterationsPerWarpBLoad);
+        }
+        // Convert the k-block that the next iteration's GMMA will read.
+        if (k_block < K_BLOCK_MAX - 1) {
+          transform_A_kblock(tCrA_load, A_CPY_VEC_remapped{}, tCrA_mma, partitioned_extra_info,
+                             k_block + 1, kNumKIterationsPerWarpBLoad);
+        }
+      }
+
+      --k_tile_count;
+      if (k_tile_count > 0) {
+        // Wait for K_BLOCK_MAX - 1 to be in flight to ensure that it is safe to overwrite the A
+        // registers for the first mma.
+        // (That describes the warpgroup_wait further down; the consumer_wait here blocks until
+        // the next stage's data has arrived, before it is copied into registers.)
+        pipeline.consumer_wait(smem_pipe_read, barrier_token);
+        copy_A_and_extra_info(smem_tiled_copy_A, tCsA_remapped, tCrA_copy_view,
+                              partitioned_extra_info, copy_partitions_extra_info, 0,
+                              smem_pipe_read.index(), kNumKIterationsPerWarpBLoad);
+        if (K_BLOCK_MAX > 1) {  // prefetch next block
+          copy_A_and_extra_info(smem_tiled_copy_A, tCsA_remapped, tCrA_copy_view,
+                                partitioned_extra_info, copy_partitions_extra_info, 1,
+                                smem_pipe_read.index(), kNumKIterationsPerWarpBLoad);
+        }
+        warpgroup_wait<K_BLOCK_MAX - kNumKIterationsPerWarpBLoad>();
+        transform_A_kblock(tCrA_load, A_CPY_VEC_remapped{}, tCrA_mma, partitioned_extra_info, 0,
+                           kNumKIterationsPerWarpBLoad);
+      }
+    }
+
+    // Only one k-tile was requested: nothing more to issue. No stage has been released yet.
+    if (k_tile_count == 0) {
+      return;
+    }
+
+    warpgroup_fence_operand(accum);
+    // Mainloop GMMAs
+    // Runs for every remaining k-tile except the last one, which is peeled off below.
+    CUTLASS_PRAGMA_NO_UNROLL
+    for (; k_tile_count > 1; --k_tile_count) {
+      //
+      // Compute on k_tile
+      //
+
+      int read_stage = smem_pipe_read.index();
+      ++smem_pipe_read;
+
+      warpgroup_fence_operand(accum);
+      // Unroll the K mode manually to set scale D to 1
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < K_BLOCK_MAX; ++k_block) {
+        warpgroup_arrive();
+        // (V,M) x (V,N) => (V,M,N)
+        cute::gemm(tiled_mma, tCrA_mma_inverse_mapping(_, _, k_block),
+                   tCrB(_, _, k_block, read_stage), accum);
+        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+        warpgroup_commit_batch();
+
+        warpgroup_wait<K_BLOCK_MAX -
+                       kNumKIterationsPerWarpBLoad>();  // We have K_BLOCK_MAX - 1 GMMA instructions
+                                                        // pending for this stage, so we can release
+                                                        // prior barrier
+        if (k_block == K_BLOCK_MAX - 1) {
+          pipeline.consumer_release(
+              smem_pipe_release);  // UNLOCK smem_pipe_release, done _computing_ on it
+          ++smem_pipe_release;
+        }
+
+        // Start polling for the next stage early; the blocking wait happens on the last k-block.
+        if (k_block == 0) {
+          barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+        }
+
+        if (k_block == K_BLOCK_MAX - 1) {
+          // Last k-block of this tile: load and convert the first k-block of the next stage.
+          pipeline.consumer_wait(smem_pipe_read, barrier_token);
+          copy_A_and_extra_info(smem_tiled_copy_A, tCsA_remapped, tCrA_copy_view,
+                                partitioned_extra_info, copy_partitions_extra_info, 0,
+                                smem_pipe_read.index(), kNumKIterationsPerWarpBLoad);
+          if (K_BLOCK_MAX > 1) {  // prefetch next block
+            copy_A_and_extra_info(smem_tiled_copy_A, tCsA_remapped, tCrA_copy_view,
+                                  partitioned_extra_info, copy_partitions_extra_info, 1,
+                                  smem_pipe_read.index(), kNumKIterationsPerWarpBLoad);
+          }
+          transform_A_kblock(tCrA_load, A_CPY_VEC_remapped{}, tCrA_mma, partitioned_extra_info, 0,
+                             kNumKIterationsPerWarpBLoad);
+        } else {
+          // Still inside this tile: keep the copy two k-blocks ahead and the conversion one ahead.
+          if (k_block < K_BLOCK_MAX - 2) {  // prefetch next block
+            copy_A_and_extra_info(smem_tiled_copy_A, tCsA_remapped, tCrA_copy_view,
+                                  partitioned_extra_info, copy_partitions_extra_info, k_block + 2,
+                                  read_stage, kNumKIterationsPerWarpBLoad);
+          }
+          transform_A_kblock(tCrA_load, A_CPY_VEC_remapped{}, tCrA_mma, partitioned_extra_info,
+                             k_block + 1, kNumKIterationsPerWarpBLoad);
+        }
+      }
+      warpgroup_fence_operand(accum);
+    }
+
+    warpgroup_fence_operand(accum);
+
+    // Last k-tile: same GMMA sequence as the mainloop, but with no lookahead into a next stage.
+    {
+      //
+      // Compute on k_tile
+      //
+
+      int read_stage = smem_pipe_read.index();
+
+      warpgroup_fence_operand(accum);
+
+      // Unroll the K mode manually to set scale D to 1
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < K_BLOCK_MAX; ++k_block) {
+        warpgroup_arrive();
+        // (V,M) x (V,N) => (V,M,N)
+        cute::gemm(tiled_mma, tCrA_mma_inverse_mapping(_, _, k_block),
+                   tCrB(_, _, k_block, read_stage), accum);
+        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+        warpgroup_commit_batch();
+        warpgroup_wait<K_BLOCK_MAX - kNumKIterationsPerWarpBLoad>();
+        if (k_block == K_BLOCK_MAX - 1)  // release prior barrier
+        {
+          pipeline.consumer_release(
+              smem_pipe_release);  // UNLOCK smem_pipe_release, done _computing_ on it
+          ++smem_pipe_release;
+        }
+
+        if (k_block < K_BLOCK_MAX - 2)  // prefetch next block
+        {
+          copy_A_and_extra_info(smem_tiled_copy_A, tCsA_remapped, tCrA_copy_view,
+                                partitioned_extra_info, copy_partitions_extra_info, k_block + 2,
+                                read_stage, kNumKIterationsPerWarpBLoad);
+        }
+
+        if (k_block < K_BLOCK_MAX - 1) {
+          // NOTE(review): this copy of k_block + 1 appears to repeat a copy already issued by
+          // the lookahead (k_block + 2 on the previous iteration, or block 1 when this stage was
+          // first loaded); the mainloop above has no such extra copy — confirm whether it is
+          // needed.
+          copy_A_and_extra_info(smem_tiled_copy_A, tCsA_remapped, tCrA_copy_view,
+                                partitioned_extra_info, copy_partitions_extra_info, k_block + 1,
+                                read_stage, kNumKIterationsPerWarpBLoad);
+          transform_A_kblock(tCrA_load, A_CPY_VEC_remapped{}, tCrA_mma, partitioned_extra_info,
+                             k_block + 1, kNumKIterationsPerWarpBLoad);
+        }
+      }
+    }
+
+    warpgroup_fence_operand(accum);
+  }
+
+  /// Consumer-side epilogue: waits for all outstanding GMMAs and releases the remaining buffers.
+  ///
+  /// \param pipeline           Mainloop pipeline on which the releases are issued.
+  /// \param smem_pipe_release  Pipeline state to release from (taken by value, advanced locally).
+  /// \param k_tile_count       Number of k-tiles; all but the prologue count are skipped before
+  ///                           releasing.
+  CUTLASS_DEVICE void mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release,
+                               int k_tile_count) {
+    // Prologue GMMAs: the number of stages still to be released here.
+    constexpr int kPrologueMmaCount = 1;
+
+    // Step over every stage except the prologue ones.
+    smem_pipe_release.advance(k_tile_count - kPrologueMmaCount);
+
+    // Wait on all GMMAs to complete
+    warpgroup_wait<0>();
+
+    int stages_left = kPrologueMmaCount;
+    while (stages_left > 0) {
+      // UNLOCK smem_pipe_release, done _computing_ on it
+      pipeline.consumer_release(smem_pipe_release);
+      ++smem_pipe_release;
+      --stages_left;
+    }
+  }
+
+ private:
+  /// Utilities for any additional inputs inside of the TMA load
+  ///
+  /// Builds the TMA source (global) and destination (shared) partitions of the scale tensor and,
+  /// in ConvertAndScaleWithZero mode, of the zero tensor.
+  ///
+  /// \param mainloop_params         Provides tma_load_scale / tma_load_zero.
+  /// \param load_inputs             Tuple whose element 2 is the scale tensor and element 3 the
+  ///                                zero tensor.
+  /// \param shared_tensors          Shared storage holding smem_scale / smem_zero.
+  /// \param cluster_local_block_id  Its .y component selects the TMA slice.
+  /// \param m_coord, l_coord        Tile coordinates used to slice the global tensors.
+  /// \return Empty tuple (DirectConvert), (tSgS, tSsS) (ConvertAndScale), or
+  ///         (tSgS, tSsS, tZgZ, tZsZ) (ConvertAndScaleWithZero).
+  template <class... Ts>
+  CUTLASS_DEVICE auto partition_extra_tma_inputs(Params const& mainloop_params,
+                                                 cute::tuple<Ts...> const& load_inputs,
+                                                 TensorStorage& shared_tensors,
+                                                 uint2 const& cluster_local_block_id,
+                                                 int const m_coord, int const l_coord) {
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      // No scales or zeros take part in a direct conversion.
+      return cute::make_tuple();
+    } else if constexpr (ModeHasScales) {
+      // --- Scales ---
+      auto scale_tma_slice = mainloop_params.tma_load_scale.get_slice(cluster_local_block_id.y);
+
+      Tensor gmem_scale_mkl = get<2>(load_inputs);
+      Tensor gmem_scale = gmem_scale_mkl(_, _, m_coord, _, l_coord);  // (BLK_M,BLK_K,k)
+      Tensor smem_scale = make_tensor(make_smem_ptr(shared_tensors.smem_scale.begin()),
+                                      SmemLayoutScale{});  // (BLK_M,BLK_K,PIPE)
+
+      Tensor tma_src_scale = scale_tma_slice.partition_S(gmem_scale);  // (TMA,TMA_M,TMA_K,k)
+      Tensor tma_dst_scale = scale_tma_slice.partition_D(smem_scale);  // (TMA,TMA_M,TMA_K,PIPE)
+
+      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+        return cute::make_tuple(tma_src_scale, tma_dst_scale);
+      } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+        // --- Zeros: same steps as the scales, using tma_load_zero and smem_zero ---
+        auto zero_tma_slice = mainloop_params.tma_load_zero.get_slice(cluster_local_block_id.y);
+
+        Tensor gmem_zero_mkl = get<3>(load_inputs);
+        Tensor gmem_zero = gmem_zero_mkl(_, _, m_coord, _, l_coord);  // (BLK_M,BLK_K,k)
+        Tensor smem_zero = make_tensor(make_smem_ptr(shared_tensors.smem_zero.begin()),
+                                       SmemLayoutScale{});  // (BLK_M,BLK_K,PIPE)
+
+        Tensor tma_src_zero = zero_tma_slice.partition_S(gmem_zero);  // (TMA,TMA_M,TMA_K,k)
+        Tensor tma_dst_zero = zero_tma_slice.partition_D(smem_zero);  // (TMA,TMA_M,TMA_K,PIPE)
+        return cute::make_tuple(tma_src_scale, tma_dst_scale, tma_src_zero, tma_dst_zero);
+      } else {
+        static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                      "Conversion mode not handled for input partitioning.");
+      }
+    } else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                    "Conversion mode not handled for input partitioning.");
+    }
+  }
+
+  /// Returns the static layout composed onto the partitioned scale/zero tensors, selected by the
+  /// bit width of ActivationType (8 or 16 bits; anything else fails to compile).
+  template <class ActivationType>
+  constexpr auto scale_remapping() {
+    constexpr auto kActivationBits = cute::sizeof_bits_v<ActivationType>;
+
+    if constexpr (kActivationBits == 8) {
+      using RemapShape = Shape<_4, _2, _2>;
+      using RemapStride = Stride<_1, _8, _4>;
+      return Layout<RemapShape, RemapStride>{};
+    } else if constexpr (kActivationBits == 16) {
+      using RemapShape = Shape<_2, _2, _2>;
+      using RemapStride = Stride<_1, _4, _2>;
+      return Layout<RemapShape, RemapStride>{};
+    } else {
+      static_assert(dependent_false<ActivationType>,
+                    "cute::sizeof_bits_v<ActivationType> must be 8 or 16");
+    }
+  }
+
+  /// Utilities for partitioning extra inputs for loading from smem in the mainloop.
+  ///
+  /// For the scale tensor (and the zero tensor in ConvertAndScaleWithZero mode) this produces the
+  /// calling thread's remapped shared-memory partition together with a register tensor of the
+  /// matching fragment shape.
+  ///
+  /// \param mma_thread_slice  Thread slice of the tiled MMA used to partition like operand A.
+  /// \param shared_tensors    Shared storage holding smem_scale / smem_zero.
+  /// \return Empty tuple (DirectConvert), (smem scale view, scale registers) (ConvertAndScale),
+  ///         or additionally (smem zero view, zero registers) (ConvertAndScaleWithZero).
+  template <class ThreadMma>
+  CUTLASS_DEVICE auto partition_extra_mma_info(ThreadMma const& mma_thread_slice,
+                                               TensorStorage& shared_tensors) {
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      // Direct conversion uses no extra inputs.
+      return cute::make_tuple();
+    } else if constexpr (ModeHasScales) {
+      // Layout composed onto the first mode of every partitioned shared-memory view.
+      auto value_remap = scale_remapping<InternalElementB>();
+
+      // --- Scales ---
+      Tensor smem_scale = make_tensor(make_smem_ptr(shared_tensors.smem_scale.begin()),
+                                      SmemLayoutScale{});  // (BLK_M,BLK_SCALE_K,PIPE)
+      Tensor thr_smem_scale = mma_thread_slice.partition_A(smem_scale);
+      Tensor thr_smem_scale_remapped = thr_smem_scale.compose(value_remap, _, _, _);
+      Tensor reg_scale = make_tensor<ElementScale>(
+          mma_thread_slice.partition_fragment_A(smem_scale(_, _, Int<0>{})).shape());
+
+      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+        return cute::make_tuple(thr_smem_scale_remapped, reg_scale);
+      } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+        // --- Zeros: partitioned exactly like the scales ---
+        Tensor smem_zero = make_tensor(make_smem_ptr(shared_tensors.smem_zero.begin()),
+                                       SmemLayoutScale{});  // (BLK_M,BLK_SCALE_K,PIPE)
+        Tensor thr_smem_zero = mma_thread_slice.partition_A(smem_zero);
+        Tensor thr_smem_zero_remapped = thr_smem_zero.compose(value_remap, _, _, _);
+        Tensor reg_zero = make_tensor<ElementZero>(
+            mma_thread_slice.partition_fragment_A(smem_zero(_, _, Int<0>{})).shape());
+        return cute::make_tuple(thr_smem_scale_remapped, reg_scale, thr_smem_zero_remapped,
+                                reg_zero);
+      } else {
+        static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                      "Conversion mode not handled in A -> RF path.");
+      }
+    } else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                    "Conversion mode not handled in A -> RF path.");
+    }
+  }
+
+  /// Returns the tiled copy and copy views for the extra inputs.
+  ///
+  /// \param tiled_mma               Tiled MMA from which the scale tiled copy is derived.
+  /// \param partitioned_extra_info  Tuple whose element 1 is the scale register tensor and
+  ///                                element 3 the zero register tensor; both are retiled here.
+  /// \param warp_group_thread_idx   Thread index used to pick the copy's thread slice.
+  /// \return Empty tuple (DirectConvert), (tiled copy, scale copy view) (ConvertAndScale), or
+  ///         (tiled copy, scale copy view, zero copy view) (ConvertAndScaleWithZero).
+  template <class TiledMma, class... Ts>
+  CUTLASS_DEVICE auto retile_extra_mma_info(TiledMma const& tiled_mma,
+                                            cute::tuple<Ts...>& partitioned_extra_info,
+                                            int const warp_group_thread_idx) {
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      // Direct conversion uses no extra inputs.
+      return cute::make_tuple();
+    } else if constexpr (ModeHasScales) {
+      // The same tiled copy serves both the scales and the zeros.
+      auto scale_tiled_copy = make_tiled_copy_A(SmemCopyAtomScale{}, tiled_mma);
+      auto scale_thread_copy = scale_tiled_copy.get_thread_slice(warp_group_thread_idx);
+
+      // (CPY,CPY_M,CPY_K)
+      Tensor scale_copy_view = scale_thread_copy.retile_D(cute::get<1>(partitioned_extra_info));
+
+      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+        return cute::make_tuple(scale_tiled_copy, scale_copy_view);
+      } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+        // (CPY,CPY_M,CPY_K)
+        Tensor zero_copy_view = scale_thread_copy.retile_D(cute::get<3>(partitioned_extra_info));
+        return cute::make_tuple(scale_tiled_copy, scale_copy_view, zero_copy_view);
+      } else {
+        static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                      "Conversion mode not handled in A -> RF path.");
+      }
+    } else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                    "Conversion mode not handled in A -> RF path.");
+    }
+  }
+
+  /// Utilities to copy A and extra inputs from smem to RF
+  ///
+  /// Copies one k-block of A from shared memory into the register copy view. When
+  /// kNumKIterationsPerWarpBLoad is not 1, a copy is issued only for k-blocks that are a
+  /// multiple of it, addressing block k_block / kNumKIterationsPerWarpBLoad through a reshaped
+  /// register view. For k_block == 0 the scales (and zeros, depending on the conversion mode)
+  /// of the stage are copied as well.
+  ///
+  /// \param smem_tiled_copy_A            Tiled copy used for A.
+  /// \param tCsA                         Thread's shared-memory view of A, indexed by
+  ///                                     (_, _, k_block, stage).
+  /// \param tCrA_copy_view               Register destination view for A.
+  /// \param partitioned_mma_extra_info   Tuple from partition_extra_mma_info (smem scale/zero
+  ///                                     views at elements 0 and 2).
+  /// \param tiled_copy_and_views         Tuple from retile_extra_mma_info (tiled copy at element
+  ///                                     0, register copy views at elements 1 and 2).
+  /// \param k_block                      k-block index inside the current k-tile.
+  /// \param read_stage                   Pipeline stage to read from.
+  /// \param kNumKIterationsPerWarpBLoad  Number of k-blocks covered by one A copy.
+  template <class SmemTiledCopyA, class TensorASmemView, class TensorACopyView, class... Ts,
+            class... Us>
+  CUTLASS_DEVICE void copy_A_and_extra_info(SmemTiledCopyA const& smem_tiled_copy_A,
+                                            TensorASmemView const& tCsA,
+                                            TensorACopyView& tCrA_copy_view,
+                                            cute::tuple<Ts...> const& partitioned_mma_extra_info,
+                                            cute::tuple<Us...> const& tiled_copy_and_views,
+                                            int k_block, int read_stage,
+                                            int kNumKIterationsPerWarpBLoad) {
+    if (kNumKIterationsPerWarpBLoad != 1) {
+      // One copy spans several k-blocks, so only the first k-block of each group issues it.
+      using WideLoadLayout = Layout<Shape<Int<16>, Int<1>, Int<2>>>;
+      auto wide_copy_view = tCrA_copy_view.compose(WideLoadLayout{});
+      bool const starts_load_group = (k_block % kNumKIterationsPerWarpBLoad) == 0;
+      if (starts_load_group) {
+        int const load_block = k_block / kNumKIterationsPerWarpBLoad;
+        copy(smem_tiled_copy_A, tCsA(_, _, load_block, read_stage),
+             wide_copy_view(_, _, load_block));
+      }
+    } else {
+      // One copy per k-block.
+      copy(smem_tiled_copy_A, tCsA(_, _, k_block, read_stage), tCrA_copy_view(_, _, k_block));
+    }
+
+    // Scales and zeros are copied only when a new k-tile starts.
+    if (k_block != 0) {
+      return;
+    }
+
+    if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+      // nothing to do
+    } else if constexpr (ModeHasScales) {
+      auto scale_tiled_copy = cute::get<0>(tiled_copy_and_views);
+      auto smem_scale_view = cute::get<0>(partitioned_mma_extra_info);
+      auto reg_scale_view = cute::get<1>(tiled_copy_and_views);
+      copy(scale_tiled_copy, smem_scale_view(_, _, k_block, read_stage),
+           reg_scale_view(_, _, k_block));
+      if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+        // Nothing extra to do
+      } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+        auto smem_zero_view = cute::get<2>(partitioned_mma_extra_info);
+        auto reg_zero_view = cute::get<2>(tiled_copy_and_views);
+        copy(scale_tiled_copy, smem_zero_view(_, _, k_block, read_stage),
+             reg_zero_view(_, _, k_block));
+      } else {
+        static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                      "Conversion mode not handled in A -> RF path.");
+      }
+    } else {
+      static_assert(cutlass::detail::dependent_false<KernelSchedule>,
+                    "Conversion mode not handled in A -> RF path.");
+    }
+  }
+
+  /// Utilities to transform A.
+  ///
+  /// Converts one k-block of the loaded A fragment (tCrA_load) into the MMA fragment (tCrA_mma),
+  /// applying scales and zeros as required by KernelConversionMode. The actual conversion is
+  /// delegated to the matching transform_internal_A overload.
+  ///
+  /// \param tCrA_load                    Register fragment holding A as loaded from smem.
+  /// \param vec_A                        Vector width forwarded to the conversion.
+  /// \param tCrA_mma                     Register fragment written with the converted values.
+  /// \param partitioned_extra_info       Tuple from partition_extra_mma_info; element 1 is read
+  ///                                     as the scale registers and element 3 as the zero
+  ///                                     registers.
+  /// \param k_block                      k-block index inside the current k-tile.
+  /// \param kNumKIterationsPerWarpBLoad  Number of k-blocks covered by one A load. When it is
+  ///                                     not 1, work is done only for k-blocks that are a
+  ///                                     multiple of it.
+  template <class TCrA_load, int VectorWidthA, class TCrA_mma, class... Ts>
+  CUTLASS_DEVICE void transform_A_kblock(TCrA_load const& tCrA_load, cute::Int<VectorWidthA> vec_A,
+                                         TCrA_mma& tCrA_mma,
+                                         cute::tuple<Ts...> const& partitioned_extra_info,
+                                         int const k_block, int kNumKIterationsPerWarpBLoad) {
+    if (kNumKIterationsPerWarpBLoad != 1) {
+      // Several k-blocks share one load: convert the whole group on its first k-block and do
+      // nothing for the remaining k-blocks of the group.
+      if (k_block % kNumKIterationsPerWarpBLoad == 0) {
+        int k_block_load = k_block / kNumKIterationsPerWarpBLoad;
+        // View both fragments with a third mode of size 2, indexed by k_block_load below.
+        using reshape_layout = Layout<Shape<Shape<_2, _2, _2, _2>, _1, _2>>;
+        auto tCrA_load_reshaped = tCrA_load.compose(reshape_layout{});
+        auto tCra_mma_reshaped = tCrA_mma.compose(reshape_layout{});
+
+        // Scale/zero view whose strides are all 0 except for the last sub-mode (stride 4), so
+        // the same stored values are read for most positions of the reshaped fragment.
+        using scale_reshape =
+            Layout<Shape<Shape<_2, _2, _2, _2>, _1, _1>, Stride<Stride<_0, _0, _0, _4>, _0, _0>>;
+        if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+          transform_internal_A(tCrA_load_reshaped(_, _, k_block_load), vec_A,
+                               tCra_mma_reshaped(_, _, k_block_load));
+        } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+          auto tCrS = cute::get<1>(partitioned_extra_info);
+          auto tCrS_reshaped = tCrS.compose(scale_reshape{});
+          // The make_fragment_like<ElementScale> temporary is the buffer in which the values
+          // are upcast and scaled before the final conversion into the MMA fragment.
+          transform_internal_A(
+              tCrA_load_reshaped(_, _, k_block_load), vec_A,
+              make_fragment_like<ElementScale>(tCra_mma_reshaped)(_, _, k_block_load),
+              tCrS_reshaped(_, _, 0), tCra_mma_reshaped(_, _, k_block_load));
+        } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+          auto tCrS = cute::get<1>(partitioned_extra_info);
+          auto tCrS_reshaped = tCrS.compose(scale_reshape{});
+          auto tCrZ = cute::get<3>(partitioned_extra_info);
+          auto tCrZ_reshaped = tCrZ.compose(scale_reshape{});
+          transform_internal_A(
+              tCrA_load_reshaped(_, _, k_block_load), vec_A,
+              make_fragment_like<ElementScale>(tCra_mma_reshaped)(_, _, k_block_load),
+              tCrS_reshaped(_, _, 0), tCrZ_reshaped(_, _, 0),
+              tCra_mma_reshaped(_, _, k_block_load));
+        } else {
+          static_assert(cutlass::detail::dependent_false<KernelSchedule>, "No A data is loaded.");
+        }
+      }
+    } else {
+      // One load per k-block: convert exactly the requested k-block. Scales and zeros are
+      // always read from index 0 of their last mode.
+      if constexpr (KernelConversionMode == ConversionMode::DirectConvert) {
+        transform_internal_A(tCrA_load(_, _, k_block), vec_A, tCrA_mma(_, _, k_block));
+      } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) {
+        auto tCrS = cute::get<1>(partitioned_extra_info);
+        transform_internal_A(tCrA_load(_, _, k_block), vec_A,
+                             make_fragment_like<ElementScale>(tCrA_mma)(_, _, k_block),
+                             tCrS(_, _, 0), tCrA_mma(_, _, k_block));
+      } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) {
+        auto tCrS = cute::get<1>(partitioned_extra_info);
+        auto tCrZ = cute::get<3>(partitioned_extra_info);
+        transform_internal_A(tCrA_load(_, _, k_block), vec_A,
+                             make_fragment_like<ElementScale>(tCrA_mma)(_, _, k_block),
+                             tCrS(_, _, 0), tCrZ(_, _, 0), tCrA_mma(_, _, k_block));
+      } else {
+        static_assert(cutlass::detail::dependent_false<KernelSchedule>, "No A data is loaded.");
+      }
+    }
+  }
+
+  /// Utilities for transforming the A operand prior to issuing tensorcore math.
+  ///
+  /// Element-wise conversion of the register tensor `in` into the register tensor `out`. Both
+  /// tensors share the same static, compact layout, so the data is reinterpreted as a sequence
+  /// of cutlass::Array chunks of ConversionVectorWidth elements and each chunk is converted at
+  /// once.
+  ///
+  /// \tparam ConversionVectorWidth  Elements converted per chunk; must divide the cosize of
+  ///                                TensorLayout. Defaults to the whole tensor.
+  /// \param in     Source register tensor.
+  /// \param out    Destination register tensor, overwritten with the converted values.
+  /// \param width  Tag argument; only its type is used, to deduce ConversionVectorWidth.
+  template <class EngineIn, class EngineOut, class TensorLayout,
+            int ConversionVectorWidth = cosize_v<TensorLayout>>
+  CUTLASS_DEVICE void convert_tensor(Tensor<EngineIn, TensorLayout> const& in,
+                                     Tensor<EngineOut, TensorLayout>& out,
+                                     cute::Int<ConversionVectorWidth> width = {}) {
+    /// This is an element-wise conversion where we expect both tensors to have the same layout.
+    /// As a result, we can cast as a cutlass array to use the fast numeric converters without
+    /// worrying about indexing into the layout.
+    constexpr int N = cosize_v<TensorLayout>;
+
+    /// The inputs must be backed by registers & be statically sized.
+    static_assert(is_rmem<EngineIn>::value,
+                  "Input tensor for A conversion must come from registers");
+    static_assert(is_rmem<EngineOut>::value,
+                  "Output tensor for A conversion must come from registers");
+    static_assert(is_static_v<TensorLayout>, "Tensor layout for the conversion must be static");
+    static_assert(cosize_v<TensorLayout> == size(TensorLayout{}),
+                  "Cosize and size of the layout must be equal.");
+    static_assert(N % ConversionVectorWidth == 0,
+                  "Conversion vector width must divide cosize of the tensor layout.");
+
+    using SrcType = typename EngineIn::value_type;
+    using DstType = typename EngineOut::value_type;
+
+    using SrcArray = cutlass::Array<SrcType, ConversionVectorWidth>;
+    using DstArray = cutlass::Array<DstType, ConversionVectorWidth>;
+
+    // Widening conversions use the interleaved-and-biased fast converter; everything else uses
+    // the generic array converter. The comparison is named so that its '<' cannot be mistaken
+    // for a template bracket inside std::conditional_t.
+    constexpr bool kIsWideningConversion =
+        (cutlass::sizeof_bits_v<SrcType> < cutlass::sizeof_bits_v<DstType>);
+    using Converter = std::conditional_t<
+        kIsWideningConversion,
+        cutlass::FastInterleavedAndBiasedNumericArrayConverter<DstType, SrcType,
+                                                               ConversionVectorWidth>,
+        cutlass::NumericArrayConverter<DstType, SrcType, ConversionVectorWidth>>;
+
+    constexpr int NumIterations = N / ConversionVectorWidth;
+
+    // The base pointers do not depend on the loop index, so compute them once.
+    SrcArray const* src_arrays = reinterpret_cast<SrcArray const*>(raw_pointer_cast(in.data()));
+    DstArray* dst_arrays = reinterpret_cast<DstArray*>(raw_pointer_cast(out.data()));
+
+    // Force a full unroll so every chunk index is a compile-time constant. A runtime index into
+    // register-backed storage can make the compiler place the fragment in local memory.
+    CUTLASS_PRAGMA_UNROLL
+    for (int ii = 0; ii < NumIterations; ++ii) {
+      dst_arrays[ii] = Converter::convert(src_arrays[ii]);
+    }
+  }
+
+  /// DirectConvert overload: converts `in` straight into `out` with the given vector width.
+  ///
+  /// \param in           Loaded A values (register tensor).
+  /// \param a_vec_width  Conversion vector width tag.
+  /// \param out          Destination MMA fragment (register tensor).
+  template <class EngineIn, class EngineOut, class TensorLayout, int A_VectorConversionWidth>
+  CUTLASS_DEVICE void transform_internal_A(Tensor<EngineIn, TensorLayout>&& in,
+                                           cute::Int<A_VectorConversionWidth> a_vec_width,
+                                           Tensor<EngineOut, TensorLayout>&& out) {
+    // No scales or zeros are involved, so this is a single conversion pass.
+    this->convert_tensor(in, out, a_vec_width);
+  }
+
+  /// ConvertAndScale overload: upcasts `in` to the scale type, multiplies by `scales`, then
+  /// converts the product to the MMA type.
+  ///
+  /// \param in                Loaded A values (register tensor).
+  /// \param a_vec_width       Vector width tag for the initial upcast.
+  /// \param converted_inputs  Scratch tensor in the scale's element type; holds the upcast and
+  ///                          scaled values.
+  /// \param scales            Scale values, indexed with the same (j, i) coordinates as the
+  ///                          inputs.
+  /// \param out               Destination MMA fragment (register tensor).
+  template <class EngineIn, class EngineInputBuffer, class EngineScale, class EngineOut,
+            class TensorLayout, class TensorScaleLayout, int A_VectorConversionWidth>
+  CUTLASS_DEVICE void transform_internal_A(
+      Tensor<EngineIn, TensorLayout>&& in, cute::Int<A_VectorConversionWidth> a_vec_width,
+      Tensor<EngineInputBuffer, TensorLayout>&& converted_inputs,
+      Tensor<EngineScale, TensorScaleLayout>&& scales, Tensor<EngineOut, TensorLayout>&& out) {
+    using BufferValue = typename EngineInputBuffer::value_type;
+    using ScaleValue = typename EngineScale::value_type;
+
+    static_assert(cute::is_same_v<BufferValue, ScaleValue>,
+                  "Type of the engine input buffer must equal the scale buffer");
+
+    // bfloat16 scales are multiplied with the native __nv_bfloat16 intrinsic.
+    constexpr bool kScaleIsBf16 = cute::is_same_v<ScaleValue, cutlass::bfloat16_t>;
+
+    // Step 1: upcast the inputs to the scale type.
+    convert_tensor(in, converted_inputs, a_vec_width);
+
+    // Step 2: scale in place, element by element.
+    CUTLASS_PRAGMA_UNROLL
+    for (int outer = 0; outer < size<1>(converted_inputs); ++outer) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int inner = 0; inner < size<0>(converted_inputs); ++inner) {
+        if constexpr (kScaleIsBf16) {
+          converted_inputs(inner, outer) = bfloat16_t(
+              __hmul(reinterpret_cast<__nv_bfloat16 const&>(converted_inputs(inner, outer)),
+                     reinterpret_cast<__nv_bfloat16 const&>(scales(inner, outer))));
+        } else {
+          converted_inputs(inner, outer) *= scales(inner, outer);
+        }
+      }
+    }
+
+    // Step 3: convert the scaled values to the MMA type.
+    convert_tensor(converted_inputs, out);
+  }
+
+  /// ConvertAndScaleWithZero overload: upcasts `in` to the scale type, computes
+  /// value * scale + zero, then converts the result to the MMA type.
+  ///
+  /// \param in                Loaded A values (register tensor).
+  /// \param a_vec_width       Vector width tag for the initial upcast.
+  /// \param converted_inputs  Scratch tensor in the scale's element type; holds the upcast and
+  ///                          scaled values.
+  /// \param scales            Scale values, indexed with the same (j, i) coordinates as the
+  ///                          inputs.
+  /// \param zeros             Zero values, same element type and layout as `scales`.
+  /// \param out               Destination MMA fragment (register tensor).
+  template <class EngineIn, class EngineInputBuffer, class EngineScale, class EngineZero,
+            class EngineOut, class TensorLayout, class TensorScaleLayout,
+            int A_VectorConversionWidth>
+  CUTLASS_DEVICE void transform_internal_A(
+      Tensor<EngineIn, TensorLayout>&& in, cute::Int<A_VectorConversionWidth> a_vec_width,
+      Tensor<EngineInputBuffer, TensorLayout>&& converted_inputs,
+      Tensor<EngineScale, TensorScaleLayout>&& scales,
+      Tensor<EngineZero, TensorScaleLayout>&& zeros, Tensor<EngineOut, TensorLayout>&& out) {
+    using BufferValue = typename EngineInputBuffer::value_type;
+    using ScaleValue = typename EngineScale::value_type;
+    using ZeroValue = typename EngineZero::value_type;
+
+    static_assert(cute::is_same_v<BufferValue, ScaleValue>,
+                  "Type of the engine input buffer must equal the scale buffer");
+
+    static_assert(cute::is_same_v<ZeroValue, ScaleValue>,
+                  "Type of the engine zero buffer must equal the scale buffer");
+
+    // bfloat16 scales use the native __nv_bfloat16 fused multiply-add intrinsic.
+    constexpr bool kScaleIsBf16 = cute::is_same_v<ScaleValue, cutlass::bfloat16_t>;
+
+    // Step 1: upcast the inputs to the scale type.
+    convert_tensor(in, converted_inputs, a_vec_width);
+
+    // Step 2: apply scale and zero in place, element by element.
+    CUTLASS_PRAGMA_UNROLL
+    for (int outer = 0; outer < size<1>(converted_inputs); ++outer) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int inner = 0; inner < size<0>(converted_inputs); ++inner) {
+        if constexpr (kScaleIsBf16) {
+          converted_inputs(inner, outer) = bfloat16_t(
+              __hfma(reinterpret_cast<__nv_bfloat16 const&>(converted_inputs(inner, outer)),
+                     reinterpret_cast<__nv_bfloat16 const&>(scales(inner, outer)),
+                     reinterpret_cast<__nv_bfloat16 const&>(zeros(inner, outer))));
+        } else {
+          converted_inputs(inner, outer) =
+              converted_inputs(inner, outer) * scales(inner, outer) + zeros(inner, outer);
+        }
+      }
+    }
+
+    // Step 3: convert the scaled values to the MMA type.
+    convert_tensor(converted_inputs, out);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+}  // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm_configs.h b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm_configs.h
index eb653b416..3e291281e 100644
--- a/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm_configs.h
+++ b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm_configs.h
@@ -262,7 +262,6 @@ enum class ClusterShape {
   ClusterShape_1x2x1,
   ClusterShape_2x2x1,
   ClusterShape_1x4x1,
-  ClusterShape_4x1x1,
   ClusterShape_4x2x1,
   ClusterShape_2x4x1,
   ClusterShape_4x4x1,
@@ -279,8 +278,6 @@ static auto get_cluster_shape_name(ClusterShape Shape_MNK) {
     return "1x2x1";
   } else if (Shape_MNK == ClusterShape::ClusterShape_2x2x1) {
     return "2x2x1";
-  } else if (Shape_MNK == ClusterShape::ClusterShape_4x1x1) {
-    return "4x1x1";
   } else if (Shape_MNK == ClusterShape::ClusterShape_1x8x1) {
     return "1x8x1";
   } else if (Shape_MNK == ClusterShape::ClusterShape_8x1x1) {
@@ -300,8 +297,6 @@ constexpr auto get_cluster_shape() {
     return cute::Shape<_1, _2, _1>{};
   } else if constexpr (Shape_MNK == ClusterShape::ClusterShape_2x2x1) {
     return cute::Shape<_2, _2, _1>{};
-  } else if constexpr (Shape_MNK == ClusterShape::ClusterShape_4x1x1) {
-    return cute::Shape<_4, _1, _1>{};
   } else if constexpr (Shape_MNK == ClusterShape::ClusterShape_1x8x1) {
     return cute::Shape<_1, _8, _1>{};
   } else if constexpr (Shape_MNK == ClusterShape::ClusterShape_8x1x1) {
@@ -378,8 +373,8 @@ struct CutlassGemmConfig {
         is_tma_warp_specialized(true) {}
 
   int getTileConfigAsInt() const {
-    if (sm_version == 120 || sm_version == 121) return (int)tile_config_sm120;
-    if (sm_version >= 100 && sm_version < 120) return (int)tile_config_sm100;
+    if (sm_version == 120 || sm_version == 121) return (int)tile_config_sm120;  // SM120/SM121
+    if (sm_version >= 100 && sm_version < 120) return (int)tile_config_sm100;   // SM100..SM119
     if (sm_version == 90) return (int)tile_config_sm90;
     if (sm_version < 90) return (int)tile_config_sm80;
     assert(false && "Invalid SM version");
@@ -416,22 +411,22 @@ struct CutlassGemmConfig {
 
 inline std::ostream& operator<<(std::ostream& out, CutlassGemmConfig const& config) {
   // clang-format off
-    if (config.is_tma_warp_specialized)
-    {
-        out << "tile_config_sm90_enum: " << config.getTileConfigAsInt()
-            << ", mainloop_schedule_enum: " << int(config.mainloop_schedule)
-            << ", epilogue_schedule_enum: " << int(config.epilogue_schedule)
-            << ", cluster_shape_enum: " << int(config.cluster_shape)
-            << ", enable_cuda_kernel: " << (config.enableCudaKernel ? "true" : "false");
-    }
-    else
-    {
-        out << "tile_config_enum: " << config.getTileConfigAsInt()
-            << ", split_k_style_enum: " << int(config.split_k_style)
-            << ", split_k_factor: " << config.split_k_factor
-            << ", stages: " << config.stages
-            << ", enable_cuda_kernel: " << (config.enableCudaKernel ? "true" : "false");
-    }
+     if (config.is_tma_warp_specialized)
+     {
+         out << "tile_config_sm90_enum: " << config.getTileConfigAsInt()
+             << ", mainloop_schedule_enum: " << int(config.mainloop_schedule)
+             << ", epilogue_schedule_enum: " << int(config.epilogue_schedule)
+             << ", cluster_shape_enum: " << int(config.cluster_shape)
+             << ", enable_cuda_kernel: " << (config.enableCudaKernel ? "true" : "false");
+     }
+     else
+     {
+         out << "tile_config_enum: " << config.getTileConfigAsInt()
+             << ", split_k_style_enum: " << int(config.split_k_style)
+             << ", split_k_factor: " << config.split_k_factor
+             << ", stages: " << config.stages
+             << ", enable_cuda_kernel: " << (config.enableCudaKernel ? "true" : "false");
+     }
   // clang-format on
   return out;
 }
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_fg_scalebias.cu b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_fg_scalebias.cu
new file mode 100644
index 000000000..63864ea3e
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_fg_scalebias.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h"
+
+namespace tensorrt_llm {
+namespace kernels {
+namespace cutlass_kernels {
+#ifdef ENABLE_BF16
+template class CutlassFpAIntBGemmRunner<__nv_bfloat16, cutlass::uint4b_t,  // activation, weight
+                                        cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS>;
+#endif  // ENABLE_BF16
+}  // namespace cutlass_kernels
+}  // namespace kernels
+}  // namespace tensorrt_llm
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_fg_scaleonly.cu b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_fg_scaleonly.cu
new file mode 100644
index 000000000..e02b80beb
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_fg_scaleonly.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h"
+
+namespace tensorrt_llm {
+namespace kernels {
+namespace cutlass_kernels {
+#ifdef ENABLE_BF16
+template class CutlassFpAIntBGemmRunner<__nv_bfloat16, cutlass::uint4b_t,  // activation, weight
+                                        cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY>;
+#endif  // ENABLE_BF16
+}  // namespace cutlass_kernels
+}  // namespace kernels
+}  // namespace tensorrt_llm
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_per_col.cu b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_per_col.cu
new file mode 100644
index 000000000..418f7bde7
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_per_col.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h"
+
+namespace tensorrt_llm {
+namespace kernels {
+namespace cutlass_kernels {
+#ifdef ENABLE_BF16
+template class CutlassFpAIntBGemmRunner<__nv_bfloat16, cutlass::uint4b_t,  // activation, weight
+                                        cutlass::WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY>;
+#endif  // ENABLE_BF16
+}  // namespace cutlass_kernels
+}  // namespace kernels
+}  // namespace tensorrt_llm
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_fg_scalebias.cu b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_fg_scalebias.cu
new file mode 100644
index 000000000..20a228405
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_fg_scalebias.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h"
+
+namespace tensorrt_llm {
+namespace kernels {
+namespace cutlass_kernels {
+#ifdef ENABLE_BF16
+template class CutlassFpAIntBGemmRunner<__nv_bfloat16, uint8_t,  // activation, weight
+                                        cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS>;
+#endif  // ENABLE_BF16
+}  // namespace cutlass_kernels
+}  // namespace kernels
+}  // namespace tensorrt_llm
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_fg_scaleonly.cu b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_fg_scaleonly.cu
new file mode 100644
index 000000000..9ea1ad39d
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_fg_scaleonly.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h"
+
+namespace tensorrt_llm {
+namespace kernels {
+namespace cutlass_kernels {
+#ifdef ENABLE_BF16
+template class CutlassFpAIntBGemmRunner<__nv_bfloat16, uint8_t,  // activation, weight
+                                        cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY>;
+#endif  // ENABLE_BF16
+}  // namespace cutlass_kernels
+}  // namespace kernels
+}  // namespace tensorrt_llm
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_per_col.cu b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_per_col.cu
new file mode 100644
index 000000000..ec68288b1
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_per_col.cu
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h"
+
+namespace tensorrt_llm {
+namespace kernels {
+namespace cutlass_kernels {
+#ifdef ENABLE_BF16
+template class CutlassFpAIntBGemmRunner<__nv_bfloat16, uint8_t,  // activation, weight
+                                        cutlass::WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY>;
+#endif  // ENABLE_BF16
+}  // namespace cutlass_kernels
+}  // namespace kernels
+}  // namespace tensorrt_llm
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scalebias_bf16_out_bf16.cu b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scalebias_bf16_out_bf16.cu
new file mode 100644
index 000000000..0862b0979
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scalebias_bf16_out_bf16.cu
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h"
+
+namespace tensorrt_llm {
+namespace kernels {
+namespace cutlass_kernels {
+#ifdef ENABLE_FP8
+template class CutlassFpAIntBGemmRunner<__nv_fp8_e4m3,     /*Activation Type*/
+                                        cutlass::uint4b_t, /*Weight Type*/
+                                        cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS,
+                                        half,          /*Scale and Zero Type*/
+                                        __nv_bfloat16, /*Bias Type*/
+                                        __nv_bfloat16  /*Output Type*/
+                                        >;
+#endif  // ENABLE_FP8
+}  // namespace cutlass_kernels
+}  // namespace kernels
+}  // namespace tensorrt_llm
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scalebias_f16_out_f16.cu b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scalebias_f16_out_f16.cu
new file mode 100644
index 000000000..10a61b5ca
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scalebias_f16_out_f16.cu
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h"
+
+namespace tensorrt_llm {
+namespace kernels {
+namespace cutlass_kernels {
+#ifdef ENABLE_FP8
+template class CutlassFpAIntBGemmRunner<__nv_fp8_e4m3,     /*Activation Type*/
+                                        cutlass::uint4b_t, /*Weight Type*/
+                                        cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS,
+                                        half, /*Scale and Zero Type*/
+                                        half, /*Bias Type*/
+                                        half  /*Output Type*/
+                                        >;
+#endif  // ENABLE_FP8
+}  // namespace cutlass_kernels
+}  // namespace kernels
+}  // namespace tensorrt_llm
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scaleonly_bf16_out_bf16.cu b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scaleonly_bf16_out_bf16.cu
new file mode 100644
index 000000000..3c8bde88b
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scaleonly_bf16_out_bf16.cu
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h"
+
+namespace tensorrt_llm {
+namespace kernels {
+namespace cutlass_kernels {
+#ifdef ENABLE_FP8
+template class CutlassFpAIntBGemmRunner<__nv_fp8_e4m3,     /*Activation Type*/
+                                        cutlass::uint4b_t, /*Weight Type*/
+                                        cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY,
+                                        half,          /*Scale and Zero Type*/
+                                        __nv_bfloat16, /*Bias Type*/
+                                        __nv_bfloat16  /*Output Type*/
+                                        >;
+#endif  // ENABLE_FP8
+}  // namespace cutlass_kernels
+}  // namespace kernels
+}  // namespace tensorrt_llm
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scaleonly_f16_out_f16.cu b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scaleonly_f16_out_f16.cu
new file mode 100644
index 000000000..292585ef1
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scaleonly_f16_out_f16.cu
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h"
+
+namespace tensorrt_llm {
+namespace kernels {
+namespace cutlass_kernels {
+#ifdef ENABLE_FP8
+template class CutlassFpAIntBGemmRunner<__nv_fp8_e4m3,     /*Activation Type*/
+                                        cutlass::uint4b_t, /*Weight Type*/
+                                        cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY,
+                                        half, /*Scale and Zero Type*/
+                                        half, /*Bias Type*/
+                                        half  /*Output Type*/
+                                        >;
+#endif  // ENABLE_FP8
+}  // namespace cutlass_kernels
+}  // namespace kernels
+}  // namespace tensorrt_llm
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_per_col_f16_out_f16.cu b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_per_col_f16_out_f16.cu
new file mode 100644
index 000000000..5d044fd3d
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_per_col_f16_out_f16.cu
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h"
+
+namespace tensorrt_llm {
+namespace kernels {
+namespace cutlass_kernels {
+#ifdef ENABLE_FP8
+template class CutlassFpAIntBGemmRunner<__nv_fp8_e4m3,     /*Activation Type*/
+                                        cutlass::uint4b_t, /*Weight Type*/
+                                        cutlass::WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY,
+                                        half, /*Scale and Zero Type*/
+                                        half, /*Bias Type*/
+                                        half  /*Output Type*/
+                                        >;
+#endif  // ENABLE_FP8
+}  // namespace cutlass_kernels
+}  // namespace kernels
+}  // namespace tensorrt_llm
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_fg_scalebias.cu b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_fg_scalebias.cu
new file mode 100644
index 000000000..0ee34abc2
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_fg_scalebias.cu
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h"
+
+namespace tensorrt_llm {
+namespace kernels {
+namespace cutlass_kernels {
+template class CutlassFpAIntBGemmRunner<half, cutlass::uint4b_t,  // activation, weight
+                                        cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS>;
+}  // namespace cutlass_kernels
+}  // namespace kernels
+}  // namespace tensorrt_llm
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_fg_scaleonly.cu b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_fg_scaleonly.cu
new file mode 100644
index 000000000..0658cd080
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_fg_scaleonly.cu
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h"
+
+namespace tensorrt_llm {
+namespace kernels {
+namespace cutlass_kernels {
+template class CutlassFpAIntBGemmRunner<half, cutlass::uint4b_t,  // activation, weight
+                                        cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY>;
+}  // namespace cutlass_kernels
+}  // namespace kernels
+}  // namespace tensorrt_llm
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_per_col.cu b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_per_col.cu
new file mode 100644
index 000000000..d61be691d
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_per_col.cu
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h"
+
+namespace tensorrt_llm {
+namespace kernels {
+namespace cutlass_kernels {
+template class CutlassFpAIntBGemmRunner<half, cutlass::uint4b_t,  // activation, weight
+                                        cutlass::WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY>;
+}  // namespace cutlass_kernels
+}  // namespace kernels
+}  // namespace tensorrt_llm
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_fg_scalebias.cu b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_fg_scalebias.cu
new file mode 100644
index 000000000..98ef82ae1
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_fg_scalebias.cu
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h"
+
+namespace tensorrt_llm {
+namespace kernels {
+namespace cutlass_kernels {
+template class CutlassFpAIntBGemmRunner<half, uint8_t,  // activation, weight
+                                        cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS>;
+}  // namespace cutlass_kernels
+}  // namespace kernels
+}  // namespace tensorrt_llm
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_fg_scaleonly.cu b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_fg_scaleonly.cu
new file mode 100644
index 000000000..af0673bbc
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_fg_scaleonly.cu
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h"
+
+namespace tensorrt_llm {
+namespace kernels {
+namespace cutlass_kernels {
+template class CutlassFpAIntBGemmRunner<half, uint8_t,  // activation, weight
+                                        cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY>;
+}  // namespace cutlass_kernels
+}  // namespace kernels
+}  // namespace tensorrt_llm
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_per_col.cu b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_per_col.cu
new file mode 100644
index 000000000..0b453ff9a
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_per_col.cu
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h"
+
+namespace tensorrt_llm {
+namespace kernels {
+namespace cutlass_kernels {
+template class CutlassFpAIntBGemmRunner<half, uint8_t,  // activation, weight
+                                        cutlass::WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY>;
+}  // namespace cutlass_kernels
+}  // namespace kernels
+}  // namespace tensorrt_llm
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h
new file mode 100644
index 000000000..e535bdfa1
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuda_runtime_api.h>
+
+#include <vector>
+
+#include "../include/common.h"
+#include "cutlass_extensions/gemm_configs.h"
+#include "cutlass_extensions/weight_only_quant_op.h"
+
+namespace tkc = tensorrt_llm::cutlass_extensions;
+
+namespace tensorrt_llm {
+namespace kernels {
+namespace cutlass_kernels {
+
+/*
+  This runner only supports:
+  T in {half, __nv_bfloat16}, WeightType in {uint8_t, cutlass::uint4b_t}
+
+  Activations, biases, scales and outputs are all assumed to be row-major.
+
+  However, it is assumed that B is in a special format governed by
+  cutlass_extensions/gemm/kernel/mixed_gemm_B_layout.h. In this case, B must be preprocessed using
+  the cutlass weight-only quant preprocessors. The weight preprocessor will instantiate the layout
+  and preprocess based on the instantiation, so layout changes should only require modifications to
+  mixed_gemm_B_layout.h.
+*/
+
+// Abstract, type-erased interface of the fpA_intB GEMM runners; operands are untyped pointers.
+class CutlassFpAIntBGemmRunnerInterface {
+ public:
+  CutlassFpAIntBGemmRunnerInterface() {}
+  virtual ~CutlassFpAIntBGemmRunnerInterface() {}
+  // Overload taking weight scales only (no zero points, biases, alpha or group size).
+  virtual void gemm(void const* A, void const* B, void const* weight_scales, void* C, int m, int n,
+                    int k, tkc::CutlassGemmConfig gemmConfig, char* workspace_ptr,
+                    const size_t workspace_bytes, cudaStream_t stream) = 0;
+  // Same argument list as the overload above, plus `alpha`.
+  virtual void gemm(void const* A, void const* B, void const* weight_scales, float const alpha,
+                    void* C, int m, int n, int k, tkc::CutlassGemmConfig gemmConfig,
+                    char* workspace_ptr, const size_t workspace_bytes, cudaStream_t stream) = 0;
+  // Overload that additionally takes weight zero points, biases and a group size.
+  virtual void gemm(void const* A, void const* B, void const* weight_scales,
+                    void const* weight_zero_points, void const* biases, void* C, int m, int n,
+                    int k, int const group_size, tkc::CutlassGemmConfig gemmConfig,
+                    char* workspace_ptr, const size_t workspace_bytes, cudaStream_t stream) = 0;
+  // Same argument list as the overload above, plus `alpha`.
+  virtual void gemm(void const* A, void const* B, void const* weight_scales,
+                    void const* weight_zero_points, void const* biases, float const alpha, void* C,
+                    int m, int n, int k, int const group_size, tkc::CutlassGemmConfig gemmConfig,
+                    char* workspace_ptr, const size_t workspace_bytes, cudaStream_t stream) = 0;
+
+  // Returns desired workspace size in bytes.
+  virtual size_t getWorkspaceSize(int const m, int const n, int const k) = 0;
+
+  virtual std::vector<tkc::CutlassGemmConfig> getConfigs() const = 0;
+
+ protected:
+  // Tile and split-k limits; unused in this header — presumably read by subclasses (TODO confirm).
+  static constexpr int SPLIT_K_LIMIT = 7;
+  static constexpr int MIN_M_TILE = 16;
+  static constexpr int MIN_N_TILE = 64;
+  static constexpr int MAX_M_TILE_SM90 = 128;
+  static constexpr int MAX_N_TILE_SM90 = 256;
+};
+
+// Concrete runner; ScaleZeroType, BiasType and OutputType default to ActivationType.
+template <typename ActivationType, typename WeightType, cutlass::WeightOnlyQuantOp QuantOp,
+          typename ScaleZeroType = ActivationType, typename BiasType = ActivationType,
+          typename OutputType = ActivationType>
+class CutlassFpAIntBGemmRunner : public virtual CutlassFpAIntBGemmRunnerInterface {
+ public:
+  CutlassFpAIntBGemmRunner();
+  ~CutlassFpAIntBGemmRunner();
+
+  void gemm(void const* A, void const* B, void const* weight_scales, void* C, int m, int n, int k,
+            tkc::CutlassGemmConfig gemmConfig, char* workspace_ptr, const size_t workspace_bytes,
+            cudaStream_t stream) override;
+
+  void gemm(void const* A, void const* B, void const* weight_scales, float const alpha, void* C,
+            int m, int n, int k, tkc::CutlassGemmConfig gemmConfig, char* workspace_ptr,
+            const size_t workspace_bytes, cudaStream_t stream) override;
+
+  void gemm(void const* A, void const* B, void const* weight_scales, void const* weight_zero_points,
+            void const* biases, void* C, int m, int n, int k, int const group_size,
+            tkc::CutlassGemmConfig gemmConfig, char* workspace_ptr, const size_t workspace_bytes,
+            cudaStream_t stream) override;
+
+  void gemm(void const* A, void const* B, void const* weight_scales, void const* weight_zero_points,
+            void const* biases, float const alpha, void* C, int m, int n, int k,
+            int const group_size, tkc::CutlassGemmConfig gemmConfig, char* workspace_ptr,
+            const size_t workspace_bytes, cudaStream_t stream) override;
+
+  // Disabled since the fused GEMM + activation kernels will not be used in v1.
+  // void gemm_bias_act(const T* A, const WeightType* B, const T* weight_scales, const T* biases, T*
+  // C, int m, int n,
+  //     int k, ActivationType activation_type, char* workspace_ptr, const size_t workspace_bytes,
+  //     cudaStream_t stream);
+
+  // Returns desired workspace size in bytes.
+  size_t getWorkspaceSize(int const m, int const n, int const k) override;
+
+  std::vector<tkc::CutlassGemmConfig> getConfigs() const override;
+
+ private:  // dispatch_to_arch is defined in fpA_intB_gemm_template.h.
+  template <typename EpilogueTag>
+  void dispatch_to_arch(ActivationType const* A, WeightType const* B,
+                        ScaleZeroType const* weight_scales, ScaleZeroType const* weight_zero_points,
+                        BiasType const* biases, float const alpha, OutputType* C, int m, int n,
+                        int k, int const group_size, tkc::CutlassGemmConfig gemm_config,
+                        char* workspace_ptr, const size_t workspace_bytes, cudaStream_t stream,
+                        int* occupancy = nullptr);
+
+ private:
+  int sm_;  // Set from tk::getSMVersion() in the constructor.
+  int multi_processor_count_;  // cudaDevAttrMultiProcessorCount of the current device.
+};
+
+}  // namespace cutlass_kernels
+}  // namespace kernels
+}  // namespace tensorrt_llm
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h
new file mode 100644
index 000000000..0fc953671
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h
@@ -0,0 +1,592 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __GNUC__  // Check if the compiler is GCC or Clang
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#endif  // __GNUC__
+
+#include "cutlass/gemm/kernel/default_gemm.h"
+#include "cutlass_extensions/compute_occupancy.h"
+#include "cutlass_extensions/epilogue_helpers.h"
+#include "cutlass_extensions/gemm/device/gemm_universal_base_compat.h"
+#include "cutlass_extensions/gemm/kernel/default_fpA_intB_traits.h"
+#include "cutlass_extensions/gemm/kernel/fpA_intB_gemm.h"
+#include "cutlass_extensions/gemm/threadblock/default_mma.h"
+#include "cutlass_extensions/gemm_configs.h"
+
+#ifdef __GNUC__  // Check if the compiler is GCC or Clang
+#pragma GCC diagnostic pop
+#endif  // __GNUC__
+
+#include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/cudaUtils.h"
+#include "tensorrt_llm/common/logger.h"
+#include "tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h"
+#include "tensorrt_llm/kernels/cutlass_kernels/cutlass_type_conversion.h"
+#include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h"
+#include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template_sm90.h"
+
+namespace tk = tensorrt_llm::common;
+namespace tkc = tensorrt_llm::cutlass_extensions;
+
+namespace tensorrt_llm {
+namespace kernels {
+namespace cutlass_kernels {
+
+// Builds the CUTLASS fpA_intB GEMM for one tile/stage config and runs it (or reports occupancy).
+template <typename ActivationType, typename WeightType, typename ScaleZeroType, typename BiasType,
+          typename OutputType, typename arch, cutlass::WeightOnlyQuantOp QuantOp,
+          typename EpilogueTag, typename ThreadblockShape, typename WarpShape, int Stages>
+void generic_mixed_gemm_kernelLauncher(
+    ActivationType const* A, WeightType const* B, ScaleZeroType const* weight_scales,
+    ScaleZeroType const* weight_zero_points, BiasType const* biases, float const alpha,
+    OutputType* C, int m, int n, int k, int const group_size, tkc::CutlassGemmConfig gemm_config,
+    char* workspace, size_t workspace_bytes, cudaStream_t stream, int* occupancy = nullptr) {
+  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
+#ifdef ENABLE_BF16
+  static_assert(
+#ifdef ENABLE_FP8
+      cutlass::platform::is_same<ActivationType, __nv_fp8_e4m3>::value ||
+#endif
+          cutlass::platform::is_same<ActivationType, __nv_bfloat16>::value ||
+          cutlass::platform::is_same<ActivationType, half>::value ||
+          cutlass::platform::is_same<ActivationType, float>::value,
+      "Specialized for bfloat16, half, float");
+#else
+  static_assert(cutlass::platform::is_same<ActivationType, half>::value ||
+                    cutlass::platform::is_same<ActivationType, float>::value,
+                "Specialized for half, float");
+#endif
+
+  static_assert(cutlass::platform::is_same<ActivationType, WeightType>::value ||
+                    cutlass::platform::is_same<WeightType, uint8_t>::value ||
+                    cutlass::platform::is_same<WeightType, cutlass::uint4b_t>::value,
+                "");
+
+  // The cutlass type for the input elements. This is needed to convert to cutlass::half_t if
+  // necessary.
+  using CutlassActivationType = typename TllmToCutlassTypeAdapter<ActivationType>::type;
+  using CutlassWeightType = typename TllmToCutlassTypeAdapter<WeightType>::type;
+  using CutlassScaleZeroType = typename TllmToCutlassTypeAdapter<ScaleZeroType>::type;
+  using CutlassBiasType = typename TllmToCutlassTypeAdapter<BiasType>::type;
+  using CutlassOutputType = typename TllmToCutlassTypeAdapter<OutputType>::type;
+
+  // We need separate config for each architecture since we will target different tensorcore
+  // instructions. For float, we do not target TCs.
+  using MixedGemmArchTraits =
+      cutlass::gemm::kernel::MixedGemmArchTraits<CutlassActivationType, CutlassWeightType, arch>;
+  using ElementAccumulator = typename MixedGemmArchTraits::AccType;
+
+  constexpr int ElementsPerAccessC = 128 / cutlass::sizeof_bits<CutlassOutputType>::value;
+  using EpilogueOp = typename tkc::Epilogue<CutlassOutputType, ElementsPerAccessC,
+                                            ElementAccumulator, EpilogueTag>::Op;
+
+  using Operator = typename MixedGemmArchTraits::Operator;
+  using TaggedOperator = typename cutlass::arch::TagOperator<Operator, QuantOp>::TaggedOperator;
+  // Stock DefaultGemm; only its Mma, Epilogue, swizzle and kSplitKSerial feed GemmFpAIntB below.
+  using GemmKernel_ = typename cutlass::gemm::kernel::DefaultGemm<
+      CutlassActivationType, cutlass::layout::RowMajor, MixedGemmArchTraits::ElementsPerAccessA,
+      CutlassWeightType, typename MixedGemmArchTraits::LayoutB,
+      MixedGemmArchTraits::ElementsPerAccessB, CutlassOutputType, cutlass::layout::RowMajor,
+      ElementAccumulator, cutlass::arch::OpClassTensorOp, arch, ThreadblockShape, WarpShape,
+      typename MixedGemmArchTraits::InstructionShape, EpilogueOp,
+      typename cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>, Stages, true,
+      TaggedOperator>::GemmKernel;
+
+  using GemmKernel =
+      cutlass::gemm::kernel::GemmFpAIntB<typename GemmKernel_::Mma, typename GemmKernel_::Epilogue,
+                                         typename GemmKernel_::ThreadblockSwizzle,
+                                         arch,  // Ensure top level arch is used for dispatch
+                                         GemmKernel_::kSplitKSerial>;
+  // Occupancy-query mode: report the kernel's occupancy and return without launching.
+  if (occupancy != nullptr) {
+    *occupancy = tensorrt_llm::cutlass_extensions::compute_occupancy_for_kernel<GemmKernel>();
+    return;
+  }
+
+  using Gemm = cutlass::gemm::device::GemmUniversalBaseCompat<GemmKernel>;
+  // Leading dimension of B: n when B is row-major, otherwise k * kInterleave.
+  int const ldb = cutlass::platform::is_same<cutlass::layout::RowMajor,
+                                             typename MixedGemmArchTraits::LayoutB>::value
+                      ? n
+                      : k * GemmKernel::kInterleave;
+
+  if (weight_scales == nullptr) {
+    throw std::runtime_error("Weight scales must always be set to a non-null value.");
+  }
+
+  if constexpr (cutlass::isFinegrained(QuantOp)) {
+    if constexpr (cutlass::platform::is_same<CutlassActivationType, float_e4m3_t>::value) {
+      if (group_size != 128) {
+        throw std::runtime_error(
+            "Only group size 128 supported for fine grained W4A(fp)8 kernels.");
+      }
+    }
+    if (group_size != 64 && group_size != 128) {
+      throw std::runtime_error("Only group size 64 and 128 supported for fine grained kernels.");
+    }
+
+    if constexpr (QuantOp == cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY) {
+      if (weight_zero_points != nullptr) {
+        throw std::runtime_error(
+            "Weight zero pointer must be a nullptr for scale only fine grained");
+      }
+    } else if constexpr (QuantOp == cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS) {
+      if (weight_zero_points == nullptr) {
+        throw std::runtime_error(
+            "Weight zero pointer must be valid for scale and bias fine grained");
+      }
+    }
+  } else {
+    if (group_size != k) {
+      throw std::runtime_error("Invalid group size for per column scaling kernels.");
+    }
+
+    if (weight_zero_points != nullptr) {
+      throw std::runtime_error("Weight zero-points must be null when running per column scaling");
+    }
+  }
+  // Scale/zero stride is n for fine-grained modes, else 0; beta is 1 only when biases are given.
+  int const ld_scale_zero = cutlass::isFinegrained(QuantOp) ? n : 0;
+  ElementAccumulator output_op_beta =
+      (biases == nullptr) ? ElementAccumulator(0.f) : ElementAccumulator(1.f);
+  typename Gemm::Arguments args(
+      {m, n, k}, group_size,
+      {reinterpret_cast<CutlassActivationType*>(const_cast<ActivationType*>(A)), k},
+      {reinterpret_cast<CutlassWeightType*>(const_cast<WeightType*>(B)), ldb},
+      {reinterpret_cast<CutlassScaleZeroType*>(const_cast<ScaleZeroType*>(weight_scales)),
+       ld_scale_zero},
+      {reinterpret_cast<CutlassScaleZeroType*>(const_cast<ScaleZeroType*>(weight_zero_points)),
+       ld_scale_zero},
+      {reinterpret_cast<CutlassBiasType*>(const_cast<BiasType*>(biases)), 0},
+      {reinterpret_cast<CutlassOutputType*>(C), n}, gemm_config.split_k_factor,
+      {ElementAccumulator(alpha), output_op_beta});
+
+  // This assertion is enabled because for the column interleaved layout, K MUST be a multiple of
+  // threadblockK. The reason for this is that the default pitchlinear iterators are used to handle
+  // walking over the interleaved matrix. The way masking is handled in these does not map to the
+  // interleaved layout. We need to write our own predicated iterator in order to relax this
+  // limitation.
+  if (GemmKernel::kInterleave > 1 &&
+      ((k % MixedGemmArchTraits::ThreadblockK) ||
+       ((k / gemm_config.split_k_factor) % MixedGemmArchTraits::ThreadblockK))) {
+    throw std::runtime_error("Temp assertion: k must be multiple of threadblockK");
+  }
+
+  Gemm gemm;
+  if (gemm.get_workspace_size(args) > workspace_bytes) {
+    TLLM_LOG_WARNING(
+        "Requested split-k but workspace size insufficient. Falling back to non-split-k "
+        "implementation.");
+    // If requested split-k factor will require more workspace bytes, revert to standard gemm.
+    args.batch_count = 1;
+  }
+
+  auto can_implement = gemm.can_implement(args);
+  if (can_implement != cutlass::Status::kSuccess) {
+    std::string err_msg = "fpA_intB cutlass kernel will fail for params. Error: " +
+                          std::string(cutlassGetStatusString(can_implement));
+    throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] " + err_msg);
+  }
+
+  auto init_status = gemm.initialize(args, workspace, stream);
+  if (init_status != cutlass::Status::kSuccess) {
+    std::string err_msg = "Failed to initialize cutlass fpA_intB gemm. Error: " +
+                          std::string(cutlassGetStatusString(init_status));
+    throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] " + err_msg);
+  }
+
+  auto run_status = gemm.run(stream);
+  if (run_status != cutlass::Status::kSuccess) {
+    std::string err_msg = "Failed to run cutlass fpA_intB gemm. Error: " +
+                          std::string(cutlassGetStatusString(run_status));
+    throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] " + err_msg);
+  }
+}
+
+// This filters out invalid template combinations that we DON'T want instantiated in CUTLASS. For
+// example, instantiating SM=75, Stages=3 is invalid so we would need to filter that out. Fine
+// grained quantization is only supported on Ampere+ GPUs. FP8 GEMM is only supported on Ada+ GPUs.
+template <typename ActivationType, typename WeightType, typename ScaleZeroType, typename BiasType,
+          typename OutputType, typename arch, cutlass::WeightOnlyQuantOp QuantOp,
+          typename EpilogueTag, typename ThreadblockShape, typename WarpShape, int Stages>
+void filter_and_run_mixed_gemm(ActivationType const* A, WeightType const* B,
+                               ScaleZeroType const* weight_scales,
+                               ScaleZeroType const* weight_zero_points, BiasType const* biases,
+                               float const alpha, OutputType* C, int m, int n, int k,
+                               int const group_size, tkc::CutlassGemmConfig gemm_config,
+                               char* workspace, size_t workspace_bytes, cudaStream_t stream,
+                               int* occupancy = nullptr) {
+  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
+  if constexpr (Stages > 2 && arch::kMinComputeCapability < 80) {
+    // Multistage (Stages > 2) is only supported on Ampere (SM80) and newer
+    std::string err_msg = "Cutlass fpA_intB gemm not supported for arch " +
+                          std::to_string(arch::kMinComputeCapability) + " with stages set to " +
+                          std::to_string(Stages);
+    throw std::runtime_error("[TensorRT-LLm Error][filter_and_run_mixed_gemm] " + err_msg);
+  } else if constexpr (Stages == 2 && arch::kMinComputeCapability >= 89) {
+    // Two-stage kernels are rejected on Ada (SM89) and newer
+    std::string err_msg = "Cutlass fpA_intB gemm not supported for arch " +
+                          std::to_string(arch::kMinComputeCapability) + " with stages set to " +
+                          std::to_string(Stages);
+    throw std::runtime_error("[TensorRT-LLm Error][filter_and_run_mixed_gemm] " + err_msg);
+  } else if constexpr (cutlass::platform::is_same<ActivationType, __nv_fp8_e4m3>::value &&
+                       arch::kMinComputeCapability < 89) {
+    // FP8 activation type only supported on Ada+ GPUs
+    std::string err_msg = "Cutlass fpA_intB gemm not supported for arch " +
+                          std::to_string(arch::kMinComputeCapability) +
+                          " with activation type set to FP8";
+    throw std::runtime_error("[TensorRT-LLm Error][filter_and_run_mixed_gemm] " + err_msg);
+  } else {
+    generic_mixed_gemm_kernelLauncher<ActivationType, WeightType, ScaleZeroType, BiasType,
+                                      OutputType, arch, QuantOp, EpilogueTag, ThreadblockShape,
+                                      WarpShape, Stages>(
+        A, B, weight_scales, weight_zero_points, biases, alpha, C, m, n, k, group_size, gemm_config,
+        workspace, workspace_bytes, stream, occupancy);
+  }
+}
+
+template <typename ActivationType, typename WeightType, typename ScaleZeroType, typename BiasType,
+          typename OutputType, typename arch, cutlass::WeightOnlyQuantOp QuantOp,
+          typename EpilogueTag, typename ThreadblockShape, typename WarpShape>
+void dispatch_gemm_config(ActivationType const* A, WeightType const* B,
+                          ScaleZeroType const* weight_scales,
+                          ScaleZeroType const* weight_zero_points, BiasType const* biases,
+                          float const alpha, OutputType* C, int m, int n, int k,
+                          int const group_size, tkc::CutlassGemmConfig gemm_config, char* workspace,
+                          size_t workspace_bytes, cudaStream_t stream, int* occupancy = nullptr) {
+  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
+  // Lifts a runtime stage count into the compile-time Stages argument and forwards every other
+  // argument unchanged; the tag's value is read through its type only.
+  auto run_with_stages = [&](auto stage_tag) {
+    filter_and_run_mixed_gemm<ActivationType, WeightType, ScaleZeroType, BiasType, OutputType,
+                              arch, QuantOp, EpilogueTag, ThreadblockShape, WarpShape,
+                              decltype(stage_tag)::value>(
+        A, B, weight_scales, weight_zero_points, biases, alpha, C, m, n, k, group_size,
+        gemm_config, workspace, workspace_bytes, stream, occupancy);
+  };
+  switch (gemm_config.stages) {
+    case 2:
+      run_with_stages(std::integral_constant<int, 2>{});
+      break;
+    case 3:
+      run_with_stages(std::integral_constant<int, 3>{});
+      break;
+    case 4:
+      run_with_stages(std::integral_constant<int, 4>{});
+      break;
+    default:
+      throw std::runtime_error(
+          "[TensorRT-LLm Error][dispatch_gemm_config] "
+          "dispatch_gemm_config does not support stages " +
+          std::to_string(gemm_config.stages));
+  }
+}
+
+template <typename T>
+constexpr bool is_fp8() {  // True iff T is __nv_fp8_e4m3 or __nv_fp8_e5m2.
+  return std::disjunction_v<std::is_same<T, __nv_fp8_e4m3>, std::is_same<T, __nv_fp8_e5m2>>;
+}
+
+// Maps gemm_config.tile_config_sm80 to compile-time threadblock/warp shapes and dispatches.
+template <typename ActivationType, typename WeightType, typename ScaleZeroType, typename BiasType,
+          typename OutputType, typename arch, cutlass::WeightOnlyQuantOp QuantOp,
+          typename EpilogueTag>
+void dispatch_gemm_to_cutlass(ActivationType const* A, WeightType const* B,
+                              ScaleZeroType const* weight_scales,
+                              ScaleZeroType const* weight_zero_points, BiasType const* biases,
+                              float const alpha, OutputType* C, int m, int n, int k,
+                              int const group_size, char* workspace, size_t workspace_bytes,
+                              tkc::CutlassGemmConfig gemm_config, cudaStream_t stream,
+                              int* occupancy = nullptr) {
+  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
+  // Don't instantiate configs that are not supported pre-hopper. Produce a sensible error instead.
+  constexpr bool any_is_fp8 = is_fp8<ActivationType>() || is_fp8<WeightType>() ||
+                              is_fp8<ScaleZeroType>() || is_fp8<BiasType>() || is_fp8<OutputType>();
+
+  constexpr bool all_types_are_the_same = std::is_same_v<ActivationType, ScaleZeroType> &&
+                                          std::is_same_v<ActivationType, BiasType> &&
+                                          std::is_same_v<ActivationType, OutputType>;
+  constexpr bool is_valid_pre_hopper =
+      (all_types_are_the_same && !any_is_fp8) || (arch::kMinComputeCapability == 89);
+
+  if constexpr (is_valid_pre_hopper) {
+    // SIMT configs are omitted here since they are not supported for fpA_intB. Only configs with
+    // threadblockShapeM == warpShapeM are instantiated since those usually perform the best for
+    // mixed type gemms. Tile K spans 128 * 8 bits of activations (64 only for 16-bit types).
+    constexpr int tile_shape_k = 128 * 8 / cutlass::sizeof_bits<ActivationType>::value;
+    switch (gemm_config.tile_config_sm80) {
+      case tkc::CutlassTileConfig::CtaShape16x128x64_WarpShape16x32x64:
+        dispatch_gemm_config<ActivationType, WeightType, ScaleZeroType, BiasType, OutputType, arch,
+                             QuantOp, EpilogueTag, cutlass::gemm::GemmShape<16, 128, tile_shape_k>,
+                             cutlass::gemm::GemmShape<16, 32, tile_shape_k>>(
+            A, B, weight_scales, weight_zero_points, biases, alpha, C, m, n, k, group_size,
+            gemm_config, workspace, workspace_bytes, stream, occupancy);
+        break;
+      case tkc::CutlassTileConfig::CtaShape16x256x64_WarpShape16x64x64:
+        dispatch_gemm_config<ActivationType, WeightType, ScaleZeroType, BiasType, OutputType, arch,
+                             QuantOp, EpilogueTag, cutlass::gemm::GemmShape<16, 256, tile_shape_k>,
+                             cutlass::gemm::GemmShape<16, 64, tile_shape_k>>(
+            A, B, weight_scales, weight_zero_points, biases, alpha, C, m, n, k, group_size,
+            gemm_config, workspace, workspace_bytes, stream, occupancy);
+        break;
+      case tkc::CutlassTileConfig::CtaShape32x128x64_WarpShape32x32x64:
+        dispatch_gemm_config<ActivationType, WeightType, ScaleZeroType, BiasType, OutputType, arch,
+                             QuantOp, EpilogueTag, cutlass::gemm::GemmShape<32, 128, tile_shape_k>,
+                             cutlass::gemm::GemmShape<32, 32, tile_shape_k>>(
+            A, B, weight_scales, weight_zero_points, biases, alpha, C, m, n, k, group_size,
+            gemm_config, workspace, workspace_bytes, stream, occupancy);
+        break;
+      case tkc::CutlassTileConfig::CtaShape64x128x64_WarpShape64x32x64:
+        dispatch_gemm_config<ActivationType, WeightType, ScaleZeroType, BiasType, OutputType, arch,
+                             QuantOp, EpilogueTag, cutlass::gemm::GemmShape<64, 128, tile_shape_k>,
+                             cutlass::gemm::GemmShape<64, 32, tile_shape_k>>(
+            A, B, weight_scales, weight_zero_points, biases, alpha, C, m, n, k, group_size,
+            gemm_config, workspace, workspace_bytes, stream, occupancy);
+        break;
+      case tkc::CutlassTileConfig::CtaShape128x128x64_WarpShape128x32x64:
+        dispatch_gemm_config<ActivationType, WeightType, ScaleZeroType, BiasType, OutputType, arch,
+                             QuantOp, EpilogueTag, cutlass::gemm::GemmShape<128, 128, tile_shape_k>,
+                             cutlass::gemm::GemmShape<128, 32, tile_shape_k>>(
+            A, B, weight_scales, weight_zero_points, biases, alpha, C, m, n, k, group_size,
+            gemm_config, workspace, workspace_bytes, stream, occupancy);
+        break;
+      case tkc::CutlassTileConfig::Undefined:
+        throw std::runtime_error(
+            "[TensorRT-LLm Error][fpA_intB][dispatch_gemm_to_cutlass] gemm config undefined.");
+        break;
+      case tkc::CutlassTileConfig::ChooseWithHeuristic:
+        throw std::runtime_error(
+            "[TensorRT-LLm Error][fpA_intB][dispatch_gemm_to_cutlass] gemm config should have "
+            "already been set by "
+            "heuristic.");
+        break;
+      default:
+        throw std::runtime_error(
+            "[TensorRT-LLm Error][fpA_intB][dispatch_gemm_to_cutlass] Config is invalid for mixed "
+            "type GEMM.");
+        break;
+    }
+  } else {
+    // This is not a limitation in CUTLASS. We just do not need to support this case.
+    // NOTE(review): also reached when any type is FP8 on a non-SM89 arch; the message omits that.
+    std::string err_msg =
+        "The activation type must equal the scale, bias and output types on Ampere and earlier.";
+    throw std::runtime_error("[TensorRT-LLm Error][dispatch_gemm_to_cutlass] " + err_msg);
+  }
+}
+
+template <typename ActivationType, typename WeightType, cutlass::WeightOnlyQuantOp QuantOp,
+          typename ScaleZeroType, typename BiasType, typename OutputType>
+CutlassFpAIntBGemmRunner<ActivationType, WeightType, QuantOp, ScaleZeroType, BiasType,
+                         OutputType>::CutlassFpAIntBGemmRunner() {  // Caches device properties.
+  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
+  int device{-1};  // Current CUDA device, used for the SM-count query below.
+  tk::check_cuda_error(cudaGetDevice(&device));
+  sm_ = tk::getSMVersion();  // Cached; dispatch_to_arch() branches on it.
+  tk::check_cuda_error(
+      cudaDeviceGetAttribute(&multi_processor_count_, cudaDevAttrMultiProcessorCount, device));
+}
+
+template <typename ActivationType, typename WeightType, cutlass::WeightOnlyQuantOp QuantOp,
+          typename ScaleZeroType, typename BiasType, typename OutputType>
+CutlassFpAIntBGemmRunner<ActivationType, WeightType, QuantOp, ScaleZeroType, BiasType,
+                         OutputType>::~CutlassFpAIntBGemmRunner() {
+  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);  // Only traces destruction; nothing else is done here.
+}
+
+// Routes the GEMM to the kernel family matching the cached SM version: SM75, SM80 (also used for
+// SM100+), SM89 or SM90. Throws std::runtime_error for any other architecture.
+template <typename ActivationType, typename WeightType, cutlass::WeightOnlyQuantOp QuantOp,
+          typename ScaleZeroType, typename BiasType, typename OutputType>
+template <typename EpilogueTag>
+void CutlassFpAIntBGemmRunner<ActivationType, WeightType, QuantOp, ScaleZeroType, BiasType,
+                              OutputType>::
+    dispatch_to_arch(ActivationType const* A, WeightType const* B,
+                     ScaleZeroType const* weight_scales, ScaleZeroType const* weight_zero_points,
+                     BiasType const* biases, float const alpha, OutputType* C, int m, int n, int k,
+                     int const group_size, tkc::CutlassGemmConfig gemm_config,
+                     char* workspace_ptr, const size_t workspace_bytes, cudaStream_t stream,
+                     int* occupancy) {
+  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
+  if (sm_ >= 75 && sm_ < 80) {
+    dispatch_gemm_to_cutlass<ActivationType, WeightType, ScaleZeroType, BiasType, OutputType,
+                             cutlass::arch::Sm75, QuantOp, EpilogueTag>(
+        A, B, weight_scales, weight_zero_points, biases, alpha, C, m, n, k, group_size,
+        workspace_ptr, workspace_bytes, gemm_config, stream, occupancy);
+  } else if ((sm_ >= 80 && sm_ < 89) || sm_ >= 100) {
+    dispatch_gemm_to_cutlass<ActivationType, WeightType, ScaleZeroType, BiasType, OutputType,
+                             cutlass::arch::Sm80, QuantOp, EpilogueTag>(
+        A, B, weight_scales, weight_zero_points, biases, alpha, C, m, n, k, group_size,
+        workspace_ptr, workspace_bytes, gemm_config, stream, occupancy);
+  } else if (sm_ == 89) {
+    // defined() matches the file's #ifdef ENABLE_FP8 guards and accepts an empty definition.
+#if defined(ENABLE_FP8) && \
+    ((__CUDACC_VER_MAJOR__ < 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 4))
+    if constexpr (cutlass::platform::is_same<ActivationType, __nv_fp8_e4m3>::value) {
+      throw std::runtime_error(
+          "[TensorRT-LLM Error][CutlassFpAIntBGemmRunner][dispatch_to_arch] INT4xFP8 GEMM for Ada "
+          "needs "
+          "CUDA>=12.4");
+    }
+#endif
+    dispatch_gemm_to_cutlass<ActivationType, WeightType, ScaleZeroType, BiasType, OutputType,
+                             cutlass::arch::Sm89, QuantOp, EpilogueTag>(
+        A, B, weight_scales, weight_zero_points, biases, alpha, C, m, n, k, group_size,
+        workspace_ptr, workspace_bytes, gemm_config, stream, occupancy);
+  } else if (sm_ == 90) {
+    static_assert(!cutlass::platform::is_same<ActivationType, __nv_fp8_e4m3>::value ||
+                      cutlass::platform::is_same<ScaleZeroType, half>::value,
+                  "ScaleZeroType must be half for activation=fp8");
+    sm90_dispatch_gemm_to_cutlass<ActivationType, WeightType, ScaleZeroType, BiasType, OutputType,
+                                  QuantOp, EpilogueTag>(
+        A, B, weight_scales, weight_zero_points, biases, alpha, C, m, n, k, group_size,
+        workspace_ptr, workspace_bytes, gemm_config, stream, occupancy);
+  } else {
+    throw std::runtime_error(
+        "[TensorRT-LLM Error][CutlassFpAIntBGemmRunner][dispatch_to_arch] Arch unsupported for "
+        "CUTLASS mixed type "
+        "GEMM");
+  }
+}
+
+template <typename ActivationType, typename WeightType, cutlass::WeightOnlyQuantOp QuantOp,
+          typename ScaleZeroType, typename BiasType, typename OutputType>
+void CutlassFpAIntBGemmRunner<ActivationType, WeightType, QuantOp, ScaleZeroType, BiasType,
+                              OutputType>::gemm(void const* A, void const* B,
+                                                void const* weight_scales,
+                                                void const* weight_zero_points, void const* biases,
+                                                float const alpha, void* C, int m, int n, int k,
+                                                int const group_size,
+                                                tkc::CutlassGemmConfig gemmConfig,
+                                                char* workspace_ptr, const size_t workspace_bytes,
+                                                cudaStream_t stream) {
+  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
+  if constexpr ((QuantOp != cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS) &&
+                (QuantOp != cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY)) {
+    throw std::runtime_error(  // the group-size overload is rejected for every other QuantOp
+        "Overload with scale, zero and group size only supported for fine grained bias template.");
+  } else {  // fine-grained QuantOp: cast the type-erased pointers and hand off to dispatch_to_arch
+    dispatch_to_arch<tkc::EpilogueOpBias>(
+        (ActivationType const*)A, (WeightType const*)B, (ScaleZeroType const*)weight_scales,
+        (ScaleZeroType const*)weight_zero_points, (BiasType const*)biases, alpha, (OutputType*)C, m,
+        n, k, group_size, gemmConfig, workspace_ptr, workspace_bytes, stream, nullptr);
+  }
+}
+
+template <typename ActivationType, typename WeightType, cutlass::WeightOnlyQuantOp QuantOp,
+          typename ScaleZeroType, typename BiasType, typename OutputType>
+void CutlassFpAIntBGemmRunner<ActivationType, WeightType, QuantOp, ScaleZeroType, BiasType,
+                              OutputType>::gemm(void const* A, void const* B,
+                                                void const* weight_scales,
+                                                void const* weight_zero_points, void const* biases,
+                                                void* C, int m, int n, int k, int const group_size,
+                                                tkc::CutlassGemmConfig gemmConfig,
+                                                char* workspace_ptr, const size_t workspace_bytes,
+                                                cudaStream_t stream) {
+  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);  // then defer to the alpha-taking overload with alpha = 1
+  this->gemm(A, B, weight_scales, weight_zero_points, biases, /*alpha=*/1.0f, C, m, n, k,
+             group_size, gemmConfig, workspace_ptr, workspace_bytes, stream);
+}
+
+template <typename ActivationType, typename WeightType, cutlass::WeightOnlyQuantOp QuantOp,
+          typename ScaleZeroType, typename BiasType, typename OutputType>
+void CutlassFpAIntBGemmRunner<ActivationType, WeightType, QuantOp, ScaleZeroType, BiasType,
+                              OutputType>::gemm(void const* A, void const* B,
+                                                void const* weight_scales, float const alpha,
+                                                void* C, int m, int n, int k,
+                                                tkc::CutlassGemmConfig gemmConfig,
+                                                char* workspace_ptr, const size_t workspace_bytes,
+                                                cudaStream_t stream) {
+  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
+  // This overload passes no zero points and no biases, and uses k itself as the group size.
+  if constexpr (QuantOp != cutlass::WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY) {
+    throw std::runtime_error(
+        "Overload with scale only (and no group size) only supported for per column scaling.");
+  } else {
+    dispatch_to_arch<tkc::EpilogueOpBias>(
+        (ActivationType const*)A, (WeightType const*)B, (ScaleZeroType const*)weight_scales,
+        /*weight_zero_points=*/nullptr, /*biases=*/nullptr, alpha, (OutputType*)C, m, n, k,
+        /*group_size=*/k, gemmConfig, workspace_ptr, workspace_bytes, stream, nullptr);
+  }
+}
+
+template <typename ActivationType, typename WeightType, cutlass::WeightOnlyQuantOp QuantOp,
+          typename ScaleZeroType, typename BiasType, typename OutputType>
+void CutlassFpAIntBGemmRunner<ActivationType, WeightType, QuantOp, ScaleZeroType, BiasType,
+                              OutputType>::gemm(void const* A, void const* B,
+                                                void const* weight_scales, void* C, int m, int n,
+                                                int k, tkc::CutlassGemmConfig gemmConfig,
+                                                char* workspace_ptr, const size_t workspace_bytes,
+                                                cudaStream_t stream) {
+  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);  // then defer to the alpha-taking overload with alpha = 1
+  gemm(A, B, weight_scales, 1.0f, C, m, n, k, gemmConfig, workspace_ptr, workspace_bytes, stream);
+}
+
+template <typename ActivationType, typename WeightType, cutlass::WeightOnlyQuantOp QuantOp,
+          typename ScaleZeroType, typename BiasType, typename OutputType>
+std::vector<tkc::CutlassGemmConfig> CutlassFpAIntBGemmRunner<
+    ActivationType, WeightType, QuantOp, ScaleZeroType, BiasType, OutputType>::getConfigs() const {
+  using ConfigParam = tkc::CutlassGemmConfig::CandidateConfigTypeParam;
+  // Treated as weight-only whenever the activation and weight element types differ.
+  constexpr bool weight_only = !std::is_same<ActivationType, WeightType>::value;
+  // Start from the HOPPER flag and OR in WEIGHT_ONLY when applicable.
+  ConfigParam flags = ConfigParam::HOPPER;
+  if (weight_only) {
+    flags = static_cast<ConfigParam>(flags | ConfigParam::WEIGHT_ONLY);
+  }
+  // sm_, SPLIT_K_LIMIT and the flags are handed to the heuristic helper, which builds the list.
+  return get_candidate_configs(sm_, SPLIT_K_LIMIT, flags);
+}
+
+template <typename ActivationType, typename WeightType, cutlass::WeightOnlyQuantOp QuantOp,
+          typename ScaleZeroType, typename BiasType, typename OutputType>
+size_t CutlassFpAIntBGemmRunner<ActivationType, WeightType, QuantOp, ScaleZeroType, BiasType,
+                                OutputType>::getWorkspaceSize(int const m, int const n,
+                                                              int const k) {
+  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
+  // For Hopper, reserve a large workspace that does not depend on m/n, in case stream-K is used.
+  if (sm_ == 90) {
+    // https://github.com/NVIDIA/cutlass/blob/19b4c5e065e7e5bbc8082dfc7dbd792bdac850fc/include/cutlass/gemm/kernel/tile_scheduler_params.h#L878-L892
+    // The above lines say sk_tiles = output_tiles - (static_cast<uint32_t>(output_tiles /
+    // ctas_per_wave) - 1) * ctas_per_wave. This means sk_tiles is at most 2 * ctas_per_wave, which
+    // is 2 * multi_processor_count_.
+    int const max_sk_tiles = 2 * multi_processor_count_;
+
+    // https://github.com/NVIDIA/cutlass/blob/19b4c5e065e7e5bbc8082dfc7dbd792bdac850fc/include/cutlass/gemm/kernel/tile_scheduler_params.h#L939
+    // The above line says uint64_t sk_units = platform::min(ctas_per_sk_wave, min_sized_sk_units);
+    // That means sk_units is at most ctas_per_sk_wave, which is multi_processor_count_.
+    int const max_sk_units = multi_processor_count_;
+
+    // https://github.com/NVIDIA/cutlass/blob/19b4c5e065e7e5bbc8082dfc7dbd792bdac850fc/include/cutlass/gemm/kernel/tile_scheduler_params.h#L505
+    // The above line scales sk_tiles by the factor static_cast<uint32_t>(sk_units / sk_tiles +
+    // 2). That means the final sk_tiles is at most 2 * max_sk_tiles + max_sk_units.
+    int const max_sk_tiles_with_separate_reduction = 2 * max_sk_tiles + max_sk_units;
+    // Widen to size_t before multiplying so the byte count is never formed in int arithmetic.
+    return static_cast<size_t>(max_sk_tiles_with_separate_reduction) * MAX_M_TILE_SM90 *
+           MAX_N_TILE_SM90 * sizeof(float);
+  }
+  // These are the min tile sizes for each config, which would launch the maximum number of blocks
+  size_t const max_grid_m = static_cast<size_t>(cutlass::ceil_div(m, MIN_M_TILE));
+  size_t const max_grid_n = static_cast<size_t>(cutlass::ceil_div(n, MIN_N_TILE));
+  // Worst case is 4 bytes per block with split_k_limit in z; size_t math avoids int overflow.
+  return max_grid_m * max_grid_n * static_cast<size_t>(SPLIT_K_LIMIT) * 4;
+}
+
+}  // namespace cutlass_kernels
+}  // namespace kernels
+}  // namespace tensorrt_llm
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template_sm90.h b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template_sm90.h
new file mode 100644
index 000000000..d3219b105
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template_sm90.h
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "cute/numeric/integral_constant.hpp"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/cudaUtils.h"
+#include "tensorrt_llm/common/logger.h"
+#include "tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h"
+#include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.h"
+
+namespace tensorrt_llm {
+namespace kernels {
+namespace cutlass_kernels {
+namespace tk = tensorrt_llm::common;
+namespace tkc = tensorrt_llm::cutlass_extensions;
+
+using namespace cute;
+
+// This filters out invalid template combinations that we DON'T want instantiated in CUTLASS. For
+// example, instantiating SM=75, Stages=3 is invalid so we would need to filter that out. Fine
+// grained quantization is only supported on Ampere+ GPUs.
+template <typename ActivationType, typename WeightType, typename ScaleZeroType, typename BiasType,
+          typename OutputType, cutlass::WeightOnlyQuantOp QuantOp, typename EpilogueTag,
+          typename CTAShape, typename ClusterShape, typename MainloopScheduleType>
+void sm90_dispatch_epilogue_schedules(
+    ActivationType const* A, WeightType const* B, ScaleZeroType const* weight_scales,
+    ScaleZeroType const* weight_zero_points, BiasType const* biases, float const alpha,
+    OutputType* C, int m, int n, int k, int const group_size, tkc::CutlassGemmConfig gemm_config,
+    char* workspace, size_t workspace_bytes, cudaStream_t stream, int* occupancy = nullptr) {
+  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
+
+  // AUTO is the only epilogue schedule handled here; any other request is an error.
+  if (gemm_config.epilogue_schedule != tkc::EpilogueScheduleType::AUTO) {
+    throw std::runtime_error(
+        "[TensorRT-LLM Error][fpA_intB][sm90_dispatch_epilogue_schedules] epilogue schedule "
+        "config is invalid for "
+        "mixed "
+        "type GEMM.");
+  }
+
+  // A CTA tile whose M extent is 64 selects TmaWarpSpecialized; every other tile selects
+  // TmaWarpSpecializedCooperative.
+  using SelectedEpilogue =
+      cute::conditional_t<size<0>(CTAShape{}) == Int<64>{}, cutlass::epilogue::TmaWarpSpecialized,
+                          cutlass::epilogue::TmaWarpSpecializedCooperative>;
+  sm90_generic_mixed_gemm_kernelLauncher<
+      ActivationType, WeightType, ScaleZeroType, BiasType, OutputType, QuantOp, EpilogueTag,
+      CTAShape, ClusterShape, MainloopScheduleType, SelectedEpilogue>(
+      A, B, weight_scales, weight_zero_points, biases, alpha, C, m, n, k, group_size, gemm_config,
+      workspace, workspace_bytes, stream, occupancy);
+}
+
+/*
+    1x1x1 cluster shape is supported for any tile shape.
+
+    2x1x1 cluster shape is only supported when the M tile is at least 128.
+
+    1x2x1 cluster shape is only supported when the N tile is at least 128.
+
+    2x2x1 cluster shape is only supported when both the M and N tiles are at least 128.
+
+    We make the above restrictions to improve compilation speed in TRT-LLM, by pruning kernels
+    that may not be very useful in practice.
+ */
+template <typename CTAShape, typename ClusterShape>
+constexpr bool are_tile_shapes_supported() {
+  // Compile-time predicate over a (CTA tile, cluster) pair of static shapes.
+  // Mode 0 of each shape is read as the M extent and mode 1 as the N extent.
+  constexpr int tile_m = get<0>(CTAShape{});
+  constexpr int tile_n = get<1>(CTAShape{});
+  constexpr int cluster_m = get<0>(ClusterShape{});
+  constexpr int cluster_n = get<1>(ClusterShape{});
+
+  // Each cluster dimension is judged on its own:
+  //   * extent 1 is always accepted;
+  //   * extent 2 is accepted only if the tile extent along the same dimension is >= 128;
+  //   * any other extent is rejected.
+  constexpr bool m_supported = (cluster_m == 1) || (cluster_m == 2 && tile_m >= 128);
+  constexpr bool n_supported = (cluster_n == 1) || (cluster_n == 2 && tile_n >= 128);
+
+  // The pair is supported only when both dimensions pass.
+  return m_supported && n_supported;
+}
+
+template <typename ActivationType, typename WeightType, typename ScaleZeroType, typename BiasType,
+          typename OutputType, cutlass::WeightOnlyQuantOp QuantOp, typename EpilogueTag,
+          typename CTAShape, typename ClusterShape>
+void sm90_dispatch_mainloop_schedules(
+    ActivationType const* A, WeightType const* B, ScaleZeroType const* weight_scales,
+    ScaleZeroType const* weight_zero_points, BiasType const* biases, float const alpha,
+    OutputType* C, int m, int n, int k, int const group_size, tkc::CutlassGemmConfig gemm_config,
+    char* workspace, size_t workspace_bytes, cudaStream_t stream, int* occupancy = nullptr) {
+  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
+
+  // Shape pairs rejected by are_tile_shapes_supported() only get the throwing branch compiled;
+  // the dispatch chain in the else-branch is discarded for them by `if constexpr`.
+  if constexpr (!are_tile_shapes_supported<CTAShape, ClusterShape>()) {
+    throw std::runtime_error(
+        "[TensorRT-LLM Error][fpA_intB][sm90_dispatch_mainloop_schedules] Unsupported CTA and "
+        "Cluster shapes for "
+        "mixed type GEMM.");
+  } else {
+    // AUTO is the only mainloop schedule handled here; any other request is an error.
+    if (gemm_config.mainloop_schedule != tkc::MainloopScheduleType::AUTO) {
+      throw std::runtime_error(
+          "[TensorRT-LLM Error][fpA_intB][sm90_dispatch_mainloop_schedules] mainloop schedule "
+          "config is invalid "
+          "for "
+          "mixed type GEMM.");
+    }
+
+    // A CTA tile whose M extent is 64 selects the ping-pong kernel schedule; every other tile
+    // selects the cooperative one.
+    using SelectedMainloop =
+        cute::conditional_t<size<0>(CTAShape{}) == Int<64>{},
+                            cutlass::gemm::KernelTmaWarpSpecializedPingpong,
+                            cutlass::gemm::KernelTmaWarpSpecializedCooperative>;
+    sm90_dispatch_epilogue_schedules<ActivationType, WeightType, ScaleZeroType, BiasType,
+                                     OutputType, QuantOp, EpilogueTag, CTAShape, ClusterShape,
+                                     SelectedMainloop>(
+        A, B, weight_scales, weight_zero_points, biases, alpha, C, m, n, k, group_size,
+        gemm_config, workspace, workspace_bytes, stream, occupancy);
+  }
+}
+
+template <typename ActivationType, typename WeightType, typename ScaleZeroType, typename BiasType,
+          typename OutputType, cutlass::WeightOnlyQuantOp QuantOp, typename EpilogueTag,
+          typename CTAShape>
+void sm90_dispatch_gemm_config(ActivationType const* A, WeightType const* B,
+                               ScaleZeroType const* weight_scales,
+                               ScaleZeroType const* weight_zero_points, BiasType const* biases,
+                               float const alpha, OutputType* C, int m, int n, int k,
+                               int const group_size, tkc::CutlassGemmConfig gemm_config,
+                               char* workspace, size_t workspace_bytes, cudaStream_t stream,
+                               int* occupancy = nullptr) {
+  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
+
+  // Single shared call site for all cluster shapes. Every argument is forwarded unchanged to the
+  // mainloop dispatcher; the cluster shape arrives as a tag object whose type becomes the last
+  // template argument.
+  auto dispatch_with_cluster = [&](auto cluster_tag) {
+    // `auto` takes the tag by value, so decltype yields the plain shape type.
+    using StaticClusterShape = decltype(cluster_tag);
+    sm90_dispatch_mainloop_schedules<ActivationType, WeightType, ScaleZeroType, BiasType,
+                                     OutputType, QuantOp, EpilogueTag, CTAShape,
+                                     StaticClusterShape>(
+        A, B, weight_scales, weight_zero_points, biases, alpha, C, m, n, k, group_size,
+        gemm_config, workspace, workspace_bytes, stream, occupancy);
+  };
+
+  // Translate the runtime cluster enum into the matching compile-time cute shape. Only the four
+  // shapes listed here are dispatched; anything else throws.
+  switch (gemm_config.cluster_shape) {
+    case tkc::ClusterShape::ClusterShape_1x1x1:
+      dispatch_with_cluster(Shape<_1, _1, _1>{});
+      break;
+    case tkc::ClusterShape::ClusterShape_2x1x1:
+      dispatch_with_cluster(Shape<_2, _1, _1>{});
+      break;
+    case tkc::ClusterShape::ClusterShape_1x2x1:
+      dispatch_with_cluster(Shape<_1, _2, _1>{});
+      break;
+    case tkc::ClusterShape::ClusterShape_2x2x1:
+      dispatch_with_cluster(Shape<_2, _2, _1>{});
+      break;
+    default:
+      throw std::runtime_error(
+          "[TensorRT-LLM Error][fpA_intB][dispatch_CGA_config] Config is invalid for mixed type "
+          "GEMM.");
+      break;
+  }
+}
+
+template <typename ActivationType, typename WeightType, typename ScaleZeroType, typename BiasType,
+          typename OutputType, cutlass::WeightOnlyQuantOp QuantOp, typename EpilogueTag>
+void sm90_dispatch_gemm_to_cutlass(ActivationType const* A, WeightType const* B,
+                                   ScaleZeroType const* weight_scales,
+                                   ScaleZeroType const* weight_zero_points, BiasType const* biases,
+                                   float const alpha, OutputType* C, int m, int n, int k,
+                                   int const group_size, char* workspace, size_t workspace_bytes,
+                                   tkc::CutlassGemmConfig gemm_config, cudaStream_t stream,
+                                   int* occupancy = nullptr) {
+  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
+  // Note that SIMT configs are omitted here since they are not supported for fpA_intB.
+  // We also only instantiate configs here where threadblockShapeM == warpShapeM since those usually
+  // perform the best for mixed type gemms.
+  // Maps gemm_config.tile_config_sm90 to a static CTA shape and forwards to the cluster dispatcher.
+  constexpr int Ktile = 128 / sizeof(ActivationType);  // elements of ActivationType in 128 bytes
+  using _Ktile = Int<Ktile>;                           // static K extent shared by all CTA shapes
+  switch (gemm_config.tile_config_sm90) {
+    case tkc::CutlassTileConfigSM90::CtaShape64x16x128B:
+      sm90_dispatch_gemm_config<ActivationType, WeightType, ScaleZeroType, BiasType, OutputType,
+                                QuantOp, EpilogueTag, Shape<_64, _16, _Ktile>>(
+          A, B, weight_scales, weight_zero_points, biases, alpha, C, m, n, k, group_size,
+          gemm_config, workspace, workspace_bytes, stream, occupancy);
+      break;
+    case tkc::CutlassTileConfigSM90::CtaShape64x32x128B:
+      sm90_dispatch_gemm_config<ActivationType, WeightType, ScaleZeroType, BiasType, OutputType,
+                                QuantOp, EpilogueTag, Shape<_64, _32, _Ktile>>(
+          A, B, weight_scales, weight_zero_points, biases, alpha, C, m, n, k, group_size,
+          gemm_config, workspace, workspace_bytes, stream, occupancy);
+      break;
+    case tkc::CutlassTileConfigSM90::CtaShape64x64x128B:
+      sm90_dispatch_gemm_config<ActivationType, WeightType, ScaleZeroType, BiasType, OutputType,
+                                QuantOp, EpilogueTag, Shape<_64, _64, _Ktile>>(
+          A, B, weight_scales, weight_zero_points, biases, alpha, C, m, n, k, group_size,
+          gemm_config, workspace, workspace_bytes, stream, occupancy);
+      break;
+    case tkc::CutlassTileConfigSM90::CtaShape64x128x128B:
+      sm90_dispatch_gemm_config<ActivationType, WeightType, ScaleZeroType, BiasType, OutputType,
+                                QuantOp, EpilogueTag, Shape<_64, _128, _Ktile>>(
+          A, B, weight_scales, weight_zero_points, biases, alpha, C, m, n, k, group_size,
+          gemm_config, workspace, workspace_bytes, stream, occupancy);
+      break;
+    case tkc::CutlassTileConfigSM90::CtaShape64x256x128B:
+      sm90_dispatch_gemm_config<ActivationType, WeightType, ScaleZeroType, BiasType, OutputType,
+                                QuantOp, EpilogueTag, Shape<_64, _256, _Ktile>>(
+          A, B, weight_scales, weight_zero_points, biases, alpha, C, m, n, k, group_size,
+          gemm_config, workspace, workspace_bytes, stream, occupancy);
+      break;
+    case tkc::CutlassTileConfigSM90::CtaShape128x16x128B:
+      sm90_dispatch_gemm_config<ActivationType, WeightType, ScaleZeroType, BiasType, OutputType,
+                                QuantOp, EpilogueTag, Shape<_128, _16, _Ktile>>(
+          A, B, weight_scales, weight_zero_points, biases, alpha, C, m, n, k, group_size,
+          gemm_config, workspace, workspace_bytes, stream, occupancy);
+      break;
+    case tkc::CutlassTileConfigSM90::CtaShape128x32x128B:
+      sm90_dispatch_gemm_config<ActivationType, WeightType, ScaleZeroType, BiasType, OutputType,
+                                QuantOp, EpilogueTag, Shape<_128, _32, _Ktile>>(
+          A, B, weight_scales, weight_zero_points, biases, alpha, C, m, n, k, group_size,
+          gemm_config, workspace, workspace_bytes, stream, occupancy);
+      break;
+    case tkc::CutlassTileConfigSM90::CtaShape128x64x128B:
+      sm90_dispatch_gemm_config<ActivationType, WeightType, ScaleZeroType, BiasType, OutputType,
+                                QuantOp, EpilogueTag, Shape<_128, _64, _Ktile>>(
+          A, B, weight_scales, weight_zero_points, biases, alpha, C, m, n, k, group_size,
+          gemm_config, workspace, workspace_bytes, stream, occupancy);
+      break;
+    case tkc::CutlassTileConfigSM90::CtaShape128x128x128B:
+      sm90_dispatch_gemm_config<ActivationType, WeightType, ScaleZeroType, BiasType, OutputType,
+                                QuantOp, EpilogueTag, Shape<_128, _128, _Ktile>>(
+          A, B, weight_scales, weight_zero_points, biases, alpha, C, m, n, k, group_size,
+          gemm_config, workspace, workspace_bytes, stream, occupancy);
+      break;
+    case tkc::CutlassTileConfigSM90::CtaShape128x256x128B:
+      sm90_dispatch_gemm_config<ActivationType, WeightType, ScaleZeroType, BiasType, OutputType,
+                                QuantOp, EpilogueTag, Shape<_128, _256, _Ktile>>(
+          A, B, weight_scales, weight_zero_points, biases, alpha, C, m, n, k, group_size,
+          gemm_config, workspace, workspace_bytes, stream, occupancy);
+      break;
+    case tkc::CutlassTileConfigSM90::Undefined:  // no tile config was ever selected
+      throw std::runtime_error(
+          "[TensorRT-LLM Error][fpA_intB][sm90_dispatch_gemm_to_cutlass] gemm config undefined.");
+      break;
+    case tkc::CutlassTileConfigSM90::ChooseWithHeuristic:  // placeholder must be resolved upstream
+      throw std::runtime_error(
+          "[TensorRT-LLM Error][fpA_intB][sm90_dispatch_gemm_to_cutlass] gemm config should have "
+          "already been set by "
+          "heuristic.");
+      break;
+    default:  // any tile config without a case in this switch
+      throw std::runtime_error(
+          "[TensorRT-LLM Error][fpA_intB][sm90_dispatch_gemm_to_cutlass] Config is invalid for "
+          "mixed type GEMM.");
+      break;
+  }
+}
+
+}  // namespace cutlass_kernels
+}  // namespace kernels
+}  // namespace tensorrt_llm
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.h b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.h
new file mode 100644
index 000000000..6c2098e3c
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once  // the default argument below must not be declared twice in one translation unit
+#include <cuda_runtime_api.h>
+#include "cutlass_extensions/gemm_configs.h"
+#include "cutlass_extensions/weight_only_quant_op.h"
+
+namespace tensorrt_llm {
+namespace kernels {
+namespace cutlass_kernels {
+// Declaration only. NOTE(review): the definition presumably lives in the matching .inl - confirm.
+template <typename ActivationType, typename WeightType, typename ScaleZeroType, typename BiasType,
+          typename OutputType, cutlass::WeightOnlyQuantOp QuantOp, typename EpilogueTag,
+          typename CTAShape, typename ClusterShape, typename MainloopScheduleType,
+          typename EpilogueScheduleType>
+void sm90_generic_mixed_gemm_kernelLauncher(
+    ActivationType const* A, WeightType const* B, ScaleZeroType const* weight_scales,
+    ScaleZeroType const* weight_zero_points, BiasType const* biases, float const alpha,
+    OutputType* C, int m, int n, int k, int const group_size,
+    tensorrt_llm::cutlass_extensions::CutlassGemmConfig gemm_config, char* workspace,
+    size_t workspace_bytes, cudaStream_t stream, int* occupancy = nullptr);
+
+}  // namespace cutlass_kernels
+}  // namespace kernels
+}  // namespace tensorrt_llm
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl
new file mode 100644
index 000000000..cc91b4ba6
--- /dev/null
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __GNUC__  // Check if the compiler is GCC or Clang
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#endif  // __GNUC__
+
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+#include "cutlass/epilogue/collective/default_epilogue.hpp"
+#include "cutlass/epilogue/thread/linear_combination.h"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+#include "cutlass/util/packed_stride.hpp"
+#include "cutlass_extensions/compute_occupancy.h"
+#include "cutlass_extensions/epilogue_helpers.h"
+#include "cutlass_extensions/gemm/collective/collective_builder_interleaved.hpp"
+#include "cutlass_extensions/gemm_configs.h"
+
+#ifdef __GNUC__  // Check if the compiler is GCC or Clang
+#pragma GCC diagnostic pop
+#endif  // __GNUC__
+
+#include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/cudaUtils.h"
+#include "tensorrt_llm/common/logger.h"
+#include "tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h"
+#include "tensorrt_llm/kernels/cutlass_kernels/cutlass_type_conversion.h"
+#include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.h"
+
+namespace tensorrt_llm {
+namespace kernels {
+namespace cutlass_kernels {
+namespace tk = tensorrt_llm::common;
+namespace tkc = tensorrt_llm::cutlass_extensions;
+
+using namespace cute;
+
+// Builds and launches the SM90 mixed-input (fpA x intB) CUTLASS GEMM with a fused bias epilogue.
+// If `occupancy` is non-null, only the kernel occupancy is written to it and nothing is launched.
+// Throws std::runtime_error on invalid quantization arguments or any CUTLASS failure.
+template <typename ActivationType, typename WeightType, typename ScaleZeroType, typename BiasType,
+          typename OutputType, cutlass::WeightOnlyQuantOp QuantOp, typename EpilogueTag,
+          typename CTAShape, typename ClusterShape, typename MainloopScheduleType,
+          typename EpilogueScheduleType>
+void sm90_generic_mixed_gemm_kernelLauncher(
+    ActivationType const* A, WeightType const* B, ScaleZeroType const* weight_scales,
+    ScaleZeroType const* weight_zero_points, BiasType const* biases, float const alpha,
+    OutputType* C, int m, int n, int k, int const group_size, tkc::CutlassGemmConfig gemm_config,
+    char* workspace, size_t workspace_bytes, cudaStream_t stream, int* occupancy) {
+  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
+
+#ifdef COMPILE_HOPPER_TMA_GEMMS
+  using CutlassActivationType = typename TllmToCutlassTypeAdapter<ActivationType>::type;
+
+  if constexpr (!should_filter_tma_warp_specialized_gemm_problem_shape_v<
+                    cutlass::arch::Sm90, CTAShape, ClusterShape, ActivationType>) {
+    using CutlassWeightType = typename TllmToCutlassTypeAdapter<WeightType>::type;
+    using CutlassScaleZeroType = typename TllmToCutlassTypeAdapter<ScaleZeroType>::type;
+    using CutlassBiasType = typename TllmToCutlassTypeAdapter<BiasType>::type;
+    using CutlassOutputType = typename TllmToCutlassTypeAdapter<OutputType>::type;
+
+    static_assert(std::is_same_v<CutlassActivationType, cutlass::half_t> ||
+                      std::is_same_v<CutlassActivationType, cutlass::bfloat16_t> ||
+                      std::is_same_v<CutlassActivationType, cutlass::float_e4m3_t> ||
+                      std::is_same_v<CutlassActivationType, cutlass::float_e5m2_t>,
+                  "Activation type must be bfloat16, half, FP8");
+
+    static_assert(std::is_same_v<CutlassWeightType, uint8_t> ||
+                      std::is_same_v<CutlassWeightType, cutlass::uint4b_t> ||
+                      std::is_same_v<CutlassWeightType, cutlass::float_e4m3_t> ||
+                      std::is_same_v<CutlassWeightType, cutlass::float_e5m2_t>,
+                  "Weight type must be fp8, uint8_t or uint4_t");
+
+    static_assert(!std::is_same_v<CutlassActivationType, cutlass::float_e4m3_t> ||
+                      std::is_same_v<CutlassScaleZeroType, cutlass::half_t>,
+                  "Scale/Zero type must be half for fp8 activation");
+
+    using LayoutA = cutlass::layout::RowMajor;  // Layout type for A matrix operand
+    constexpr int AlignmentA = 128 / cutlass::sizeof_bits<CutlassActivationType>::value;
+
+    using LayoutB = cutlass::layout::ColumnMajor;  // Layout type for B matrix operand
+    constexpr int AlignmentB = 128 / cutlass::sizeof_bits<CutlassWeightType>::value;
+
+    // The A and B operands are swapped and transposed below, so keep the transposed input layouts
+    using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose<LayoutA>::type;
+    using LayoutB_Transpose = typename cutlass::layout::LayoutTranspose<LayoutB>::type;
+
+    using ElementZero = CutlassScaleZeroType;
+    using ElementScale = CutlassScaleZeroType;
+
+    // C/D matrix configuration. We reuse the C operand for the bias and set the stride for
+    // broadcast.
+    using LayoutBias = cutlass::layout::RowMajor;
+    constexpr int AlignmentBias = 128 / cutlass::sizeof_bits<CutlassBiasType>::value;
+
+    // D matrix configuration
+    using LayoutOutput = cutlass::layout::RowMajor;
+    constexpr int AlignmentOutput = 128 / cutlass::sizeof_bits<CutlassOutputType>::value;
+
+    // Core kernel configurations
+    using ElementAccumulator = float;  // Element type for internal accumulation
+    using ElementCompute = float;      // Element type for epilogue computation
+    using ArchTag = cutlass::arch::Sm90;  // Minimum SM that supports the intended feature
+    using OperatorClass = cutlass::arch::OpClassTensorOp;  // Operator class tag
+    using TileShape = CTAShape;                            // Threadblock-level tile size
+    using KernelSchedule = MainloopScheduleType;
+    using EpilogueSchedule = EpilogueScheduleType;
+
+    // Shrink the N dimension to match CTA_N if needed
+    constexpr int epi_tile_M = cute::min(shape<0>(TileShape{}), 128);  // 64 or 128
+    constexpr int epi_tile_N =
+        cute::min(shape<1>(TileShape{}), 32);  // Allow this to be 16 for some small N tiles.
+    using EpilogueTileType = cute::Shape<cute::Int<epi_tile_M>, cute::Int<epi_tile_N>>;
+
+    static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest;
+    static_assert(std::is_same_v<EpilogueTag, tensorrt_llm::cutlass_extensions::EpilogueOpBias>,
+                  "Only the EpilogueOpBias epilogue is supported by this launcher");
+    using EVT_bias_addition = cutlass::epilogue::fusion::Sm90EVT<
+        cutlass::epilogue::fusion::Sm90Compute<cutlass::homogeneous_multiply_add, CutlassOutputType,
+                                               ElementCompute,
+                                               RoundStyle>,                  // alpha * acc + bias
+        cutlass::epilogue::fusion::Sm90ScalarBroadcast<ElementAccumulator>,  // alpha
+        cutlass::epilogue::fusion::Sm90AccFetch,                             // acc
+        cutlass::epilogue::fusion::Sm90ColBroadcast<0, TileShape, CutlassBiasType, CutlassBiasType,
+                                                    Stride<_1, _0, _0>,
+                                                    AlignmentBias>  // bias
+        >;
+
+    using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
+        ArchTag, OperatorClass, TileShape, ClusterShape, EpilogueTileType, ElementAccumulator,
+        ElementAccumulator,
+        // Transpose layout of D here since we use the explicit swap + transpose trick
+        // Void C since we don't use it. Prevents smem allocation.
+        void, typename cutlass::layout::LayoutTranspose<LayoutBias>::type, AlignmentBias,
+        CutlassOutputType, typename cutlass::layout::LayoutTranspose<LayoutOutput>::type,
+        AlignmentOutput, EpilogueSchedule, EVT_bias_addition>::CollectiveOp;
+
+    using PackedScaleZero = cute::tuple<CutlassWeightType, ElementScale, ElementZero>;
+    using PackedScale = cute::tuple<CutlassWeightType, ElementScale>;
+    using ElementBCollectiveInfo =
+        std::conditional_t<cutlass::hasZero(QuantOp), PackedScaleZero, PackedScale>;
+
+    // We swap A and B operands to the builder here
+    using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilderInterleaved<
+        ArchTag, OperatorClass, ElementBCollectiveInfo, LayoutB_Transpose, AlignmentB,
+        CutlassActivationType, LayoutA_Transpose, AlignmentA, ElementAccumulator, TileShape,
+        ClusterShape,
+        cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
+            sizeof(typename CollectiveEpilogue::SharedStorage))>,
+        KernelSchedule>::CollectiveOp;
+
+    using TileScheduler =
+        cute::conditional_t<size<0>(CTAShape{}) == Int<64>{}, cutlass::gemm::PersistentScheduler,
+                            cutlass::gemm::StreamKScheduler>;
+
+    using GemmKernel =
+        cutlass::gemm::kernel::GemmUniversal<Shape<int, int, int, int>,  // Indicates ProblemShape
+                                             CollectiveMainloop, CollectiveEpilogue, TileScheduler>;
+
+    if (occupancy != nullptr) {
+      *occupancy =
+          tensorrt_llm::cutlass_extensions::compute_occupancy_for_kernel<GemmKernel, true>();
+      return;
+    }
+
+    using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+
+    using StrideA = typename GemmKernel::StrideA;
+    using StrideB = typename GemmKernel::StrideB;
+    using StrideC = typename GemmKernel::StrideC;
+    using StrideD = typename GemmKernel::StrideD;
+    using StrideS = typename CollectiveMainloop::StrideScale;
+
+    if (weight_scales == nullptr) {
+      throw std::runtime_error("Weight scales must always be set to a non-null value.");
+    }
+
+    if constexpr (cutlass::isFinegrained(QuantOp)) {
+      int cta_shape_k = cute::size<2>(TileShape{});
+      if (group_size <= 0 || group_size % cta_shape_k != 0) {
+        std::string err_msg = "The group size must be a multiple of " + std::to_string(cta_shape_k);
+        throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] " + err_msg);
+      }
+
+      if constexpr (QuantOp == cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY) {
+        if (weight_zero_points != nullptr) {
+          throw std::runtime_error(
+              "Weight zero pointer must be a nullptr for scale only fine grained");
+        }
+      } else if constexpr (QuantOp == cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS) {
+        if (weight_zero_points == nullptr) {
+          throw std::runtime_error(
+              "Weight zero pointer must be valid for scale and bias fine grained");
+        }
+      }
+    } else {
+      if (group_size <= 0 || group_size != k) {
+        throw std::runtime_error("Invalid group size for per column scaling kernels.");
+      }
+
+      if (weight_zero_points != nullptr) {
+        throw std::runtime_error("Weight zero-points must be null when running per column scaling");
+      }
+    }
+
+    auto cutlass_scale_k = (k + group_size - 1) / group_size;
+    StrideA stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1));
+    StrideB stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
+    StrideD stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(n, m, 1));
+    StrideS stride_S =
+        cutlass::make_cute_packed_stride(StrideS{}, cute::make_shape(n, cutlass_scale_k, 1));
+
+    // Use the output as the bias to avoid making a tma descriptor with a nullptr.
+    auto output_as_bias_type = reinterpret_cast<CutlassBiasType const*>(C);
+
+    typename Gemm::Arguments args{
+        cutlass::gemm::GemmUniversalMode::kGemm,
+        {n, m, k, 1},
+        {reinterpret_cast<CutlassWeightType const*>(B), stride_B,
+         reinterpret_cast<CutlassActivationType const*>(A), stride_A,
+         reinterpret_cast<ElementScale const*>(weight_scales), stride_S, group_size,
+         reinterpret_cast<ElementZero const*>(weight_zero_points)},
+        {{}, output_as_bias_type, stride_D, reinterpret_cast<CutlassOutputType*>(C), stride_D}};
+
+    args.epilogue.thread = {
+        {alpha},                                                                   // alpha args
+        {},                                                                        // accumulator
+        {reinterpret_cast<CutlassBiasType const*>(biases), CutlassBiasType(0.f)},  // bias args
+        {}  // end multiply_add
+    };
+
+    Gemm gemm;
+    if (gemm.get_workspace_size(args) > workspace_bytes) {
+      // Do not initialize or run the kernel with less workspace than it asked for.
+      TLLM_LOG_ERROR("[TensorRT-LLm Error][fpA_intB Runner] given workspace size insufficient.");
+      throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] Insufficient workspace.");
+    }
+
+    auto can_implement = gemm.can_implement(args);
+    if (can_implement != cutlass::Status::kSuccess) {
+      std::string err_msg = "fpA_intB cutlass kernel will fail for params. Error: " +
+                            std::string(cutlassGetStatusString(can_implement));
+      throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] " + err_msg);
+    }
+
+    auto init_status = gemm.initialize(args, workspace, stream);
+    if (init_status != cutlass::Status::kSuccess) {
+      std::string err_msg = "Failed to initialize cutlass fpA_intB gemm. Error: " +
+                            std::string(cutlassGetStatusString(init_status));
+      throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] " + err_msg);
+    }
+
+    auto run_status = gemm.run(stream);
+    if (run_status != cutlass::Status::kSuccess) {
+      std::string err_msg = "Failed to run cutlass fpA_intB gemm. Error: " +
+                            std::string(cutlassGetStatusString(run_status));
+      throw std::runtime_error("[TensorRT-LLm Error][fpA_intB Runner] " + err_msg);
+    }
+  } else {
+    std::stringstream ss;
+    ss << "[TensorRT-LLm Error][fpA_intB Runner] Config (" << (int64_t)cute::size<0>(CTAShape{})
+       << "," << (int64_t)cute::size<1>(CTAShape{}) << "," << (int64_t)cute::size<2>(CTAShape{})
+       << ") (" << (int64_t)cute::size<0>(ClusterShape{}) << ","
+       << (int64_t)cute::size<1>(ClusterShape{}) << "," << (int64_t)cute::size<2>(ClusterShape{})
+       << ") not compiled with FAST_BUILD.";
+    throw std::runtime_error(ss.str());
+  }
+
+#else   // COMPILE_HOPPER_TMA_GEMMS
+  throw std::runtime_error(
+      "[TensorRT-LLm Error][fpA_intB Runner] Please recompile with support for hopper by passing "
+      "90-real as an arch to build_wheel.py.");
+#endif  // COMPILE_HOPPER_TMA_GEMMS
+}
+
+}  // namespace cutlass_kernels
+}  // namespace kernels
+}  // namespace tensorrt_llm
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl
index a7b5dac5a..b97e65a08 100644
--- a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl
@@ -37,8 +37,8 @@
 #include "cutlass/util/tensor_view_io.h"
 #include "cutlass_extensions/compute_occupancy.h"
 #include "cutlass_extensions/epilogue_helpers.h"
+#include "cutlass_extensions/gemm/collective/collective_builder_mixed_input.hpp"
 #include "cutlass_extensions/gemm_configs.h"
-#include "internal_cutlass_extensions/gemm/collective/collective_builder_mixed_input.hpp"
 
 #ifdef __GNUC__  // Check if the compiler is GCC or Clang
 #pragma GCC diagnostic pop
@@ -183,7 +183,6 @@ void sm90_generic_mixed_moe_gemm_kernelLauncher(
   hw_info.device_id = 0;
   hw_info.sm_count = sm_count_;
 
-  assert(group_size == int(inputs.groupwise_quant_group_size));
   if (workspace_size != nullptr) {
     const Args args{
         cutlass::gemm::GemmUniversalMode::kGrouped,
@@ -200,6 +199,7 @@ void sm90_generic_mixed_moe_gemm_kernelLauncher(
     return;
   }
 
+  assert(group_size == int(inputs.groupwise_quant_group_size));
   arguments = Args{
       cutlass::gemm::GemmUniversalMode::kGrouped,
       {inputs.num_experts, hopper_inputs.int4_groupwise_params.shape.problem_shapes, nullptr},
diff --git a/csrc/nv_internal/tensorrt_llm/thop/fp4Op.cpp b/csrc/nv_internal/tensorrt_llm/thop/fp4Op.cpp
index fcc7a3487..a7d8dfbd4 100644
--- a/csrc/nv_internal/tensorrt_llm/thop/fp4Op.cpp
+++ b/csrc/nv_internal/tensorrt_llm/thop/fp4Op.cpp
@@ -349,10 +349,62 @@ at::Tensor E2M1AndUFP8SFScaleToFloatV2(at::Tensor valueE2M1, at::Tensor scaleFP8
   return floatTensor;
 }
 
+// Dequantizes a packed MXFP4 (E2M1) CPU tensor into float32 using per-group E8M0 scales.
+//   weight: (n, k / 2) uint8, two FP4 codes per byte (low nibble holds the even column).
+//   scale:  (n, k / group_size) uint8, reinterpreted as __nv_fp8_e8m0.
+// Returns a (n, k) float32 CPU tensor.
+at::Tensor mxfp4_dequantize_host(at::Tensor weight, at::Tensor scale, int64_t group_size) {
+  CHECK_CPU_INPUT(weight, FLOAT4_E2M1X2);
+  CHECK_CPU_INPUT(scale, SF_DTYPE);
+  TORCH_CHECK(weight.is_contiguous(), "weight must be contiguous");
+  TORCH_CHECK(scale.is_contiguous(), "scale must be contiguous");
+  TORCH_CHECK(weight.numel() != 0, "weight should not be empty tensor");
+  TORCH_CHECK(weight.dtype() == at::ScalarType::Byte, "Weight must be a packed uint8 tensor");
+  TORCH_CHECK(scale.dtype() == at::ScalarType::Byte, "Scale must be a uint8 tensor");
+  TORCH_CHECK(weight.dim() == 2 && scale.dim() == 2, "weight and scale must be 2D tensors");
+  TORCH_CHECK(group_size > 0, "group_size must be positive");
+  TORCH_CHECK(weight.size(0) == scale.size(0),
+              "weight and scale must have the same number of rows");
+  TORCH_CHECK(weight.size(1) * 2 == scale.size(1) * group_size,
+              "weight and scale must have the same number of columns");
+
+  uint8_t* weight_packed_ptr = weight.data_ptr<uint8_t>();
+  __nv_fp8_e8m0* scale_ptr = reinterpret_cast<__nv_fp8_e8m0*>(scale.data_ptr<uint8_t>());
+  // 64-bit extents so that large tensors cannot overflow the index arithmetic below.
+  int64_t const n = weight.size(0);
+  int64_t const packed_k = weight.size(1);
+  int64_t const k = packed_k * 2;
+  int64_t const scale_k = scale.size(1);
+
+  at::Tensor dequant_weight =
+      at::empty({n, k}, at::dtype(at::ScalarType::Float).device(at::kCPU).requires_grad(false));
+  float* dequant_weight_ptr = dequant_weight.data_ptr<float>();
+
+  // E2M1 code -> value; codes 8..15 are the negated values of codes 0..7.
+  static constexpr float kE2M1ToFloat[16] = {0.0f, 0.5f,  1.0f,  1.5f,  2.0f,  3.0f,  4.0f,  6.0f,
+                                             0.0f, -0.5f, -1.0f, -1.5f, -2.0f, -3.0f, -4.0f, -6.0f};
+
+  for (int64_t row = 0; row < n; ++row) {
+    uint8_t const* weight_row = weight_packed_ptr + row * packed_k;
+    __nv_fp8_e8m0* scale_row = scale_ptr + row * scale_k;
+    float* out_row = dequant_weight_ptr + row * k;
+    for (int64_t j = 0; j < packed_k; ++j) {
+      uint8_t const packed = weight_row[j];
+      int64_t const col = 2 * j;
+      // Look the scale up per element so an odd group_size still picks the right group.
+      out_row[col] = kE2M1ToFloat[packed & 0xF] * static_cast<float>(scale_row[col / group_size]);
+      out_row[col + 1] =
+          kE2M1ToFloat[packed >> 4] * static_cast<float>(scale_row[(col + 1) / group_size]);
+    }
+  }
+  return dequant_weight;
+}
+
 }  // namespace torch_ext
 
 TORCH_LIBRARY_FRAGMENT(TORCH_EXTENSION_NAME, m) {
   m.def("block_scale_interleave", &torch_ext::BlockScaleInterleave);
   m.def("block_scale_interleave_reverse", &torch_ext::BlockScaleInterleaveReverse);
   m.def("e2m1_and_ufp8sf_scale_to_float", &torch_ext::E2M1AndUFP8SFScaleToFloatV2);
+  m.def("mxfp4_dequantize_host", &torch_ext::mxfp4_dequantize_host);
 }
diff --git a/flashinfer/__init__.py b/flashinfer/__init__.py
index 5249a0206..85f6dd463 100644
--- a/flashinfer/__init__.py
+++ b/flashinfer/__init__.py
@@ -53,6 +53,7 @@
     block_scale_interleave,
     e2m1_and_ufp8sf_scale_to_float,
     fp4_quantize,
+    mxfp4_dequantize_host,
     mxfp4_dequantize,
     mxfp4_quantize,
     nvfp4_quantize,
diff --git a/flashinfer/fp4_quantization.py b/flashinfer/fp4_quantization.py
index 3cb590338..d68d4a308 100644
--- a/flashinfer/fp4_quantization.py
+++ b/flashinfer/fp4_quantization.py
@@ -29,6 +29,7 @@
     get_shuffle_matrix_sf_a_row_indices,
     register_custom_op,
     register_fake_op,
+    get_device_arch,
 )
 
 
@@ -61,18 +62,21 @@ def _pad_scale_factors(
         ).contiguous()
 
 
-@functools.cache
-def get_device_arch():
-    major, minor = torch.cuda.get_device_capability()
-    suffix = "a" if major >= 9 else ""
-    return f"{major * 10 + minor}{suffix}"
-
-
 def gen_fp4_quantization_module() -> JitSpec:
+    nvcc_flags = [
+        "-DENABLE_BF16",
+        "-DENABLE_FP8",
+        "-DENABLE_FP4",
+    ]
+
     if get_device_arch() == "100a":
-        nvcc_flags = sm100a_nvcc_flags
+        nvcc_flags += sm100a_nvcc_flags
+    elif get_device_arch() == "90a":
+        nvcc_flags += sm90a_nvcc_flags
     else:
-        nvcc_flags = sm90a_nvcc_flags
+        raise NotImplementedError(
+            f"Unsupported device architecture: {torch.cuda.get_device_capability()}"
+        )
 
     return gen_jit_spec(
         "fp4_quantization",
@@ -86,14 +90,11 @@ def gen_fp4_quantization_module() -> JitSpec:
             jit_env.FLASHINFER_CSRC_DIR / "nv_internal/cpp/common/stringUtils.cpp",
             jit_env.FLASHINFER_CSRC_DIR / "nv_internal/cpp/common/tllmException.cpp",
         ],
-        extra_cuda_cflags=nvcc_flags
-        + [
-            "-DENABLE_BF16",
-            "-DENABLE_FP8",
-        ],
+        extra_cuda_cflags=nvcc_flags,
         extra_cflags=[
             "-DENABLE_BF16",
             "-DENABLE_FP8",
+            "-DENABLE_FP4",
         ],
         extra_include_paths=[
             jit_env.FLASHINFER_CSRC_DIR / "nv_internal",
@@ -156,6 +157,31 @@ def _fake_fp4_quantize(
             input.new_empty([m * k // sf_vec_size], dtype=torch.int32),  # Scale factors
         )
 
+    @register_custom_op("flashinfer::mxfp4_dequantize_host", mutates_args=())
+    def mxfp4_dequantize_host(
+        weight: torch.Tensor,
+        scale: torch.Tensor,
+        group_size: int = 32,
+    ) -> torch.Tensor:
+        """Dequantize packed MXFP4 weights with the module's host-side op.
+
+        Args:
+            weight: Packed FP4 tensor; the fake op models it as (n, k / 2).
+            scale: Per-group scale factors for ``weight``.
+            group_size: Number of weight elements that share one scale.
+        """
+        return module.mxfp4_dequantize_host(weight, scale, group_size)
+
+    @register_fake_op("flashinfer::mxfp4_dequantize_host")
+    def _fake_mxfp4_dequantize_host(
+        weight: torch.Tensor,
+        scale: torch.Tensor,
+        group_size: int = 32,
+    ) -> torch.Tensor:
+        # Shape-only stand-in: each packed column expands into two float32 values.
+        rows, unpacked_cols = weight.shape[0], weight.shape[1] * 2
+        return weight.new_empty((rows, unpacked_cols), dtype=torch.float32)
+
     @register_custom_op(
         "flashinfer::block_scale_interleave",
         mutates_args=("",),
@@ -238,6 +264,7 @@ def _fake_e2m1_and_ufp8sf_scale_to_float(
         fp4_quantize=fp4_quantize,
         block_scale_interleave=block_scale_interleave,
         e2m1_and_ufp8sf_scale_to_float=e2m1_and_ufp8sf_scale_to_float,
+        mxfp4_dequantize_host=mxfp4_dequantize_host,
     )
 
 
@@ -273,6 +300,13 @@ def fp4_quantize(
             - FP8 input when FP8 is not enabled
             - sf_vec_size other than 16 or 32
     """
+
+    # check to make sure device is supported
+    if get_device_arch() != "100a":
+        raise NotImplementedError(
+            f"Unsupported device architecture: {get_device_arch()}"
+        )
+
     if sf_vec_size != 16 and sf_vec_size != 32:
         raise NotImplementedError("sf_vec_size can only be 16 or 32")
 
@@ -313,6 +347,11 @@ def block_scale_interleave(unswizzled_sf: torch.Tensor) -> torch.Tensor:
     Raises:
         AssertionError: If input dtype is not uint8.
     """
+    if get_device_arch() != "100a":
+        raise NotImplementedError(
+            f"Unsupported device architecture: {get_device_arch()}"
+        )
+
     # TODO(shuw): check input dtype is uint8
     assert unswizzled_sf.dtype == torch.uint8, (
         f"Input dtype must be uint8, got {unswizzled_sf.dtype}"
@@ -347,6 +386,10 @@ def e2m1_and_ufp8sf_scale_to_float(
         torch.Tensor: Dequantized float tensor of shape [M, K] with dtype float32.
 
     """
+    if get_device_arch() != "100a":
+        raise NotImplementedError(
+            f"Unsupported device architecture: {get_device_arch()}"
+        )
 
     return get_fp4_quantization_module().e2m1_and_ufp8sf_scale_to_float(
         e2m1_tensor,
@@ -419,6 +462,12 @@ def nvfp4_quantize(
             - Quantized tensor of shape [M, K/2] with dtype FLOAT4_E2M1X2
             - Scale factors tensor with shape determined by layout and sf_vec_size
     """
+
+    if get_device_arch() != "100a":
+        raise NotImplementedError(
+            f"Unsupported device architecture: {get_device_arch()}"
+        )
+
     if do_shuffle:
         # Weights 128x4 + shuffle. It is done during the model load and we do not care much about the perf
         assert sfLayout == SfLayout.layout_128x4
@@ -466,3 +515,15 @@ def mxfp4_dequantize(a_fp4, a_sf):
         0,
         True,
     )
+
+
+def mxfp4_dequantize_host(
+    weight: torch.Tensor,
+    scale: torch.Tensor,
+    group_size: int = 32,
+) -> torch.Tensor:
+    """Forward the arguments to the fp4 quantization module's ``mxfp4_dequantize_host``.
+
+    Returns whatever tensor that op produces.
+    """
+    return get_fp4_quantization_module().mxfp4_dequantize_host(weight, scale, group_size)
diff --git a/flashinfer/fp8_quantization.py b/flashinfer/fp8_quantization.py
index bea4248b4..736699b46 100644
--- a/flashinfer/fp8_quantization.py
+++ b/flashinfer/fp8_quantization.py
@@ -7,7 +7,7 @@
 from .jit import JitSpec
 from .jit import env as jit_env
 from .jit import gen_jit_spec, sm100a_nvcc_flags
-from .utils import register_custom_op, register_fake_op
+from .utils import register_custom_op, register_fake_op, get_device_arch
 
 
 def gen_mxfp8_quantization_sm100_module() -> JitSpec:
@@ -26,10 +26,12 @@ def gen_mxfp8_quantization_sm100_module() -> JitSpec:
         + [
             "-DENABLE_BF16",
             "-DENABLE_FP8",
+            "-DENABLE_FP4",
         ],
         extra_cflags=[
             "-DENABLE_BF16",
             "-DENABLE_FP8",
+            "-DENABLE_FP4",
         ],
         extra_include_paths=[
             jit_env.FLASHINFER_CSRC_DIR / "nv_internal",
@@ -145,6 +147,11 @@ def mxfp8_quantize(
             - Quantized tensor of shape [M, K] with dtype FLOAT8_E4M3
             - Scale factors tensor with shape determined by layout and sf_vec_size
     """
+    if get_device_arch() != "100a":
+        raise NotImplementedError(
+            f"Unsupported device architecture: {get_device_arch()}"
+        )
+
     sf_vec_size = 32
 
     assert input.shape[-1] % sf_vec_size == 0
diff --git a/flashinfer/fused_moe/core.py b/flashinfer/fused_moe/core.py
index be38ae807..48d14929f 100644
--- a/flashinfer/fused_moe/core.py
+++ b/flashinfer/fused_moe/core.py
@@ -170,70 +170,31 @@ def convert_to_block_layout(input_tensor: torch.Tensor, blockK: int) -> torch.Te
     return input_tensor.view(M, K // blockK, blockK).permute(1, 0, 2).contiguous()
 
 
-def get_device_arch():
-    major, minor = torch.cuda.get_device_capability()
-    suffix = "a" if major >= 9 else ""
-    return f"{major * 10 + minor}{suffix}"
-
-
 def gen_cutlass_fused_moe_module(use_fast_build: bool = False) -> JitSpec:
     output_dir = (
         jit_env.FLASHINFER_CSRC_DIR / "nv_internal/tensorrt_llm/cutlass_instantiations/"
     )
 
-    print(f"get_device_arch(): {get_device_arch()}")
-    if get_device_arch() == "100a":
-        nvcc_flags = sm100a_nvcc_flags
-    else:
-        nvcc_flags = sm90a_nvcc_flags
-
-    required_kernels_sm100 = [
-        # M128 kernels
-        "cutlass_kernel_file_gemm_grouped_sm100_M128_BS_group0.generated.cu",
-        "cutlass_kernel_file_gemm_grouped_sm100_M128_BS_group1.generated.cu",
-        "cutlass_kernel_file_gemm_grouped_sm100_M128_BS_group2.generated.cu",
-        "cutlass_kernel_file_gemm_grouped_sm100_M128_group0.generated.cu",
-        "cutlass_kernel_file_gemm_grouped_sm100_M128_group1.generated.cu",
-        "cutlass_kernel_file_gemm_grouped_sm100_M128_group2.generated.cu",
-        "cutlass_kernel_file_gemm_grouped_sm100_M128_group3.generated.cu",
-        "cutlass_kernel_file_gemm_grouped_sm100_M128_group4.generated.cu",
-        "cutlass_kernel_file_gemm_grouped_sm100_M128_group5.generated.cu",
-        "cutlass_kernel_file_gemm_grouped_sm100_M128_group6.generated.cu",
-        "cutlass_kernel_file_gemm_grouped_sm100_M128_group7.generated.cu",
-        "cutlass_kernel_file_gemm_grouped_sm100_M128_group8.generated.cu",
-        # M256 kernels
-        "cutlass_kernel_file_gemm_grouped_sm100_M256_BS_group0.generated.cu",
-        "cutlass_kernel_file_gemm_grouped_sm100_M256_BS_group1.generated.cu",
-        "cutlass_kernel_file_gemm_grouped_sm100_M256_BS_group2.generated.cu",
-        "cutlass_kernel_file_gemm_grouped_sm100_M256_group0.generated.cu",
-        "cutlass_kernel_file_gemm_grouped_sm100_M256_group1.generated.cu",
-        "cutlass_kernel_file_gemm_grouped_sm100_M256_group2.generated.cu",
-        "cutlass_kernel_file_gemm_grouped_sm100_M256_group3.generated.cu",
-        # M64 kernels
-        "cutlass_kernel_file_gemm_grouped_sm100_M64_group0.generated.cu",
-        "cutlass_kernel_file_gemm_grouped_sm100_M64_group1.generated.cu",
-        "cutlass_kernel_file_gemm_grouped_sm100_M64_group2.generated.cu",
-        "cutlass_kernel_file_gemm_grouped_sm100_M64_group3.generated.cu",
-        "cutlass_kernel_file_gemm_grouped_sm100_M64_group4.generated.cu",
-        "cutlass_kernel_file_gemm_grouped_sm100_M64_group5.generated.cu",
-    ]
-    required_kernels_sm80 = [
-        # M128 kernels
-        "cutlass_kernel_file_gemm_grouped_sm80_M128_group0.generated.cu",
-        "cutlass_kernel_file_gemm_grouped_sm80_M128_group1.generated.cu",
-        # M16 kernels
-        "cutlass_kernel_file_gemm_grouped_sm80_M16_group0.generated.cu",
-        "cutlass_kernel_file_gemm_grouped_sm80_M16_group1.generated.cu",
-        "cutlass_kernel_file_gemm_grouped_sm80_M16_group2.generated.cu",
-        # M32 kernels
-        "cutlass_kernel_file_gemm_grouped_sm80_M32_group0.generated.cu",
-        "cutlass_kernel_file_gemm_grouped_sm80_M32_group1.generated.cu",
-        # M64 kernels
-        "cutlass_kernel_file_gemm_grouped_sm80_M64_group0.generated.cu",
-        "cutlass_kernel_file_gemm_grouped_sm80_M64_group1.generated.cu",
+    # Device Arch
+    major, minor = torch.cuda.get_device_capability()
+    device_arch = f"{major * 10 + minor}"
+
+    nvcc_flags = [
+        "-DENABLE_BF16",
+        "-DENABLE_FP8",
+        "-DENABLE_FP4",
+        "-DUSING_OSS_CUTLASS_MOE_GEMM",
     ]
-    group_gemm_sm100_dir = output_dir / "gemm_grouped/100"
-    group_gemm_sm80_dir = output_dir / "gemm_grouped/80"
+    if device_arch == "100":
+        nvcc_flags += sm100a_nvcc_flags + [
+            "-DCOMPILE_BLACKWELL_TMA_GEMMS",
+            "-DCOMPILE_BLACKWELL_TMA_GROUPED_GEMMS",
+        ]
+    else:
+        nvcc_flags += sm90a_nvcc_flags + [
+            "-DCOMPILE_HOPPER_TMA_GEMMS",
+            "-DCOMPILE_HOPPER_TMA_GROUPED_GEMMS",
+        ]
 
     try:
         # Create output directory if it doesn't exist
@@ -241,7 +202,7 @@ def gen_cutlass_fused_moe_module(use_fast_build: bool = False) -> JitSpec:
 
         generate_gemm_operations(
             output_dir,
-            "100;100-real",
+            f"{device_arch};{device_arch}-real",
         )
 
     except Exception as e:
@@ -287,8 +248,7 @@ def gen_cutlass_fused_moe_module(use_fast_build: bool = False) -> JitSpec:
             jit_env.FLASHINFER_CSRC_DIR
             / "fused_moe/cutlass_backend/cutlass_fused_moe_instantiation.cu",
             # Add all generated kernels
-            *(group_gemm_sm100_dir / kernel for kernel in required_kernels_sm100),
-            *(group_gemm_sm80_dir / kernel for kernel in required_kernels_sm80),
+            *(output_dir / kernel for kernel in output_dir.rglob("*.generated.cu")),
             jit_env.FLASHINFER_CSRC_DIR / "nv_internal/cpp/common/envUtils.cpp",
             jit_env.FLASHINFER_CSRC_DIR / "nv_internal/cpp/common/logger.cpp",
             jit_env.FLASHINFER_CSRC_DIR / "nv_internal/cpp/common/stringUtils.cpp",
@@ -301,16 +261,7 @@ def gen_cutlass_fused_moe_module(use_fast_build: bool = False) -> JitSpec:
             jit_env.FLASHINFER_CSRC_DIR
             / "nv_internal/tensorrt_llm/kernels/lora/lora.cpp",
         ],
-        extra_cuda_cflags=nvcc_flags
-        + [
-            "-DENABLE_BF16",
-            "-DENABLE_FP8",
-            "-DENABLE_FP4",
-            "-DCOMPILE_BLACKWELL_TMA_GEMMS",
-            "-DCOMPILE_BLACKWELL_TMA_GROUPED_GEMMS",
-            "-DCOMPILE_HOPPER_TMA_GEMMS",
-            "-DUSING_OSS_CUTLASS_MOE_GEMM",
-        ],
+        extra_cuda_cflags=nvcc_flags,
         extra_cflags=["-DFAST_BUILD"] if use_fast_build else [],
         extra_ldflags=["-lcuda"],
         extra_include_paths=[
@@ -372,7 +323,7 @@ def __init__(
             cluster_rank: int,
             enable_alltoall: bool,
             use_deepseek_fp8_block_scale: bool,
-            use_w4a8_group_scaling: bool,
+            use_w4_group_scaling: bool,
             use_mxfp8_act_scaling: bool,
             min_latency_mode: bool,
         ):
@@ -388,7 +339,7 @@ def __init__(
             self.cluster_rank = cluster_rank
             self.enable_alltoall = enable_alltoall
             self.use_deepseek_fp8_block_scale = use_deepseek_fp8_block_scale
-            self.use_w4a8_group_scaling = use_w4a8_group_scaling
+            self.use_w4_group_scaling = use_w4_group_scaling
             self.use_mxfp8_act_scaling = use_mxfp8_act_scaling
             self.min_latency_mode = min_latency_mode
             instance_key = (
@@ -396,18 +347,18 @@ def __init__(
                 weight_dtype,
                 output_dtype,
                 use_deepseek_fp8_block_scale,
-                use_w4a8_group_scaling,
+                use_w4_group_scaling,
                 use_mxfp8_act_scaling,
             )
 
             if instance_key not in MoERunner.runner_dict:
                 MoERunner.runner_dict[instance_key] = (
-                    torch.classes.fused_moe_sm100.FusedMoeRunner(
+                    torch.classes.fused_moe.FusedMoeRunner(
                         x_dtype,
                         weight_dtype,
                         output_dtype,
                         use_deepseek_fp8_block_scale,
-                        use_w4a8_group_scaling,
+                        use_w4_group_scaling,
                         use_mxfp8_act_scaling,
                     )
                 )
@@ -496,7 +447,7 @@ def cutlass_fused_moe(
         cluster_rank: int = 0,
         enable_alltoall: bool = False,
         use_deepseek_fp8_block_scale: bool = False,
-        use_w4a8_group_scaling: bool = False,
+        use_w4_group_scaling: bool = False,
         use_mxfp8_act_scaling: bool = False,
         min_latency_mode: bool = False,
         tune_max_num_tokens: int = 8192,
@@ -518,7 +469,7 @@ def cutlass_fused_moe(
             cluster_rank=cluster_rank,
             enable_alltoall=enable_alltoall,
             use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale,
-            use_w4a8_group_scaling=use_w4a8_group_scaling,
+            use_w4_group_scaling=use_w4_group_scaling,
             use_mxfp8_act_scaling=use_mxfp8_act_scaling,
             min_latency_mode=min_latency_mode,
         )
@@ -607,7 +558,7 @@ def _fake_cutlass_fused_moe(
         cluster_rank: int = 0,
         enable_alltoall: bool = False,
         use_deepseek_fp8_block_scale: bool = False,
-        use_w4a8_group_scaling: bool = False,
+        use_w4_group_scaling: bool = False,
         use_mxfp8_act_scaling: bool = False,
         min_latency_mode: bool = False,
         tune_max_num_tokens: int = 8192,
@@ -659,7 +610,7 @@ def cutlass_fused_moe(
     output: Optional[torch.Tensor] = None,
     enable_alltoall: bool = False,
     use_deepseek_fp8_block_scale: bool = False,
-    use_w4a8_group_scaling: bool = False,
+    use_w4_group_scaling: bool = False,
     use_mxfp8_act_scaling: bool = False,
     min_latency_mode: bool = False,
     tune_max_num_tokens: int = 8192,
@@ -746,7 +697,7 @@ def cutlass_fused_moe(
     use_deepseek_fp8_block_scale : bool = False
         Whether to use FP8 block scaling. Defaults to False.
 
-    use_w4a8_group_scaling : bool = False
+    use_w4_group_scaling : bool = False
         Whether to use W4A8 group scaling. Defaults to False.
 
     use_mxfp8_act_scaling : bool = False
@@ -783,10 +734,6 @@ def cutlass_fused_moe(
         raise NotImplementedError(
             "DeepSeek FP8 Block Scaling is not yet implemented in CUTLASS for Blackwell."
         )
-    if use_w4a8_group_scaling:
-        raise NotImplementedError(
-            "W4A8 Group Scaling is not yet implemented for Blackwell."
-        )
     if min_latency_mode:
         raise NotImplementedError("min latency mode not yet implemented for Blackwell.")
 
@@ -826,7 +773,7 @@ def cutlass_fused_moe(
         cluster_rank,
         enable_alltoall=enable_alltoall,
         use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale,
-        use_w4a8_group_scaling=use_w4a8_group_scaling,
+        use_w4_group_scaling=use_w4_group_scaling,
         use_mxfp8_act_scaling=use_mxfp8_act_scaling,
         min_latency_mode=min_latency_mode,
         tune_max_num_tokens=tune_max_num_tokens,
diff --git a/flashinfer/jit/core.py b/flashinfer/jit/core.py
index 7b1e1a110..16ab5f0a7 100644
--- a/flashinfer/jit/core.py
+++ b/flashinfer/jit/core.py
@@ -59,12 +59,12 @@ def clear_cache_dir():
         shutil.rmtree(jit_env.FLASHINFER_JIT_DIR)
 
 
-sm90a_nvcc_flags = ["-gencode=arch=compute_90a,code=sm_90a"]
-sm100a_nvcc_flags = [
-    "-gencode=arch=compute_100a,code=sm_100a",
+common_nvcc_flags = [
     "-DFLASHINFER_ENABLE_FP8_E8M0",
     "-DFLASHINFER_ENABLE_FP4_E2M1",
 ]
+sm90a_nvcc_flags = ["-gencode=arch=compute_90a,code=sm_90a"] + common_nvcc_flags
+sm100a_nvcc_flags = ["-gencode=arch=compute_100a,code=sm_100a"] + common_nvcc_flags
 
 
 @dataclasses.dataclass
diff --git a/flashinfer/jit/cutlass_gemm/generate_kernels.py b/flashinfer/jit/cutlass_gemm/generate_kernels.py
index c6133d36a..8215042c9 100644
--- a/flashinfer/jit/cutlass_gemm/generate_kernels.py
+++ b/flashinfer/jit/cutlass_gemm/generate_kernels.py
@@ -1,6 +1,6 @@
 import enum
 import os
-from itertools import product
+from itertools import chain, product
 
 from .cutlass_library import *
 
@@ -101,6 +101,9 @@ def GetDataTypeNames(type, is_mx_fpx=None):
     DataType.bf16: "__nv_bfloat16",
     DataType.f16: "half",
     DataType.f32: "float",
+    DataType.e2m1: "__nv_fp4_e2m1",
+    DataType.ue8m0: "cutlass::float_ue8m0_t",
+    DataType.u4: "cutlass::uint4b_t",
 }
 
 
@@ -217,7 +220,7 @@ def instantiate_operation_tma_warp_specialized(operation):
             operation.act_type != DataType.e4m3 or operation.weight_type != e2m1
         ):
             # Mixed MoE GEMM
-            weight_tag = DataTypeTag[operation.weight_type]
+            weight_tag = CudaTypeName[operation.weight_type]
             instantiation = f"""
 template void sm90_generic_mixed_moe_gemm_kernelLauncher<{act_tag}, {weight_tag}, {out_tag},
 {epi_tag}, {cute_cta_shape}, {cute_cga_shape}, {kernel_sched}, {epi_sched}, {quant_op}> (
@@ -577,10 +580,16 @@ def generate_sm90_mixed_type_grouped_gemm_operations(is_arch_enabled):
     if not is_arch_enabled:
         return []
     arch = 90
-    supported_dtypes = [
+
+    # act_type, weight_type, scalezero_type, bias_type, output_type
+    supported_dtypes_int4 = [
         (DataType.e4m3, DataType.u4, DataType.f16, DataType.f16, DataType.f16),
         (DataType.e4m3, DataType.u4, DataType.bf16, DataType.bf16, DataType.bf16),
     ]
+    supported_dtypes_fp4 = [
+        (DataType.f16, DataType.e2m1, DataType.ue8m0, DataType.f16, DataType.f16),
+        (DataType.bf16, DataType.e2m1, DataType.ue8m0, DataType.bf16, DataType.bf16),
+    ]
 
     quant_ops = [TrtLlm_QuantOp.finegrained_scale_only]
 
@@ -589,16 +598,26 @@ def generate_sm90_mixed_type_grouped_gemm_operations(is_arch_enabled):
     M_TILES = [64, 128]  # Currently M tile must be 128 for Grouped GEMM
     N_TILES = [16, 32, 64, 128]
     K_TILES = [128, 256, 512]
-    cta_shapes_mnk = list(product(M_TILES, N_TILES, K_TILES))
+    cta_shapes_mnk_int4 = list(product(M_TILES, N_TILES, K_TILES))
+
+    M_TILES = [64, 128]  # Currently M tile must be 128 for Grouped GEMM
+    N_TILES = [16, 32, 64]
+    K_TILES = [128, 256]
+    cta_shapes_mnk_fp4 = list(product(M_TILES, N_TILES, K_TILES))
+    cta_shapes_mnk_fp4.append((128, 128, 128))
 
     warp_shape = [0, 0, 0]  # ignored except for naming
     stages = 0  # auto
 
-    cga_shapes = product([1, 2], [1, 2], [1])
+    cga_shapes = list(product([1, 2], [1, 2], [1]))
 
-    partial_args = product(
-        supported_dtypes, quant_ops, epi_tags, cta_shapes_mnk, cga_shapes
+    partial_args_int4 = product(
+        supported_dtypes_int4, quant_ops, epi_tags, cta_shapes_mnk_int4, cga_shapes
+    )
+    partial_args_fp4 = product(
+        supported_dtypes_fp4, quant_ops, epi_tags, cta_shapes_mnk_fp4, cga_shapes
     )
+    partial_args = chain(partial_args_int4, partial_args_fp4)
 
     operations = list()
     for dtype_combo, quant_op, epi_tag, cta_shape_mnk, cga_shape in partial_args:
diff --git a/flashinfer/utils.py b/flashinfer/utils.py
index ccb0579c5..3454e8131 100644
--- a/flashinfer/utils.py
+++ b/flashinfer/utils.py
@@ -417,6 +417,13 @@ def version_at_least(version: str, base_version: str) -> bool:
     return pkg_version.parse(version) >= pkg_version.parse(base_version)
 
 
+@functools.cache
+def get_device_arch():
+    major, minor = torch.cuda.get_device_capability()
+    suffix = "a" if major >= 9 else ""
+    return f"{major * 10 + minor}{suffix}"
+
+
 def is_sm90a_supported(device: torch.device) -> bool:
     major, _ = get_compute_capability(device)
     return major == 9 and version_at_least(torch.version.cuda, "12.3")
diff --git a/tests/test_fp4_quantize.py b/tests/test_fp4_quantize.py
index 6a240d771..a3eb8b362 100644
--- a/tests/test_fp4_quantize.py
+++ b/tests/test_fp4_quantize.py
@@ -8,6 +8,8 @@
     block_scale_interleave,
     e2m1_and_ufp8sf_scale_to_float,
     fp4_quantize,
+    mxfp4_quantize,
+    mxfp4_dequantize,
 )
 from flashinfer.utils import is_sm100a_supported
 
@@ -276,5 +278,19 @@ def test_e2m1_dequantization(
     )
 
 
+def test_mxfp4_quantize_roundtrip():
+    x = torch.randn((128, 64), device="cuda", dtype=torch.bfloat16) / 10
+
+    quant_a, sfs = mxfp4_quantize(x)
+    dq_a = mxfp4_dequantize(quant_a, sfs)
+
+    print("x: ", x)
+    print("dq_a: ", dq_a)
+
+    torch.testing.assert_close(
+        dq_a, x, rtol=0.3, atol=0.5, msg="Quantize -> dequantize mxfp4 roundtrip failed"
+    )
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])
diff --git a/tests/test_groupwise_scaled_gemm_mxfp4.py b/tests/test_groupwise_scaled_gemm_mxfp4.py
index c3decd905..2785dda2a 100644
--- a/tests/test_groupwise_scaled_gemm_mxfp4.py
+++ b/tests/test_groupwise_scaled_gemm_mxfp4.py
@@ -24,7 +24,7 @@
 
 from flashinfer.fp4_quantization import (
     _pad_scale_factors,
-    get_fp4_quantization_sm100_module,
+    get_fp4_quantization_module,
 )
 from flashinfer.gemm import group_gemm_mxfp4_nt_groupwise
 
@@ -64,7 +64,7 @@ def swizzle_blockscale(
         _pad_scale_factors(unswizzled_sf[i], m, n, sf_vec_size) for i in range(b)
     ]
     padded_input_sf = torch.stack(padded_input_sf_chunked)
-    out = get_fp4_quantization_sm100_module().nvfp4_block_scale_interleave_sm100(
+    out = get_fp4_quantization_module().nvfp4_block_scale_interleave_sm100(
         padded_input_sf
     )
     out = out.view(padded_input_sf.shape)
diff --git a/tests/test_trtllm_cutlass_fused_moe.py b/tests/test_trtllm_cutlass_fused_moe.py
index 72f593105..24b88fa87 100644
--- a/tests/test_trtllm_cutlass_fused_moe.py
+++ b/tests/test_trtllm_cutlass_fused_moe.py
@@ -17,6 +17,7 @@
 import pytest
 import torch
 from torch.nn import functional as F
+from flashinfer.utils import get_device_arch
 
 import flashinfer.fused_moe as fused_moe
 from flashinfer import (
@@ -25,6 +26,7 @@
     mxfp4_quantize,
     mxfp8_dequantize_host,
     mxfp8_quantize,
+    mxfp4_dequantize_host,
 )
 
 FLOAT4_E2M1_MAX = 6.0
@@ -360,6 +362,9 @@ def test_moe_fp8(
     [(torch.float16, torch.float8_e4m3fn), (torch.bfloat16, torch.float8_e4m3fn)],
 )
 @pytest.mark.parametrize("quantized_input", [False, True])
+@pytest.mark.skipif(
+    get_device_arch() != "100a", reason="NVFP4 is only supported on SM100a"
+)
 def test_moe_nvfp4(
     batch_size,
     hidden_size,
@@ -933,6 +938,9 @@ def transform_dim(a: torch.Tensor, dim: int = -1) -> torch.Tensor:
 @pytest.mark.parametrize("num_experts", NUM_EXPERTS)
 @pytest.mark.parametrize("top_k", TOP_K_VALUES)
 @pytest.mark.parametrize("intermediate_size", INTERMEDIATE_SIZES)
+@pytest.mark.skipif(
+    get_device_arch() != "100a", reason="FP8 block scaling is only supported on SM100a"
+)
 def test_moe_fp8_block_scaling(
     batch_size, hidden_size, num_experts, top_k, intermediate_size
 ):
@@ -1083,6 +1091,9 @@ def dequant_mxfp4_batches(
 @pytest.mark.parametrize(
     ("alpha", "beta", "limit"), [(None, None, None), (0.5, 0.0, 7.0), (1.702, 1.0, 7.0)]
 )
+@pytest.mark.skipif(
+    get_device_arch() != "100a", reason="MXFP8xMXFP4 is only supported on SM100a"
+)
 def test_moe_mxfp8_mxfp4(
     batch_size,
     hidden_size,
@@ -1203,6 +1214,18 @@ def test_moe_mxfp8_mxfp4(
     torch.testing.assert_close(ref_output, flash_output, rtol=1e-1, atol=1e-1)
 
 
+def dequant_mxfp4_batches_host(
+    mat_fp4: torch.Tensor,
+    scale_tensor: torch.Tensor,
+):
+    return torch.stack(
+        [
+            mxfp4_dequantize_host(mat_fp4[b, :, :], scale_tensor[b, :, :])
+            for b in range(mat_fp4.size(0))
+        ]
+    )
+
+
 @pytest.mark.parametrize("batch_size", BATCH_SIZES)
 @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
 @pytest.mark.parametrize("num_experts", NUM_EXPERTS)
@@ -1211,6 +1234,9 @@ def test_moe_mxfp8_mxfp4(
 @pytest.mark.parametrize(
     ("alpha", "beta", "limit"), [(None, None, None), (0.5, 0.0, 7.0), (1.702, 1.0, 7.0)]
 )
+@pytest.mark.skipif(
+    get_device_arch() != "90a", reason="BF16xMXFP4 is only supported on SM90a"
+)
 def test_moe_bf16_mxfp4(
     batch_size,
     hidden_size,
@@ -1238,24 +1264,19 @@ def test_moe_bf16_mxfp4(
     k = hidden_size
 
     x = torch.randn(m, k, dtype=torch.bfloat16).cuda()
-    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=torch.bfloat16) / 10
-    w2 = torch.randn((e, k, n), device="cuda", dtype=torch.bfloat16) / 10
+    w1 = torch.randint(0, 256, (e, 2 * n, k // 2), device="cuda", dtype=torch.uint8)
+    w2 = torch.randint(0, 256, (e, k, n // 2), device="cuda", dtype=torch.uint8)
 
-    mxfp4_w1, mxfp4_w1_scale = quant_mxfp4_batches(w1, e)
-    mxfp4_w2, mxfp4_w2_scale = quant_mxfp4_batches(w2, e)
+    w1_scale = torch.randint(
+        118, 123, (e, 2 * n, k // 32), device="cuda", dtype=torch.uint8
+    )
+    w2_scale = torch.randint(
+        118, 123, (e, k, n // 32), device="cuda", dtype=torch.uint8
+    )
 
     router_logits = torch.randn(m, e, dtype=torch.bfloat16).cuda()
     routing_weights, selected_experts = compute_routing(router_logits, top_k)
 
-    fake_input_scale = torch.ones(e, device=x.device)
-
-    quant_scales = [
-        mxfp4_w1_scale.view(torch.int32),
-        fake_input_scale,
-        mxfp4_w2_scale.view(torch.int32),
-        fake_input_scale,
-    ]
-
     flash_output = torch.zeros_like(x)
 
     if alpha is not None and limit is not None and beta is not None:
@@ -1267,34 +1288,43 @@ def test_moe_bf16_mxfp4(
         limit_t = None
         beta_t = None
 
-    # Call cutlass_fused_moe with MXFP8 activations and MXFP4 weights
+    pad_size = hidden_size - x.shape[1]
+    x_pad = torch.nn.functional.pad(x, (0, pad_size))
+
+    quant_scales = [
+        w1_scale.view(torch.int32),
+        w2_scale.view(torch.int32),
+    ]
+
+    # Call cutlass_fused_moe with BF16 activations and MXFP4 weights
     _ = fused_moe.cutlass_fused_moe(
-        x,
+        x_pad,
         selected_experts.to(torch.int),
         routing_weights,
-        mxfp4_w1.contiguous().view(torch.long),
-        mxfp4_w2.contiguous().view(torch.long),
+        w1.contiguous().view(torch.uint8),
+        w2.contiguous().view(torch.uint8),
         torch.bfloat16,
         swiglu_alpha=alpha_t,
         swiglu_limit=limit_t,
         swiglu_beta=beta_t,
         quant_scales=quant_scales,
+        use_w4_group_scaling=True,
         output=flash_output,
     )
 
     dq_mfxp4_w1 = (
-        dequant_mxfp4_batches(
-            mxfp4_w1.cpu().view(torch.uint8),
-            mxfp4_w1_scale.cpu().view(torch.uint8).reshape(-1),
+        dequant_mxfp4_batches_host(
+            w1.cpu(),
+            w1_scale.cpu(),
         )
         .cuda()
         .to(torch.bfloat16)
     )
 
     dq_mfxp4_w2 = (
-        dequant_mxfp4_batches(
-            mxfp4_w2.cpu().view(torch.uint8),
-            mxfp4_w2_scale.cpu().view(torch.uint8).reshape(-1),
+        dequant_mxfp4_batches_host(
+            w2.cpu(),
+            w2_scale.cpu(),
         )
         .cuda()
         .to(torch.bfloat16)
diff --git a/tests/test_trtllm_gen_fused_moe.py b/tests/test_trtllm_gen_fused_moe.py
index f82b61cce..5d30e911f 100644
--- a/tests/test_trtllm_gen_fused_moe.py
+++ b/tests/test_trtllm_gen_fused_moe.py
@@ -31,7 +31,7 @@
     reorder_rows_for_gated_act_gemm,
     shuffle_matrix_a,
 )
-from flashinfer.fp4_quantization import nvfp4_block_scale_interleave
+from flashinfer.fp4_quantization import block_scale_interleave
 from flashinfer.fused_moe import (
     WeightLayout,
     convert_to_block_layout,
@@ -418,7 +418,7 @@ def prepare_static_weights_for_kernel(
                 num_elts_per_sf=16,
             )
             gemm1_scales_fp4_shuffled.append(
-                nvfp4_block_scale_interleave(
+                block_scale_interleave(
                     gemm1_scales_linear_fp4[i]
                     .view(torch.uint8)[
                         permute_sf_indices.to(gemm1_scales_linear_fp4.device)
@@ -445,7 +445,7 @@ def prepare_static_weights_for_kernel(
                 num_elts_per_sf=16,
             )
             gemm2_scales_fp4_shuffled.append(
-                nvfp4_block_scale_interleave(
+                block_scale_interleave(
                     gemm2_scales_linear_fp4[i]
                     .view(torch.uint8)[
                         permute_sf_indices.to(gemm2_scales_linear_fp4.device)

From f6f01c4ac5538286e8a5533ce63ae8d32a0c97ea Mon Sep 17 00:00:00 2001
From: Duncan Moss <djm.moss@gmail.com>
Date: Thu, 14 Aug 2025 10:03:51 -0700
Subject: [PATCH 06/12] addressing PR comments

---
 flashinfer/__init__.py           |  1 +
 flashinfer/aot.py                | 10 +++--
 flashinfer/fp4_quantization.py   | 60 ++++++++++++++++++--------
 flashinfer/fused_moe/__init__.py |  6 ++-
 flashinfer/fused_moe/core.py     | 73 ++++++++++++++++++++++----------
 5 files changed, 105 insertions(+), 45 deletions(-)

diff --git a/flashinfer/__init__.py b/flashinfer/__init__.py
index 85f6dd463..2739740b0 100644
--- a/flashinfer/__init__.py
+++ b/flashinfer/__init__.py
@@ -51,6 +51,7 @@
 from .fp4_quantization import (
     SfLayout,
     block_scale_interleave,
+    nvfp4_block_scale_interleave,
     e2m1_and_ufp8sf_scale_to_float,
     fp4_quantize,
     mxfp4_dequantize_host,
diff --git a/flashinfer/aot.py b/flashinfer/aot.py
index c4e0e6fb7..2f0e5dbef 100644
--- a/flashinfer/aot.py
+++ b/flashinfer/aot.py
@@ -13,7 +13,10 @@
 from .cascade import gen_cascade_module
 from .comm.nvshmem import gen_nvshmem_module
 from .fp4_quantization import gen_fp4_quantization_module
-from .fused_moe import gen_cutlass_fused_moe_module
+from .fused_moe import (
+    gen_cutlass_fused_moe_sm100_module,
+    gen_cutlass_fused_moe_sm90_module,
+)
 from .gemm import gen_gemm_module, gen_gemm_sm90_module, gen_gemm_sm100_module
 from .jit import JitSpec, build_jit_specs
 from .jit import env as jit_env
@@ -363,11 +366,12 @@ def gen_all_modules(
 
     if add_moe:
         jit_specs.append(gen_gemm_module())
+        jit_specs.append(gen_fp4_quantization_module())
         if has_sm90:
             jit_specs.append(gen_gemm_sm90_module())
+            jit_specs.append(gen_cutlass_fused_moe_sm90_module())
         if has_sm100:
-            jit_specs.append(gen_cutlass_fused_moe_module())
-            jit_specs.append(gen_fp4_quantization_module())
+            jit_specs.append(gen_cutlass_fused_moe_sm100_module())
             jit_specs.append(gen_gemm_sm100_module())
 
     if add_comm:
diff --git a/flashinfer/fp4_quantization.py b/flashinfer/fp4_quantization.py
index 0d6c5fa60..7f2de3ed6 100644
--- a/flashinfer/fp4_quantization.py
+++ b/flashinfer/fp4_quantization.py
@@ -23,7 +23,7 @@
 
 from .jit import JitSpec
 from .jit import env as jit_env
-from .jit import gen_jit_spec, sm100a_nvcc_flags, sm90a_nvcc_flags
+from .jit import gen_jit_spec, sm100a_nvcc_flags
 from .utils import (
     device_support_pdl,
     get_shuffle_matrix_a_row_indices,
@@ -64,21 +64,6 @@ def _pad_scale_factors(
 
 
 def gen_fp4_quantization_module() -> JitSpec:
-    nvcc_flags = [
-        "-DENABLE_BF16",
-        "-DENABLE_FP8",
-        "-DENABLE_FP4",
-    ]
-
-    if get_device_arch() == "100a":
-        nvcc_flags += sm100a_nvcc_flags
-    elif get_device_arch() == "90a":
-        nvcc_flags += sm90a_nvcc_flags
-    else:
-        raise NotImplementedError(
-            f"Unsupported device architecture: {torch.cuda.get_device_capability()}"
-        )
-
     return gen_jit_spec(
         "fp4_quantization",
         [
@@ -91,7 +76,12 @@ def gen_fp4_quantization_module() -> JitSpec:
             jit_env.FLASHINFER_CSRC_DIR / "nv_internal/cpp/common/stringUtils.cpp",
             jit_env.FLASHINFER_CSRC_DIR / "nv_internal/cpp/common/tllmException.cpp",
         ],
-        extra_cuda_cflags=nvcc_flags,
+        extra_cuda_cflags=sm100a_nvcc_flags
+        + [
+            "-DENABLE_BF16",
+            "-DENABLE_FP8",
+            "-DENABLE_FP4",
+        ],
         extra_cflags=[
             "-DENABLE_BF16",
             "-DENABLE_FP8",
@@ -374,6 +364,10 @@ def block_scale_interleave(unswizzled_sf: torch.Tensor) -> torch.Tensor:
     )
 
 
+# Maintain compatibility with libraries using the old name
+nvfp4_block_scale_interleave = block_scale_interleave
+
+
 def e2m1_and_ufp8sf_scale_to_float(
     e2m1_tensor: torch.Tensor,
     ufp8_scale_tensor: torch.Tensor,
@@ -523,12 +517,33 @@ def nvfp4_quantize(
 
 
 def mxfp4_quantize(a):
+    """
+    Quantize input tensor to MXFP4 format.
+
+    Parameters:
+        a (torch.Tensor): Input tensor of shape [M, K] with dtype fp16/bf16.
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
+            - Quantized tensor of shape [M, K/2] with dtype uint8 (FLOAT4_E2M1X2)
+            - Scale factors tensor (uint8), one scale per block of 32 elements (the sf_vec_size passed to fp4_quantize)
+    """
     a_global_sf = (448 * 6) / a.float().abs().nan_to_num().max()
     a_fp4, a_sf = fp4_quantize(a.cuda(), a_global_sf.cuda(), 32, True, True)
     return a_fp4, a_sf
 
 
 def mxfp4_dequantize(a_fp4, a_sf):
+    """
+    Dequantize input tensor from MXFP4 format.
+
+    Parameters:
+        a_fp4 (torch.Tensor): Quantized tensor of shape [M, K/2] with dtype uint8 (FLOAT4_E2M1X2)
+        a_sf (torch.Tensor): Scale factors tensor with shape determined by layout and sf_vec_size (uint8)
+
+    Returns:
+        torch.Tensor: Dequantized tensor of shape [M, K] with dtype float.
+    """
     return e2m1_and_ufp8sf_scale_to_float(
         a_fp4.cpu().view(torch.uint8),
         a_sf.cpu().view(torch.uint8).reshape(-1),
@@ -544,6 +559,17 @@ def mxfp4_dequantize_host(
     scale: torch.Tensor,
     group_size: int = 32,
 ) -> torch.Tensor:
+    """
+    Dequantize input tensor from MXFP4 format on host.
+
+    Parameters:
+        weight (torch.Tensor): Quantized tensor of shape [M, K/2] with dtype uint8 (FLOAT4_E2M1X2)
+        scale (torch.Tensor): Scale factors tensor of shape [M, K/group_size] with dtype uint8 (one scale per group)
+        group_size (int, optional): Group size for dequantization. Defaults to 32.
+
+    Returns:
+        torch.Tensor: Dequantized tensor of shape [M, K] with dtype float.
+    """
     return get_fp4_quantization_module().mxfp4_dequantize_host(
         weight,
         scale,
diff --git a/flashinfer/fused_moe/__init__.py b/flashinfer/fused_moe/__init__.py
index 1bd23326b..f7319d1fd 100644
--- a/flashinfer/fused_moe/__init__.py
+++ b/flashinfer/fused_moe/__init__.py
@@ -19,7 +19,8 @@
     WeightLayout,
     convert_to_block_layout,
     cutlass_fused_moe,
-    gen_cutlass_fused_moe_module,
+    gen_cutlass_fused_moe_sm100_module,
+    gen_cutlass_fused_moe_sm90_module,
     reorder_rows_for_gated_act_gemm,
     trtllm_fp4_block_scale_moe,
     trtllm_fp4_block_scale_routed_moe,
@@ -32,7 +33,8 @@
     "WeightLayout",
     "convert_to_block_layout",
     "cutlass_fused_moe",
-    "gen_cutlass_fused_moe_module",
+    "gen_cutlass_fused_moe_sm100_module",
+    "gen_cutlass_fused_moe_sm90_module",
     "reorder_rows_for_gated_act_gemm",
     "trtllm_fp4_block_scale_moe",
     "trtllm_fp8_block_scale_moe",
diff --git a/flashinfer/fused_moe/core.py b/flashinfer/fused_moe/core.py
index 5fa10c67a..76f7301cf 100644
--- a/flashinfer/fused_moe/core.py
+++ b/flashinfer/fused_moe/core.py
@@ -171,31 +171,39 @@ def convert_to_block_layout(input_tensor: torch.Tensor, blockK: int) -> torch.Te
     return input_tensor.view(M, K // blockK, blockK).permute(1, 0, 2).contiguous()
 
 
-def gen_cutlass_fused_moe_module(use_fast_build: bool = False) -> JitSpec:
-    output_dir = (
-        jit_env.FLASHINFER_CSRC_DIR / "nv_internal/tensorrt_llm/cutlass_instantiations/"
-    )
+def gen_cutlass_fused_moe_sm100_module(use_fast_build: bool = False) -> JitSpec:
+    nvcc_flags = sm100a_nvcc_flags + [
+        "-DCOMPILE_BLACKWELL_TMA_GEMMS",
+        "-DCOMPILE_BLACKWELL_TMA_GROUPED_GEMMS",
+        "-DENABLE_BF16",
+        "-DENABLE_FP8",
+        "-DENABLE_FP4",
+        "-DUSING_OSS_CUTLASS_MOE_GEMM",
+    ]
+    return gen_cutlass_fused_moe_module(nvcc_flags, "100", use_fast_build)
 
-    # Deive Arch
-    major, minor = torch.cuda.get_device_capability()
-    device_arch = f"{major * 10 + minor}"
 
-    nvcc_flags = [
+def gen_cutlass_fused_moe_sm90_module(use_fast_build: bool = False) -> JitSpec:
+    nvcc_flags = sm90a_nvcc_flags + [
+        "-DCOMPILE_HOPPER_TMA_GEMMS",
+        "-DCOMPILE_HOPPER_TMA_GROUPED_GEMMS",
         "-DENABLE_BF16",
         "-DENABLE_FP8",
         "-DENABLE_FP4",
         "-DUSING_OSS_CUTLASS_MOE_GEMM",
     ]
-    if device_arch == "100":
-        nvcc_flags += sm100a_nvcc_flags + [
-            "-DCOMPILE_BLACKWELL_TMA_GEMMS",
-            "-DCOMPILE_BLACKWELL_TMA_GROUPED_GEMMS",
-        ]
-    else:
-        nvcc_flags += sm90a_nvcc_flags + [
-            "-DCOMPILE_HOPPER_TMA_GEMMS",
-            "-DCOMPILE_HOPPER_TMA_GROUPED_GEMMS",
-        ]
+    return gen_cutlass_fused_moe_module(nvcc_flags, "90", use_fast_build)
+
+
+def gen_cutlass_fused_moe_module(
+    nvcc_flags: List[str], device_arch: str, use_fast_build: bool = False
+) -> JitSpec:
+    """
+    Generate a JitSpec for the cutlass fused moe module.
+    """
+    output_dir = (
+        jit_env.FLASHINFER_CSRC_DIR / "nv_internal/tensorrt_llm/cutlass_instantiations/"
+    )
 
     try:
         # Create output directory if it doesn't exist
@@ -289,10 +297,17 @@ def gen_cutlass_fused_moe_module(use_fast_build: bool = False) -> JitSpec:
 
 
 @functools.cache
-def get_cutlass_fused_moe_module(use_fast_build: bool = False):
-    FusedMoeRunner = gen_cutlass_fused_moe_module(use_fast_build).build_and_load(
-        class_name="FusedMoeRunner"
-    )
+def get_cutlass_fused_moe_module(backend: str = "100", use_fast_build: bool = False):
+    if backend == "100":
+        FusedMoeRunner = gen_cutlass_fused_moe_sm100_module(
+            use_fast_build
+        ).build_and_load(class_name="FusedMoeRunner")
+    elif backend == "90":
+        FusedMoeRunner = gen_cutlass_fused_moe_sm90_module(
+            use_fast_build
+        ).build_and_load(class_name="FusedMoeRunner")
+    else:
+        raise ValueError(f"Invalid backend: {backend}")
 
     class MoERunner(TunableRunner):
         # avoid overhead of creating a new runner in forward pass
@@ -678,6 +693,15 @@ def cutlass_fused_moe(
     input_sf : Optional[torch.Tensor]
         Input scaling factor for quantization.
 
+    swiglu_alpha : Optional[torch.Tensor]
+        Swiglu alpha for swiglu activation.
+
+    swiglu_beta : Optional[torch.Tensor]
+        Swiglu beta for swiglu activation.
+
+    swiglu_limit : Optional[torch.Tensor]
+        Swiglu limit for swiglu activation.
+
     tp_size : int = 1
         Tensor parallelism size. Defaults to 1.
 
@@ -761,7 +785,10 @@ def cutlass_fused_moe(
             output, output_shape, output_dtype, input.device, "output"
         )
 
-    return get_cutlass_fused_moe_module().cutlass_fused_moe(
+    major, minor = torch.cuda.get_device_capability()
+    device_arch = f"{major * 10 + minor}"
+
+    return get_cutlass_fused_moe_module(device_arch).cutlass_fused_moe(
         output,
         input,
         token_selected_experts,

From aad97d5c3b522537f9512dadd3fbe5d37545e91c Mon Sep 17 00:00:00 2001
From: Duncan Moss <djm.moss@gmail.com>
Date: Thu, 14 Aug 2025 10:39:21 -0700
Subject: [PATCH 07/12] address pr comments

---
 csrc/nv_internal/tensorrt_llm/thop/fp4Op.cpp  |  6 +-
 .../tensorrt_llm/thop/fp4Quantize.cpp         |  4 +-
 flashinfer/fp4_quantization.py                | 61 ++++++-------------
 tests/test_fp4_quantize.py                    |  9 +--
 4 files changed, 31 insertions(+), 49 deletions(-)

diff --git a/csrc/nv_internal/tensorrt_llm/thop/fp4Op.cpp b/csrc/nv_internal/tensorrt_llm/thop/fp4Op.cpp
index cdb936825..0fbda766e 100644
--- a/csrc/nv_internal/tensorrt_llm/thop/fp4Op.cpp
+++ b/csrc/nv_internal/tensorrt_llm/thop/fp4Op.cpp
@@ -404,8 +404,8 @@ at::Tensor mxfp4_dequantize_host(at::Tensor weight, at::Tensor scale, int64_t gr
 }  // namespace torch_ext
 
 TORCH_LIBRARY_FRAGMENT(TORCH_EXTENSION_NAME, m) {
-  m.def("block_scale_interleave", &torch_ext::BlockScaleInterleave);
-  m.def("block_scale_interleave_reverse", &torch_ext::BlockScaleInterleaveReverse);
-  m.def("e2m1_and_ufp8sf_scale_to_float", &torch_ext::E2M1AndUFP8SFScaleToFloatV2);
+  m.def("block_scale_interleave_sm100", &torch_ext::BlockScaleInterleave);
+  m.def("block_scale_interleave_reverse_sm100", &torch_ext::BlockScaleInterleaveReverse);
+  m.def("e2m1_and_ufp8sf_scale_to_float_sm100", &torch_ext::E2M1AndUFP8SFScaleToFloatV2);
   m.def("mxfp4_dequantize_host", &torch_ext::mxfp4_dequantize_host);
 }
diff --git a/csrc/nv_internal/tensorrt_llm/thop/fp4Quantize.cpp b/csrc/nv_internal/tensorrt_llm/thop/fp4Quantize.cpp
index fb21cf6ac..1de11895f 100644
--- a/csrc/nv_internal/tensorrt_llm/thop/fp4Quantize.cpp
+++ b/csrc/nv_internal/tensorrt_llm/thop/fp4Quantize.cpp
@@ -144,4 +144,6 @@ std::tuple<at::Tensor, at::Tensor> fp4_quantize(at::Tensor const& self,
 }
 }  // namespace torch_ext
 
-TORCH_LIBRARY_FRAGMENT(TORCH_EXTENSION_NAME, m) { m.def("fp4_quantize", &torch_ext::fp4_quantize); }
+TORCH_LIBRARY_FRAGMENT(TORCH_EXTENSION_NAME, m) {
+  m.def("fp4_quantize_sm100", &torch_ext::fp4_quantize);
+}
diff --git a/flashinfer/fp4_quantization.py b/flashinfer/fp4_quantization.py
index 7f2de3ed6..5984d34d5 100644
--- a/flashinfer/fp4_quantization.py
+++ b/flashinfer/fp4_quantization.py
@@ -30,7 +30,6 @@
     get_shuffle_matrix_sf_a_row_indices,
     register_custom_op,
     register_fake_op,
-    get_device_arch,
 )
 
 
@@ -99,10 +98,10 @@ def get_fp4_quantization_module():
     module = gen_fp4_quantization_module().build_and_load()
 
     @register_custom_op(
-        "flashinfer::fp4_quantize",
+        "flashinfer::fp4_quantize_sm100",
         mutates_args=(""),
     )
-    def fp4_quantize(
+    def fp4_quantize_sm100(
         input: torch.Tensor,
         global_scale: Optional[torch.Tensor] = None,
         sf_vec_size: int = 16,
@@ -130,7 +129,7 @@ def fp4_quantize(
         """
         if enable_pdl is None:
             enable_pdl = device_support_pdl(input.device)
-        return module.fp4_quantize(
+        return module.fp4_quantize_sm100(
             input,
             global_scale,
             sf_vec_size,
@@ -140,7 +139,7 @@ def fp4_quantize(
             enable_pdl,
         )
 
-    @register_fake_op("flashinfer::fp4_quantize")
+    @register_fake_op("flashinfer::fp4_quantize_sm100")
     def _fake_fp4_quantize(
         input: torch.Tensor,
         global_scale: Optional[torch.Tensor] = None,
@@ -180,10 +179,10 @@ def _fake_mxfp4_dequantize_host(
         )
 
     @register_custom_op(
-        "flashinfer::block_scale_interleave",
+        "flashinfer::block_scale_interleave_sm100",
         mutates_args=("",),
     )
-    def block_scale_interleave(
+    def block_scale_interleave_sm100(
         unswizzled_sf: torch.Tensor,
     ) -> torch.Tensor:
         """Swizzle block scale tensor for FP4 format.
@@ -194,12 +193,12 @@ def block_scale_interleave(
         Returns:
             torch.Tensor: output tensor for swizzled block scale with dtype uint8.
         """
-        return module.block_scale_interleave(
+        return module.block_scale_interleave_sm100(
             unswizzled_sf,
         )
 
-    @register_fake_op("flashinfer::block_scale_interleave")
-    def _fake_block_scale_interleave(
+    @register_fake_op("flashinfer::block_scale_interleave_sm100")
+    def _fake_block_scale_interleave_sm100(
         unswizzled_sf: torch.Tensor,
     ) -> torch.Tensor:
         return unswizzled_sf.new_empty(
@@ -207,10 +206,10 @@ def _fake_block_scale_interleave(
         )
 
     @register_custom_op(
-        "flashinfer::e2m1_and_ufp8sf_scale_to_float",
+        "flashinfer::e2m1_and_ufp8sf_scale_to_float_sm100",
         mutates_args=(""),
     )
-    def e2m1_and_ufp8sf_scale_to_float(
+    def e2m1_and_ufp8sf_scale_to_float_sm100(
         e2m1_tensor: torch.Tensor,
         ufp8_scale_tensor: torch.Tensor,
         global_scale_tensor: Optional[torch.Tensor] = None,
@@ -234,7 +233,7 @@ def e2m1_and_ufp8sf_scale_to_float(
         Returns:
             torch.Tensor: Dequantized float tensor of shape [M, K] with dtype float32.
         """
-        return module.e2m1_and_ufp8sf_scale_to_float(
+        return module.e2m1_and_ufp8sf_scale_to_float_sm100(
             e2m1_tensor.cpu(),
             ufp8_scale_tensor.cpu().reshape(-1),
             global_scale_tensor.cpu(),
@@ -243,8 +242,8 @@ def e2m1_and_ufp8sf_scale_to_float(
             is_sf_swizzled_layout,
         )
 
-    @register_fake_op("flashinfer::e2m1_and_ufp8sf_scale_to_float")
-    def _fake_e2m1_and_ufp8sf_scale_to_float(
+    @register_fake_op("flashinfer::e2m1_and_ufp8sf_scale_to_float_sm100")
+    def _fake_e2m1_and_ufp8sf_scale_to_float_sm100(
         e2m1_tensor: torch.Tensor,
         ufp8_scale_tensor: torch.Tensor,
         global_scale_tensor: Optional[torch.Tensor] = None,
@@ -258,9 +257,9 @@ def _fake_e2m1_and_ufp8sf_scale_to_float(
 
     # Register the module
     return SimpleNamespace(
-        fp4_quantize=fp4_quantize,
-        block_scale_interleave=block_scale_interleave,
-        e2m1_and_ufp8sf_scale_to_float=e2m1_and_ufp8sf_scale_to_float,
+        fp4_quantize_sm100=fp4_quantize_sm100,
+        block_scale_interleave_sm100=block_scale_interleave_sm100,
+        e2m1_and_ufp8sf_scale_to_float_sm100=e2m1_and_ufp8sf_scale_to_float_sm100,
         mxfp4_dequantize_host=mxfp4_dequantize_host,
     )
 
@@ -302,11 +301,6 @@ def fp4_quantize(
     """
 
     # check to make sure device is supported
-    if get_device_arch() != "100a":
-        raise NotImplementedError(
-            f"Unsupported device architecture: {get_device_arch()}"
-        )
-
     if sf_vec_size != 16 and sf_vec_size != 32:
         raise NotImplementedError("sf_vec_size can only be 16 or 32")
 
@@ -318,7 +312,7 @@ def fp4_quantize(
     assert input.shape[-1] % sf_vec_size == 0
     if enable_pdl is None:
         enable_pdl = device_support_pdl(input.device)
-    x_q, sf = get_fp4_quantization_module().fp4_quantize(
+    x_q, sf = get_fp4_quantization_module().fp4_quantize_sm100(
         input,
         global_scale,
         sf_vec_size,
@@ -350,16 +344,11 @@ def block_scale_interleave(unswizzled_sf: torch.Tensor) -> torch.Tensor:
     Raises:
         AssertionError: If input dtype is not uint8.
     """
-    if get_device_arch() != "100a":
-        raise NotImplementedError(
-            f"Unsupported device architecture: {get_device_arch()}"
-        )
-
     # TODO(shuw): check input dtype is uint8
     assert unswizzled_sf.dtype == torch.uint8, (
         f"Input dtype must be uint8, got {unswizzled_sf.dtype}"
     )
-    return get_fp4_quantization_module().block_scale_interleave(
+    return get_fp4_quantization_module().block_scale_interleave_sm100(
         unswizzled_sf,
     )
 
@@ -393,12 +382,7 @@ def e2m1_and_ufp8sf_scale_to_float(
         torch.Tensor: Dequantized float tensor of shape [M, K] with dtype float32.
 
     """
-    if get_device_arch() != "100a":
-        raise NotImplementedError(
-            f"Unsupported device architecture: {get_device_arch()}"
-        )
-
-    return get_fp4_quantization_module().e2m1_and_ufp8sf_scale_to_float(
+    return get_fp4_quantization_module().e2m1_and_ufp8sf_scale_to_float_sm100(
         e2m1_tensor,
         ufp8_scale_tensor,
         global_scale_tensor,
@@ -477,11 +461,6 @@ def nvfp4_quantize(
             - Scale factors tensor with shape determined by layout and sf_vec_size
     """
 
-    if get_device_arch() != "100a":
-        raise NotImplementedError(
-            f"Unsupported device architecture: {get_device_arch()}"
-        )
-
     if do_shuffle:
         # Weights 128x4 + shuffle. It is done during the model load and we do not care much about the perf
         assert sfLayout == SfLayout.layout_128x4
diff --git a/tests/test_fp4_quantize.py b/tests/test_fp4_quantize.py
index efed07234..5bc6d0f7f 100644
--- a/tests/test_fp4_quantize.py
+++ b/tests/test_fp4_quantize.py
@@ -301,11 +301,12 @@ def test_mxfp4_quantize_roundtrip():
     quant_a, sfs = mxfp4_quantize(x)
     dq_a = mxfp4_dequantize(quant_a, sfs)
 
-    print("x: ", x)
-    print("dq_a: ", dq_a)
-
     torch.testing.assert_close(
-        dq_a, x, rtol=0.3, atol=0.5, msg="Quantize -> dequantize mxfp4 roundtrip failed"
+        dq_a.cpu().to(torch.float32),
+        x.cpu().to(torch.float32),
+        rtol=0.3,
+        atol=0.5,
+        msg="Quantize -> dequantize mxfp4 roundtrip failed",
     )
 
 

From e8d94a8c980f25bcd420869a32c3311badef79e7 Mon Sep 17 00:00:00 2001
From: Duncan Moss <djm.moss@gmail.com>
Date: Thu, 14 Aug 2025 10:42:43 -0700
Subject: [PATCH 08/12] minor: restore fp4_quantize C++ op name, rename fake op helper

---
 csrc/nv_internal/tensorrt_llm/thop/fp4Quantize.cpp | 4 +---
 flashinfer/fp4_quantization.py                     | 4 ++--
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/csrc/nv_internal/tensorrt_llm/thop/fp4Quantize.cpp b/csrc/nv_internal/tensorrt_llm/thop/fp4Quantize.cpp
index 1de11895f..fb21cf6ac 100644
--- a/csrc/nv_internal/tensorrt_llm/thop/fp4Quantize.cpp
+++ b/csrc/nv_internal/tensorrt_llm/thop/fp4Quantize.cpp
@@ -144,6 +144,4 @@ std::tuple<at::Tensor, at::Tensor> fp4_quantize(at::Tensor const& self,
 }
 }  // namespace torch_ext
 
-TORCH_LIBRARY_FRAGMENT(TORCH_EXTENSION_NAME, m) {
-  m.def("fp4_quantize_sm100", &torch_ext::fp4_quantize);
-}
+TORCH_LIBRARY_FRAGMENT(TORCH_EXTENSION_NAME, m) { m.def("fp4_quantize", &torch_ext::fp4_quantize); }
diff --git a/flashinfer/fp4_quantization.py b/flashinfer/fp4_quantization.py
index 5984d34d5..c22acad45 100644
--- a/flashinfer/fp4_quantization.py
+++ b/flashinfer/fp4_quantization.py
@@ -129,7 +129,7 @@ def fp4_quantize_sm100(
         """
         if enable_pdl is None:
             enable_pdl = device_support_pdl(input.device)
-        return module.fp4_quantize_sm100(
+        return module.fp4_quantize(
             input,
             global_scale,
             sf_vec_size,
@@ -140,7 +140,7 @@ def fp4_quantize_sm100(
         )
 
     @register_fake_op("flashinfer::fp4_quantize_sm100")
-    def _fake_fp4_quantize(
+    def _fake_fp4_quantize_sm100(
         input: torch.Tensor,
         global_scale: Optional[torch.Tensor] = None,
         sf_vec_size: int = 16,

From 5949771b877f230067466433e58e134e715a1235 Mon Sep 17 00:00:00 2001
From: Duncan Moss <djm.moss@gmail.com>
Date: Thu, 14 Aug 2025 10:44:13 -0700
Subject: [PATCH 09/12] remove stale device-support comment in fp4_quantize

---
 flashinfer/fp4_quantization.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/flashinfer/fp4_quantization.py b/flashinfer/fp4_quantization.py
index c22acad45..6e18955cf 100644
--- a/flashinfer/fp4_quantization.py
+++ b/flashinfer/fp4_quantization.py
@@ -299,8 +299,6 @@ def fp4_quantize(
             - FP8 input when FP8 is not enabled
             - sf_vec_size other than 16 or 32
     """
-
-    # check to make sure device is supported
     if sf_vec_size != 16 and sf_vec_size != 32:
         raise NotImplementedError("sf_vec_size can only be 16 or 32")
 

From da991304770cd6a5b4bc49fc8f834be5670b30d9 Mon Sep 17 00:00:00 2001
From: Duncan Moss <djm.moss@gmail.com>
Date: Thu, 14 Aug 2025 12:07:15 -0700
Subject: [PATCH 10/12] device_arch updates

---
 flashinfer/fp8_quantization.py         |  6 ------
 flashinfer/utils.py                    |  7 -------
 tests/test_trtllm_cutlass_fused_moe.py | 13 ++++++++-----
 3 files changed, 8 insertions(+), 18 deletions(-)

diff --git a/flashinfer/fp8_quantization.py b/flashinfer/fp8_quantization.py
index 57b7122cf..86fd3062d 100644
--- a/flashinfer/fp8_quantization.py
+++ b/flashinfer/fp8_quantization.py
@@ -9,7 +9,6 @@
 from .jit import gen_jit_spec, sm100a_nvcc_flags
 from .utils import (
     device_support_pdl,
-    get_device_arch,
     register_custom_op,
     register_fake_op,
 )
@@ -161,11 +160,6 @@ def mxfp8_quantize(
             - Quantized tensor of shape [M, K] with dtype FLOAT8_E4M3
             - Scale factors tensor with shape determined by layout and sf_vec_size
     """
-    if get_device_arch() != "100a":
-        raise NotImplementedError(
-            f"Unsupported device architecture: {get_device_arch()}"
-        )
-
     sf_vec_size = 32
 
     assert input.shape[-1] % sf_vec_size == 0
diff --git a/flashinfer/utils.py b/flashinfer/utils.py
index 05cdbc5b8..faddc0e35 100644
--- a/flashinfer/utils.py
+++ b/flashinfer/utils.py
@@ -418,13 +418,6 @@ def version_at_least(version: str, base_version: str) -> bool:
     return pkg_version.parse(version) >= pkg_version.parse(base_version)
 
 
-@functools.cache
-def get_device_arch():
-    major, minor = torch.cuda.get_device_capability()
-    suffix = "a" if major >= 9 else ""
-    return f"{major * 10 + minor}{suffix}"
-
-
 def is_sm90a_supported(device: torch.device) -> bool:
     major, _ = get_compute_capability(device)
     return major == 9 and version_at_least(torch.version.cuda, "12.3")
diff --git a/tests/test_trtllm_cutlass_fused_moe.py b/tests/test_trtllm_cutlass_fused_moe.py
index 24b88fa87..d680e9eab 100644
--- a/tests/test_trtllm_cutlass_fused_moe.py
+++ b/tests/test_trtllm_cutlass_fused_moe.py
@@ -17,7 +17,6 @@
 import pytest
 import torch
 from torch.nn import functional as F
-from flashinfer.utils import get_device_arch
 
 import flashinfer.fused_moe as fused_moe
 from flashinfer import (
@@ -363,7 +362,8 @@ def test_moe_fp8(
 )
 @pytest.mark.parametrize("quantized_input", [False, True])
 @pytest.mark.skipif(
-    get_device_arch() != "100a", reason="NVFP4 is only supported on SM100a"
+    torch.cuda.get_device_capability()[0] != 10,
+    reason="NVFP4 is only supported on SM100",
 )
 def test_moe_nvfp4(
     batch_size,
@@ -939,7 +939,8 @@ def transform_dim(a: torch.Tensor, dim: int = -1) -> torch.Tensor:
 @pytest.mark.parametrize("top_k", TOP_K_VALUES)
 @pytest.mark.parametrize("intermediate_size", INTERMEDIATE_SIZES)
 @pytest.mark.skipif(
-    get_device_arch() != "100a", reason="FP8 block scaling is only supported on SM100a"
+    torch.cuda.get_device_capability()[0] != 10,
+    reason="FP8 block scaling is only supported on SM100",
 )
 def test_moe_fp8_block_scaling(
     batch_size, hidden_size, num_experts, top_k, intermediate_size
@@ -1092,7 +1093,8 @@ def dequant_mxfp4_batches(
     ("alpha", "beta", "limit"), [(None, None, None), (0.5, 0.0, 7.0), (1.702, 1.0, 7.0)]
 )
 @pytest.mark.skipif(
-    get_device_arch() != "100a", reason="MXFP8xMXFP4 is only supported on SM100a"
+    torch.cuda.get_device_capability()[0] != 10,
+    reason="MXFP8xMXFP4 is only supported on SM100",
 )
 def test_moe_mxfp8_mxfp4(
     batch_size,
@@ -1235,7 +1237,8 @@ def dequant_mxfp4_batches_host(
     ("alpha", "beta", "limit"), [(None, None, None), (0.5, 0.0, 7.0), (1.702, 1.0, 7.0)]
 )
 @pytest.mark.skipif(
-    get_device_arch() != "90a", reason="BF16xMXFP4 is only supported on SM90a"
+    torch.cuda.get_device_capability()[0] != 9,
+    reason="BF16xMXFP4 is only supported on SM90",
 )
 def test_moe_bf16_mxfp4(
     batch_size,

From 5643ed117802908607c33fca88f4ef0a666aaa8b Mon Sep 17 00:00:00 2001
From: Duncan Moss <djm.moss@gmail.com>
Date: Mon, 18 Aug 2025 10:04:38 -0700
Subject: [PATCH 11/12] make fused_moe jit arch dependent

---
 flashinfer/fused_moe/core.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/flashinfer/fused_moe/core.py b/flashinfer/fused_moe/core.py
index 76f7301cf..d248f932d 100644
--- a/flashinfer/fused_moe/core.py
+++ b/flashinfer/fused_moe/core.py
@@ -202,7 +202,8 @@ def gen_cutlass_fused_moe_module(
     Generate a JitSpec for the cutlass fused moe module.
     """
     output_dir = (
-        jit_env.FLASHINFER_CSRC_DIR / "nv_internal/tensorrt_llm/cutlass_instantiations/"
+        jit_env.FLASHINFER_CSRC_DIR
+        / f"nv_internal/tensorrt_llm/cutlass_instantiations/{device_arch}"
     )
 
     try:
@@ -218,7 +219,7 @@ def gen_cutlass_fused_moe_module(
         raise RuntimeError(f"Failed to generate Cutlass kernels: {e}") from e
 
     return gen_jit_spec(
-        "fused_moe",
+        f"fused_moe_{device_arch}",
         [
             jit_env.FLASHINFER_CSRC_DIR
             / "nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_tma_warp_specialized_input.cu",

From 6af38267b1b1a1aada43f53c5c37c79db26e732b Mon Sep 17 00:00:00 2001
From: Duncan Moss <djm.moss@gmail.com>
Date: Mon, 18 Aug 2025 13:38:04 -0700
Subject: [PATCH 12/12] remove un-needed debug

---
 .../fpA_intB_gemm/fpA_intB_gemm_template.h     | 18 +-----------------
 .../fpA_intB_gemm_template_sm90.h              |  5 -----
 .../launchers/fpA_intB_launcher_sm90.inl       |  2 --
 .../moe_gemm_tma_ws_mixed_input_launcher.inl   |  2 --
 ...gemm_template_dispatch_tma_ws_mixed_dtype.h |  3 ---
 5 files changed, 1 insertion(+), 29 deletions(-)

diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h
index 0fc953671..14ba601b3 100644
--- a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h
@@ -55,8 +55,6 @@ void generic_mixed_gemm_kernelLauncher(
     ScaleZeroType const* weight_zero_points, BiasType const* biases, float const alpha,
     OutputType* C, int m, int n, int k, int const group_size, tkc::CutlassGemmConfig gemm_config,
     char* workspace, size_t workspace_bytes, cudaStream_t stream, int* occupancy = nullptr) {
-  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
-
 #ifdef ENABLE_BF16
   static_assert(
 #ifdef ENABLE_FP8
@@ -231,7 +229,6 @@ void filter_and_run_mixed_gemm(ActivationType const* A, WeightType const* B,
                                int const group_size, tkc::CutlassGemmConfig gemm_config,
                                char* workspace, size_t workspace_bytes, cudaStream_t stream,
                                int* occupancy = nullptr) {
-  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
   if constexpr (Stages > 2 && arch::kMinComputeCapability < 80) {
     // Multistage only supported on Ampere
     std::string err_msg = "Cutlass fpA_intB gemm not supported for arch " +
@@ -269,7 +266,6 @@ void dispatch_gemm_config(ActivationType const* A, WeightType const* B,
                           float const alpha, OutputType* C, int m, int n, int k,
                           int const group_size, tkc::CutlassGemmConfig gemm_config, char* workspace,
                           size_t workspace_bytes, cudaStream_t stream, int* occupancy = nullptr) {
-  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
   switch (gemm_config.stages) {
     case 2:
       filter_and_run_mixed_gemm<ActivationType, WeightType, ScaleZeroType, BiasType, OutputType,
@@ -312,8 +308,6 @@ void dispatch_gemm_to_cutlass(ActivationType const* A, WeightType const* B,
                               int const group_size, char* workspace, size_t workspace_bytes,
                               tkc::CutlassGemmConfig gemm_config, cudaStream_t stream,
                               int* occupancy = nullptr) {
-  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
-
   // Don't instantiate configs that are not supported pre-hopper. Produce a sensible error instead.
   constexpr bool any_is_fp8 = is_fp8<ActivationType>() || is_fp8<WeightType>() ||
                               is_fp8<ScaleZeroType>() || is_fp8<BiasType>() || is_fp8<OutputType>();
@@ -394,7 +388,6 @@ template <typename ActivationType, typename WeightType, cutlass::WeightOnlyQuant
           typename ScaleZeroType, typename BiasType, typename OutputType>
 CutlassFpAIntBGemmRunner<ActivationType, WeightType, QuantOp, ScaleZeroType, BiasType,
                          OutputType>::CutlassFpAIntBGemmRunner() {
-  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
   int device{-1};
   tk::check_cuda_error(cudaGetDevice(&device));
   sm_ = tk::getSMVersion();
@@ -405,9 +398,7 @@ CutlassFpAIntBGemmRunner<ActivationType, WeightType, QuantOp, ScaleZeroType, Bia
 template <typename ActivationType, typename WeightType, cutlass::WeightOnlyQuantOp QuantOp,
           typename ScaleZeroType, typename BiasType, typename OutputType>
 CutlassFpAIntBGemmRunner<ActivationType, WeightType, QuantOp, ScaleZeroType, BiasType,
-                         OutputType>::~CutlassFpAIntBGemmRunner() {
-  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
-}
+                         OutputType>::~CutlassFpAIntBGemmRunner() {}
 
 template <typename ActivationType, typename WeightType, cutlass::WeightOnlyQuantOp QuantOp,
           typename ScaleZeroType, typename BiasType, typename OutputType>
@@ -423,7 +414,6 @@ void CutlassFpAIntBGemmRunner<
                                                tkc::CutlassGemmConfig gemm_config,
                                                char* workspace_ptr, const size_t workspace_bytes,
                                                cudaStream_t stream, int* occupancy) {
-  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
   if (sm_ >= 75 && sm_ < 80) {
     dispatch_gemm_to_cutlass<ActivationType, WeightType, ScaleZeroType, BiasType, OutputType,
                              cutlass::arch::Sm75, QuantOp, EpilogueTag>(
@@ -475,7 +465,6 @@ void CutlassFpAIntBGemmRunner<ActivationType, WeightType, QuantOp, ScaleZeroType
                                                 tkc::CutlassGemmConfig gemmConfig,
                                                 char* workspace_ptr, const size_t workspace_bytes,
                                                 cudaStream_t stream) {
-  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
   if constexpr ((QuantOp == cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS) ||
                 (QuantOp == cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY)) {
     dispatch_to_arch<tkc::EpilogueOpBias>(
@@ -498,7 +487,6 @@ void CutlassFpAIntBGemmRunner<ActivationType, WeightType, QuantOp, ScaleZeroType
                                                 tkc::CutlassGemmConfig gemmConfig,
                                                 char* workspace_ptr, const size_t workspace_bytes,
                                                 cudaStream_t stream) {
-  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
   gemm(A, B, weight_scales, weight_zero_points, biases, 1.f, C, m, n, k, group_size, gemmConfig,
        workspace_ptr, workspace_bytes, stream);
 }
@@ -512,8 +500,6 @@ void CutlassFpAIntBGemmRunner<ActivationType, WeightType, QuantOp, ScaleZeroType
                                                 tkc::CutlassGemmConfig gemmConfig,
                                                 char* workspace_ptr, const size_t workspace_bytes,
                                                 cudaStream_t stream) {
-  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
-
   if constexpr (QuantOp == cutlass::WeightOnlyQuantOp::PER_COLUMN_SCALE_ONLY) {
     dispatch_to_arch<tkc::EpilogueOpBias>((ActivationType const*)A, (WeightType const*)B,
                                           (ScaleZeroType const*)weight_scales, nullptr, nullptr,
@@ -533,7 +519,6 @@ void CutlassFpAIntBGemmRunner<ActivationType, WeightType, QuantOp, ScaleZeroType
                                                 int k, tkc::CutlassGemmConfig gemmConfig,
                                                 char* workspace_ptr, const size_t workspace_bytes,
                                                 cudaStream_t stream) {
-  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
   gemm(A, B, weight_scales, 1.f, C, m, n, k, gemmConfig, workspace_ptr, workspace_bytes, stream);
 }
 
@@ -558,7 +543,6 @@ template <typename ActivationType, typename WeightType, cutlass::WeightOnlyQuant
 size_t CutlassFpAIntBGemmRunner<ActivationType, WeightType, QuantOp, ScaleZeroType, BiasType,
                                 OutputType>::getWorkspaceSize(int const m, int const n,
                                                               int const k) {
-  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
   // For Hopper, we have to allocate large memory size in case for stream-K
   if (sm_ == 90) {
     // https://github.com/NVIDIA/cutlass/blob/19b4c5e065e7e5bbc8082dfc7dbd792bdac850fc/include/cutlass/gemm/kernel/tile_scheduler_params.h#L878-L892
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template_sm90.h b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template_sm90.h
index d3219b105..a81fffde9 100644
--- a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template_sm90.h
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template_sm90.h
@@ -43,7 +43,6 @@ void sm90_dispatch_epilogue_schedules(
     ScaleZeroType const* weight_zero_points, BiasType const* biases, float const alpha,
     OutputType* C, int m, int n, int k, int const group_size, tkc::CutlassGemmConfig gemm_config,
     char* workspace, size_t workspace_bytes, cudaStream_t stream, int* occupancy = nullptr) {
-  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
   switch (gemm_config.epilogue_schedule) {
     case tkc::EpilogueScheduleType::AUTO:
       using EpilogueScheduleType =
@@ -106,8 +105,6 @@ void sm90_dispatch_mainloop_schedules(
     ScaleZeroType const* weight_zero_points, BiasType const* biases, float const alpha,
     OutputType* C, int m, int n, int k, int const group_size, tkc::CutlassGemmConfig gemm_config,
     char* workspace, size_t workspace_bytes, cudaStream_t stream, int* occupancy = nullptr) {
-  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
-
   constexpr bool tile_shapes_supported = are_tile_shapes_supported<CTAShape, ClusterShape>();
 
   if constexpr (tile_shapes_supported) {
@@ -149,7 +146,6 @@ void sm90_dispatch_gemm_config(ActivationType const* A, WeightType const* B,
                                int const group_size, tkc::CutlassGemmConfig gemm_config,
                                char* workspace, size_t workspace_bytes, cudaStream_t stream,
                                int* occupancy = nullptr) {
-  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
   switch (gemm_config.cluster_shape) {
     case tkc::ClusterShape::ClusterShape_1x1x1:
       sm90_dispatch_mainloop_schedules<ActivationType, WeightType, ScaleZeroType, BiasType,
@@ -196,7 +192,6 @@ void sm90_dispatch_gemm_to_cutlass(ActivationType const* A, WeightType const* B,
                                    int const group_size, char* workspace, size_t workspace_bytes,
                                    tkc::CutlassGemmConfig gemm_config, cudaStream_t stream,
                                    int* occupancy = nullptr) {
-  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
   // Note that SIMT configs are omitted here since they are not supported for fpA_intB.
   // We also only instantiate configs here where threadblockShapeM == warpShapeM since those usually
   // perform the best for mixed type gemms.
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl
index cc91b4ba6..052f388b8 100644
--- a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl
@@ -60,8 +60,6 @@ void sm90_generic_mixed_gemm_kernelLauncher(
     ScaleZeroType const* weight_zero_points, BiasType const* biases, float const alpha,
     OutputType* C, int m, int n, int k, int const group_size, tkc::CutlassGemmConfig gemm_config,
     char* workspace, size_t workspace_bytes, cudaStream_t stream, int* occupancy) {
-  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
-
 #ifdef COMPILE_HOPPER_TMA_GEMMS
   using CutlassActivationType = typename TllmToCutlassTypeAdapter<ActivationType>::type;
 
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl
index b97e65a08..e28cb7b12 100644
--- a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl
@@ -66,8 +66,6 @@ template <typename T, typename WeightType, typename GemmOutputType, typename Epi
 void sm90_generic_mixed_moe_gemm_kernelLauncher(
     GroupedGemmInput<T, WeightType, GemmOutputType, GemmOutputType> inputs,
     TmaWarpSpecializedGroupedGemmInput hopper_inputs, int sm_count_, size_t* workspace_size) {
-  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
-
   /////////////////////////////////////////////////////////////////////////////////////////////////
   /// GEMM kernel configurations
   /////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws_mixed_dtype.h b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws_mixed_dtype.h
index 6b1702d58..2b13d5772 100644
--- a/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws_mixed_dtype.h
+++ b/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws_mixed_dtype.h
@@ -69,7 +69,6 @@ template <typename T, typename WeightType, typename GemmOutputType, typename Epi
 void sm90_dispatch_mainloop_schedules(
     GroupedGemmInput<T, WeightType, GemmOutputType, GemmOutputType> inputs,
     TmaWarpSpecializedGroupedGemmInput hopper_inputs, int sm_count_, size_t* workspace_size) {
-  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
 #ifdef COMPILE_HOPPER_TMA_GROUPED_GEMMS
   switch (inputs.gemm_config.mainloop_schedule) {
     case tkc::MainloopScheduleType::COOPERATIVE:
@@ -121,7 +120,6 @@ template <typename T, typename WeightType, typename GemmOutputType, typename Epi
 void sm90_dispatch_moe_mixed_dtype_gemm_config(
     GroupedGemmInput<T, WeightType, GemmOutputType, GemmOutputType> inputs,
     TmaWarpSpecializedGroupedGemmInput hopper_inputs, int sm_count_, size_t* workspace_size) {
-  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
   switch (inputs.gemm_config.cluster_shape) {
     case tkc::ClusterShape::ClusterShape_1x1x1:
       sm90_dispatch_mainloop_schedules<T, WeightType, GemmOutputType, EpilogueTag, CTAShape,
@@ -155,7 +153,6 @@ template <typename T, typename WeightType, typename GemmOutputType, typename Epi
 void sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass(
     GroupedGemmInput<T, WeightType, GemmOutputType, GemmOutputType> inputs,
     TmaWarpSpecializedGroupedGemmInput hopper_inputs, int sm_count_, size_t* workspace_size) {
-  TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
   // We also only instantiate configs here where threadblockShapeM == warpShapeM since those usually
   // perform the best for mixed type gemms.