diff --git a/csrc/fused_moe/cutlass_backend/cutlass_fused_moe_instantiation.cu b/csrc/fused_moe/cutlass_backend/cutlass_fused_moe_instantiation.cu index a81691cf9..f20729f16 100644 --- a/csrc/fused_moe/cutlass_backend/cutlass_fused_moe_instantiation.cu +++ b/csrc/fused_moe/cutlass_backend/cutlass_fused_moe_instantiation.cu @@ -45,11 +45,13 @@ template class CutlassMoeFCRunner<__nv_fp4_e2m1, __nv_fp4_e2m1, half>; template class CutlassMoeFCRunner<__nv_fp4_e2m1, __nv_fp4_e2m1, half, half>; template class CutlassMoeFCRunner<__nv_fp8_e4m3, __nv_fp4_e2m1, half>; template class CutlassMoeFCRunner<__nv_fp8_e4m3, __nv_fp4_e2m1, half, half>; +template class CutlassMoeFCRunner<half, __nv_fp4_e2m1>; #ifdef ENABLE_BF16 template class CutlassMoeFCRunner<__nv_fp4_e2m1, __nv_fp4_e2m1, __nv_bfloat16>; template class CutlassMoeFCRunner<__nv_fp4_e2m1, __nv_fp4_e2m1, __nv_bfloat16, __nv_bfloat16>; template class CutlassMoeFCRunner<__nv_fp8_e4m3, __nv_fp4_e2m1, __nv_bfloat16>; template class CutlassMoeFCRunner<__nv_fp8_e4m3, __nv_fp4_e2m1, __nv_bfloat16, __nv_bfloat16>; +template class CutlassMoeFCRunner<__nv_bfloat16, __nv_fp4_e2m1>; #endif #endif }; // namespace tensorrt_llm::kernels::cutlass_kernels diff --git a/csrc/fused_moe/cutlass_backend/cutlass_fused_moe_kernels.cuh b/csrc/fused_moe/cutlass_backend/cutlass_fused_moe_kernels.cuh index df1a0ea70..231063c05 100644 --- a/csrc/fused_moe/cutlass_backend/cutlass_fused_moe_kernels.cuh +++ b/csrc/fused_moe/cutlass_backend/cutlass_fused_moe_kernels.cuh @@ -284,7 +284,6 @@ void buildMinLatencyActiveExpertMaps( num_tokens, experts_per_token, start_expert, end_expert, num_experts_per_node, smart_routing, cluster_rank, cluster_size, num_experts_smem); } - template __global__ void fusedBuildExpertMapsSortFirstTokenKernel( int const* const token_selected_experts, int* const permuted_row_to_unpermuted_row, @@ -963,13 +962,13 @@ __device__ auto quantizePackedFPXValue( TmaWarpSpecializedGroupedGemmInput::ElementSF* act_sf_flat, 
TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType scaling_type) { constexpr bool is_fp8 = std::is_same_v; - static constexpr int NumThreadsPerSF = VecSize / CVT_FP4_ELTS_PER_THREAD; + static constexpr int NumThreadsPerSF = VecSize / CVT_ELTS_PER_THREAD; // Quantize the input to FP4 static_assert(std::is_same_v || std::is_same_v); - static_assert(ComputeElem::kElements == CVT_FP4_ELTS_PER_THREAD); + static_assert(ComputeElem::kElements == CVT_ELTS_PER_THREAD); PackedVec packed_vec{}; - for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) { + for (int i = 0; i < CVT_ELTS_PER_THREAD / 2; i++) { packed_vec.elts[i].x = static_cast(post_act_val[i * 2 + 0]); packed_vec.elts[i].y = static_cast(post_act_val[i * 2 + 1]); } @@ -980,10 +979,11 @@ __device__ auto quantizePackedFPXValue( // Use `token - num_tokens_before_expert` because we want this to be relative to the start of this // expert - auto sf_out = cvt_quant_to_fp4_get_sf_out_offset( - std::nullopt /* batchIdx */, token_id - num_tokens_before_expert, elem_idx, - std::nullopt /* numRows */, num_cols, act_sf_expert, FP4QuantizationSFLayout::SWIZZLED_128x4); + auto sf_out = + cvt_quant_get_sf_out_offset( + std::nullopt /* batchIdx */, token_id - num_tokens_before_expert, elem_idx, + std::nullopt /* numRows */, num_cols / VecSize, act_sf_expert, + QuantizationSFLayout::SWIZZLED_128x4); // Do the conversion and set the output and scaling factor auto func = [&]() { @@ -1020,18 +1020,18 @@ __device__ void writeSF(int64_t num_tokens_before_expert, int64_t expert_id, // Use `token - num_tokens_before_expert` because we want this to be relative to the start of this // expert - auto sf_out = cvt_quant_to_fp4_get_sf_out_offset( - std::nullopt /* batchIdx */, token_id - num_tokens_before_expert, elem_idx, - std::nullopt /* numRows */, num_cols, act_sf_expert, FP4QuantizationSFLayout::SWIZZLED_128x4); + auto sf_out = + cvt_quant_get_sf_out_offset( + std::nullopt /* batchIdx */, token_id - num_tokens_before_expert, elem_idx, 
+ std::nullopt /* numRows */, num_cols / VecSize, act_sf_expert, + QuantizationSFLayout::SWIZZLED_128x4); if (sf_out) { if (input_sf) { - auto const sf_in = - cvt_quant_to_fp4_get_sf_out_offset( - std::nullopt /* batchIdx */, source_token_id, elem_idx, std::nullopt /* numRows */, - num_cols, const_cast(input_sf), - FP4QuantizationSFLayout::SWIZZLED_128x4); + auto const sf_in = cvt_quant_get_sf_out_offset( + std::nullopt /* batchIdx */, source_token_id, elem_idx, std::nullopt /* numRows */, + num_cols / VecSize, const_cast(input_sf), + QuantizationSFLayout::SWIZZLED_128x4); *sf_out = *sf_in; } else { *sf_out = 0x00; @@ -1127,7 +1127,13 @@ __device__ void computeTmaWarpSpecializedInputStrides( if (layout_info.int4_groupwise_params.enabled) { layout_info.int4_groupwise_params.stride_s_a[out_idx] = cutlass::make_cute_packed_stride( TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::StrideSFA{}, - cute::make_shape(gemm_n, gemm_k / 128, 1)); + cute::make_shape( + gemm_n, + gemm_k / + (layout_info.int4_groupwise_params.use_wfp4a16 + ? TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::wfp4a16_group_size + : TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::int4_group_size), + 1)); } } @@ -1150,8 +1156,15 @@ __device__ void computeTmaWarpSpecializedInputPointers( safe_inc_ptr(output, num_tokens_before_expert * gemm_n); } if (layout_info.int4_groupwise_params.enabled) { - layout_info.int4_groupwise_params.ptr_s_a[out_idx] = - safe_inc_ptr(w4a8_weight_scale, expert * (gemm_n * gemm_k / 128)); + // The group size of wfp4a16 is multiplied by 2 because each scale uses 1 byte instead of 2 + // bytes + layout_info.int4_groupwise_params.ptr_s_a[out_idx] = safe_inc_ptr( + w4a8_weight_scale, + expert * + (gemm_n * gemm_k / + (layout_info.int4_groupwise_params.use_wfp4a16 + ? 
TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::wfp4a16_group_size * 2 + : TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::int4_group_size))); } } @@ -1453,7 +1466,7 @@ __global__ void expandInputRowsKernel( : TmaWarpSpecializedGroupedGemmInput::MXFPXBlockScaleVectorSize; constexpr int64_t ELEM_PER_THREAD = (is_nvfp4 || is_mxfp8) - ? CVT_FP4_ELTS_PER_THREAD + ? CVT_ELTS_PER_THREAD : (128 / sizeof_bits::value); // This should be VecSize * 4 elements @@ -1977,16 +1990,62 @@ void finalizeMoeRoutingKernelLauncher( // INSTANTIATE_FINALIZE_MOE_ROUTING(__nv_bfloat16, __nv_bfloat16, __nv_bfloat16); // #endif +// ============================== Activation Adaptors ================================= +template